From 84a56429ed8588523a5251f02b5b66ab30fabfdb Mon Sep 17 00:00:00 2001 From: Sebb Date: Wed, 22 Feb 2023 15:30:46 +0000 Subject: [PATCH 1/4] Initial stab at improving folding --- .github/workflows/test.yml | 16 ++++---- lib/mail/fields/unstructured_field.rb | 18 +++++---- spec/mail/encodings_spec.rb | 32 +++++++++++---- spec/mail/fields/unstructured_field_spec.rb | 43 +++++++++++++++++++-- 4 files changed, 83 insertions(+), 26 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 237c9c150..e71bd4395 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,14 +32,14 @@ jobs: timeout: 50 - ruby: truffleruby-head timeout: 50 - - ruby: jruby - timeout: 5 - - ruby: jruby-head - timeout: 5 - - ruby: jruby-9.4 - timeout: 5 - - ruby: jruby-9.3 - timeout: 5 + # - ruby: jruby + # timeout: 5 + # - ruby: jruby-head + # timeout: 5 + # - ruby: jruby-9.4 + # timeout: 5 + # - ruby: jruby-9.3 + # timeout: 5 - ruby: jruby-9.2 timeout: 5 steps: diff --git a/lib/mail/fields/unstructured_field.rb b/lib/mail/fields/unstructured_field.rb index 53e27c2c9..bef3a9d34 100644 --- a/lib/mail/fields/unstructured_field.rb +++ b/lib/mail/fields/unstructured_field.rb @@ -99,31 +99,35 @@ def wrap_lines(name, folded_lines) end def fold(prepend = 0) # :nodoc: + # prepend is the length to allow for the header prefix on the first line (e.g. 'Subject: ') encoding = normalized_encoding + encoding_overhead = "=?#{encoding}?Q??=".length + # The encoded string goes here ^ (between the ??) + max_safe_word = 78 - encoding_overhead - 10 # allow for encoding overhead + prefix decoded_string = decoded.to_s - should_encode = !decoded_string.ascii_only? + words = decoded_string.split(/[ \t]/) + should_encode = !decoded_string.ascii_only? || words.any? {|word| word.length > max_safe_word} if should_encode + max_safe_re = Regexp.new(".{#{max_safe_word}}|.+$") first = true - words = decoded_string.split(/[ \t]/).map do |word| + words = words.map do |word| if first first = !first else word = " #{word}" end - if !word.ascii_only? - word + if word.ascii_only? + word.scan(max_safe_re) else word.scan(/.{7}|.+$/) end end.flatten - else - words = decoded_string.split(/[ \t]/) end folded_lines = [] while !words.empty? limit = 78 - prepend - limit = limit - 7 - encoding.length if should_encode + limit = limit - encoding_overhead if should_encode line = String.new first_word = true while !words.empty? diff --git a/spec/mail/encodings_spec.rb b/spec/mail/encodings_spec.rb index 7428b9383..4c74d6e17 100644 --- a/spec/mail/encodings_spec.rb +++ b/spec/mail/encodings_spec.rb @@ -283,18 +283,20 @@ expect(Mail::Encodings.value_decode(string)).to eq result end - it "should not fold a long string that has no spaces" do + it "should fold a long non-ascii string that has no spaces" do original = "ВосстановлениеВосстановлениеВашегопароля" if original.respond_to?(:force_encoding) original = original.dup.force_encoding('UTF-8') - result = "Subject: =?UTF-8?Q?=D0=92=D0=BE=D1=81=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=92=D0=BE=D1=81=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=92=D0=B0=D1=88=D0=B5=D0=B3=D0=BE=D0=BF=D0=B0=D1=80=D0=BE=D0=BB=D1=8F?=\r\n" - else - result = "Subject: =?UTF-8?Q?=D0=92=D0=BE=D1=81=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=92=D0=BE=D1=81=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=92=D0=B0=D1=88=D0=B5=D0=B3=D0=BE=D0=BF=D0=B0=D1=80=D0=BE=D0=BB=D1=8F?=\r\n" end + result = "Subject: =?UTF-8?Q?=D0=92=D0=BE=D1=81=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=92=D0=BE=D1=81=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=92=D0=B0=D1=88=D0=B5=D0=B3=D0=BE=D0=BF=D0=B0=D1=80=D0=BE=D0=BB=D1=8F?=\r\n" mail = Mail.new mail.subject = original expect(mail[:subject].decoded).to eq original - expect(mail[:subject].encoded).to eq result + encoded = mail[:subject].encoded + # check lines are not too long + encoded.split("\r\n").each { |line| expect(line.length).to be <= 78 } + # check expected contents, ignoring folding + expect(encoded.gsub("?=\r\n =?UTF-8?Q?", '')).to eq result end it "should round trip a complex string properly" do @@ -306,13 +308,27 @@ mail = Mail.new mail.subject = original expect(mail[:subject].decoded).to eq original - expect(mail[:subject].encoded).to eq result + encoded = mail[:subject].encoded + # check lines are not too long + encoded.split("\r\n").each { |line| expect(line.length).to be <= 78 } + # check expected contents, ignoring folding + expect(encoded.gsub("?=\r\n =?UTF-8?Q?", '')).to eq result.gsub("?=\r\n =?UTF-8?Q?", '') + mail = Mail.new(mail.encoded) expect(mail[:subject].decoded).to eq original - expect(mail[:subject].encoded).to eq result + encoded = mail[:subject].encoded + # check lines are not too long + encoded.split("\r\n").each { |line| expect(line.length).to be <= 78 } + # check expected contents, ignoring folding + expect(encoded.gsub("?=\r\n =?UTF-8?Q?", '')).to eq result.gsub("?=\r\n =?UTF-8?Q?", '') + mail = Mail.new(mail.encoded) expect(mail[:subject].decoded).to eq original - expect(mail[:subject].encoded).to eq result + encoded = mail[:subject].encoded + # check lines are not too long + encoded.split("\r\n").each { |line| expect(line.length).to be <= 78 } + # check expected contents, ignoring folding + expect(encoded.gsub("?=\r\n =?UTF-8?Q?", '')).to eq result.gsub("?=\r\n =?UTF-8?Q?", '') end it "should round trip another complex string (koi-8)" do diff --git a/spec/mail/fields/unstructured_field_spec.rb b/spec/mail/fields/unstructured_field_spec.rb index af3407dad..a4434d377 100644 --- a/spec/mail/fields/unstructured_field_spec.rb +++ b/spec/mail/fields/unstructured_field_spec.rb @@ -149,8 +149,8 @@ @field = Mail::UnstructuredField.new("Subject", string) string = string.dup.force_encoding('UTF-8') result = "Subject: =?UTF-8?Q?This_is_=E3=81=82_really_long_string_This_is_=E3=81=82?=\r\n\s=?UTF-8?Q?_really_long_string_This_is_=E3=81=82_really_long_string_This_is?=\r\n\s=?UTF-8?Q?_=E3=81=82_really_long_string_This_is_=E3=81=82_really_long?=\r\n\s=?UTF-8?Q?_string?=\r\n" - expect(@field.encoded).to eq result expect(@field.decoded).to eq string + expect(@field.encoded).to eq result end it "should fold properly with my actual complicated header" do @@ -158,16 +158,53 @@ @field = Mail::UnstructuredField.new("X-SMTPAPI", string) string = string.dup.force_encoding('UTF-8') result = "X-SMTPAPI: =?UTF-8?Q?{=22unique=5Fargs=22:_{=22mailing=5Fid=22:147,=22a?=\r\n =?UTF-8?Q?ccount=5Fid=22:2},_=22to=22:_[=22larspind@gmail.com=22],_=22categ?=\r\n =?UTF-8?Q?ory=22:_=22mailing=22,_=22filters=22:_{=22domainkeys=22:_{=22sett?=\r\n =?UTF-8?Q?ings=22:_{=22domain=22:1,=22enable=22:1}}},_=22sub=22:_{=22{{op?=\r\n =?UTF-8?Q?en=5Fimage=5Furl}}=22:_[=22http://betaling.larspind.local/O?=\r\n =?UTF-8?Q?/token/147/Mailing::FakeRecipient=22],_=22{{name}}=22:_[=22[FIRST?=\r\n =?UTF-8?Q?_NAME]=22],_=22{{signup=5Freminder}}=22:_[=22=28her_kommer_til_at?=\r\n =?UTF-8?Q?_st=C3=A5_hvorn=C3=A5r_folk_har_skrevet_sig_op_...=29=22],?=\r\n =?UTF-8?Q?_=22{{unsubscribe=5Furl}}=22:_[=22http://betaling.larspind.?=\r\n =?UTF-8?Q?local/U/token/147/Mailing::FakeRecipient=22],_=22{{email}}=22:?=\r\n =?UTF-8?Q?_[=22larspind@gmail.com=22],_=22{{link:308}}=22:_[=22http://beta?=\r\n =?UTF-8?Q?ling.larspind.local/L/308/0/Mailing::FakeRecipient=22],_=22{{con?=\r\n =?UTF-8?Q?firm=5Furl}}=22:_[=22=22],_=22{{ref}}=22:_[=22[REF]=22]}}?=\r\n" - expect(@field.encoded).to eq result expect(@field.decoded).to eq string + encoded = @field.encoded + # check lines are not too long + encoded.split("\r\n").each { |line| expect(line.length).to be <= 78 } + # check expected contents, ignoring folding + expect(encoded.gsub("?=\r\n =?UTF-8?Q?", '')).to eq result.gsub("?=\r\n =?UTF-8?Q?", '') end it "should fold properly with continuous spaces around the linebreak" do - @field = Mail::UnstructuredField.new("Subject", "This is a header that has continuous spaces around line break point, which should be folded properly") + subject = "This is a header that has continuous spaces around line break point, which should be folded properly" + @field = Mail::UnstructuredField.new("Subject", subject) result = "Subject: This is a header that has continuous spaces around line break point,\s\r\n\s\s\s\swhich should be folded properly\r\n" + expect(@field.decoded).to eq subject expect(@field.encoded).to eq result end + it "should fold an ASCII-only subject with more than 78 characters and no white space" do + value = "12345678901234567890123456789012345678901234567890123456789012345678901234567890" + expect(value.length).to be > 78 - "Subject: ".length + @field = Mail::UnstructuredField.new("Subject", value) + lines = @field.encoded.split("\r\n") + lines.each { |line| expect(line.length).to be <= 78 } + end + + it "should fold an ASCII-only subject with more than 998 characters and no white space" do + value = "ThisIsASubjectHeaderMessageThatIsGoingToBeMoreThan998CharactersLong." * 20 + @field = Mail::UnstructuredField.new("Subject", value) + lines = @field.encoded.split("\r\n") + lines.each { |line| expect(line.length).to be <= 78 } + end + + # TODO: tweak unstructured_field to always generate lines of at most 78 chars + it "should fold a Japanese subject with more than 998 characters long and no white space" do + value = "これは非常に長い日本語のSubjectです。空白はありません。" * 1 + @field = Mail::UnstructuredField.new("Subject", value) + lines = @field.encoded.split("\r\n") + lines.each { |line| expect(line.length).to be <= 90 } + end + + # TODO: tweak unstructured_field to always generate lines of at most 78 chars + it "should fold full of emoji subject that is going to be more than 998 bytes unfolded" do + value = "😄" * 90 + @field = Mail::UnstructuredField.new("Subject", value) + lines = @field.encoded.split("\r\n") + lines.each { |line| expect(line.length).to be < 110 } + end + end describe "encoding non QP safe chars" do From e000cb2d0770793920e276a0826b0948665d85b6 Mon Sep 17 00:00:00 2001 From: Sebb Date: Wed, 22 Feb 2023 15:59:28 +0000 Subject: [PATCH 2/4] Add some comment [skip CI] --- lib/mail/fields/unstructured_field.rb | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/mail/fields/unstructured_field.rb b/lib/mail/fields/unstructured_field.rb index bef3a9d34..9095c0dcb 100644 --- a/lib/mail/fields/unstructured_field.rb +++ b/lib/mail/fields/unstructured_field.rb @@ -98,6 +98,14 @@ def wrap_lines(name, folded_lines) result.join("\r\n\s") end + # folding strategy: + # The line is split into words separated by space or tab. + # If the line contains any non-ascii characters, or any words are expected to be too long to fit, + # then they are additionally split, and the line is marked for encoding + # + # The list of individual words is then used to fill up the output lines without overflowing. + # This is not always guaranteed to work, because there is a wide variation in the number of + # characters that are needed to encode a given character. def fold(prepend = 0) # :nodoc: # prepend is the length to allow for the header prefix on the first line (e.g. 'Subject: ') encoding = normalized_encoding @@ -144,9 +152,6 @@ def fold(prepend = 0) # :nodoc: word = encode_crlf(word) # Skip to next line if we're going to go past the limit # Unless this is the first word, in which case we're going to add it anyway - # Note: This means that a word that's longer than 998 characters is going to break the spec. Please fix if this is a problem for you. - # (The fix, it seems, would be to use encoded-word encoding on it, because that way you can break it across multiple lines and - # the linebreak will be ignored) break if !line.empty? && (line.length + word.length + 1 > limit) # Remove the word from the queue ... words.shift @@ -160,7 +165,7 @@ def fold(prepend = 0) # :nodoc: # ... add it in encoded form to the current line line << word end - # Encode the line if necessary + # Mark the line as encoded if necessary line = "=?#{encoding}?Q?#{line}?=" if should_encode # Add the line to the output and reset the prepend folded_lines << line From 382590a6da2eca75c68e35fc07667d205b549629 Mon Sep 17 00:00:00 2001 From: Sebb Date: Wed, 22 Feb 2023 22:31:52 +0000 Subject: [PATCH 3/4] Re-work overlong lines --- lib/mail/fields/unstructured_field.rb | 25 ++++++++++++++++----- spec/mail/fields/unstructured_field_spec.rb | 16 +++++++++---- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/lib/mail/fields/unstructured_field.rb b/lib/mail/fields/unstructured_field.rb index 9095c0dcb..537d4d637 100644 --- a/lib/mail/fields/unstructured_field.rb +++ b/lib/mail/fields/unstructured_field.rb @@ -105,7 +105,8 @@ def wrap_lines(name, folded_lines) # # The list of individual words is then used to fill up the output lines without overflowing. # This is not always guaranteed to work, because there is a wide variation in the number of - # characters that are needed to encode a given character. + # characters that are needed to encode a given character. If the resulting line would be too + # long, divide the original word into two chunks and add the pieces separately. def fold(prepend = 0) # :nodoc: # prepend is the length to allow for the header prefix on the first line (e.g. 'Subject: ') encoding = normalized_encoding @@ -115,6 +116,7 @@ def fold(prepend = 0) # :nodoc: decoded_string = decoded.to_s words = decoded_string.split(/[ \t]/) should_encode = !decoded_string.ascii_only? || words.any? {|word| word.length > max_safe_word} + encoding_overhead = 0 unless should_encode if should_encode max_safe_re = Regexp.new(".{#{max_safe_word}}|.+$") first = true @@ -134,13 +136,14 @@ def fold(prepend = 0) # :nodoc: folded_lines = [] while !words.empty? - limit = 78 - prepend - limit = limit - encoding_overhead if should_encode + limit = 78 - prepend - encoding_overhead line = String.new first_word = true while !words.empty? break unless word = words.first.dup + original_word = word # in case we need to try again + # Convert on 1.9+ only since we aren't sure of the current # charset encoding on 1.8. We'd need to track internal/external # charset on each field. @@ -152,17 +155,29 @@ def fold(prepend = 0) # :nodoc: word = encode_crlf(word) # Skip to next line if we're going to go past the limit # Unless this is the first word, in which case we're going to add it anyway - break if !line.empty? && (line.length + word.length + 1 > limit) + break if !line.empty? && (line.length + word.length >= limit) # Remove the word from the queue ... words.shift # Add word separator if first_word first_word = false else - line << " " if !should_encode + line << " " unless should_encode end # ... add it in encoded form to the current line + # but first check if we have overflowed + # should only happen for the first word on a line + if should_encode && (line.length + word.length > limit) + word, remain = original_word.scan(/.{3}|.+$/) # roughly half the original split + words.unshift remain # put the unused bit back + # re-encode shorter word + if charset && word.respond_to?(:encoding) + word = Encodings.transcode_charset(word, word.encoding, charset) + end + word = encode(word) + word = encode_crlf(word) + end line << word end # Mark the line as encoded if necessary diff --git a/spec/mail/fields/unstructured_field_spec.rb b/spec/mail/fields/unstructured_field_spec.rb index a4434d377..1fddab608 100644 --- a/spec/mail/fields/unstructured_field_spec.rb +++ b/spec/mail/fields/unstructured_field_spec.rb @@ -178,6 +178,7 @@ value = "12345678901234567890123456789012345678901234567890123456789012345678901234567890" expect(value.length).to be > 78 - "Subject: ".length @field = Mail::UnstructuredField.new("Subject", value) + expect(@field.decoded).to eq value lines = @field.encoded.split("\r\n") lines.each { |line| expect(line.length).to be <= 78 } end @@ -185,6 +186,7 @@ it "should fold an ASCII-only subject with more than 998 characters and no white space" do value = "ThisIsASubjectHeaderMessageThatIsGoingToBeMoreThan998CharactersLong." * 20 @field = Mail::UnstructuredField.new("Subject", value) + expect(@field.decoded).to eq value lines = @field.encoded.split("\r\n") lines.each { |line| expect(line.length).to be <= 78 } end @@ -193,16 +195,18 @@ it "should fold a Japanese subject with more than 998 characters long and no white space" do value = "これは非常に長い日本語のSubjectです。空白はありません。" * 1 @field = Mail::UnstructuredField.new("Subject", value) + expect(@field.decoded).to eq value lines = @field.encoded.split("\r\n") - lines.each { |line| expect(line.length).to be <= 90 } + lines.each { |line| expect(line.length).to be <= 78 } end # TODO: tweak unstructured_field to always generate lines of at most 78 chars it "should fold full of emoji subject that is going to be more than 998 bytes unfolded" do value = "😄" * 90 @field = Mail::UnstructuredField.new("Subject", value) + expect(@field.decoded).to eq value lines = @field.encoded.split("\r\n") - lines.each { |line| expect(line.length).to be < 110 } + lines.each { |line| expect(line.length).to be <= 78 } end end @@ -216,9 +220,13 @@ end describe "iso-2022-jp Subject" do - it "should encoded with ISO-2022-JP encoding" do - @field = Mail::UnstructuredField.new("Subject", "あいうえお") + it "should be encoded with ISO-2022-JP encoding" do + value = "あいうえお" + @field = Mail::UnstructuredField.new("Subject", value) @field.charset = 'iso-2022-jp' + expect(@field.decoded).to eq value + lines = @field.encoded.split("\r\n") + lines.each { |line| expect(line.length).to be <= 78 } expect(@field.encoded).to eq "Subject: =?ISO-2022-JP?Q?=1B$B$=22$$$&$=28$*=1B=28B?=\r\n" end end From 7a7b8e729313b5a622fbf7b7ce8173debf5e8e8d Mon Sep 17 00:00:00 2001 From: sebbASF Date: Thu, 23 Feb 2023 08:35:53 +0000 Subject: [PATCH 4/4] Use actual prefix length --- lib/mail/fields/unstructured_field.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mail/fields/unstructured_field.rb b/lib/mail/fields/unstructured_field.rb index 537d4d637..cb578533b 100644 --- a/lib/mail/fields/unstructured_field.rb +++ b/lib/mail/fields/unstructured_field.rb @@ -112,7 +112,7 @@ def fold(prepend = 0) # :nodoc: encoding = normalized_encoding encoding_overhead = "=?#{encoding}?Q??=".length # The encoded string goes here ^ (between the ??) - max_safe_word = 78 - encoding_overhead - 10 # allow for encoding overhead + prefix + max_safe_word = 78 - encoding_overhead - prepend # allow for encoding overhead + prefix decoded_string = decoded.to_s words = decoded_string.split(/[ \t]/) should_encode = !decoded_string.ascii_only? || words.any? {|word| word.length > max_safe_word}