[Bps-public-commit] email-address-list branch, master, updated. 4341cba0fe0fb1207f914bf83f26e9bec1142bb4

Ruslan Zakirov ruz at bestpractical.com
Fri Nov 16 15:20:12 EST 2012


The branch, master has been updated
       via  4341cba0fe0fb1207f914bf83f26e9bec1142bb4 (commit)
       via  2763bb517f1671c72e8f6a8daa5752c617055514 (commit)
       via  2a6c01ebd65be19e679eee9254cd184bef14d0ec (commit)
      from  9bf352154206c4272a357cf241de42b1761cbf09 (commit)

Summary of changes:
 README                           | 119 ++++++++++------
 lib/Email/Address/List.pm        | 286 +++++++++++++++++++++++++++------------
 t/basics.t                       |   3 +-
 t/data/RFC5233.single.obs.json   |  44 ++++++
 t/data/RFC5233.single.obs.txt    |   5 +-
 t/data/RFC5233.single.valid.json |  49 ++++++-
 t/data/RFC5233.single.valid.txt  |  12 +-
 t/generate.pl                    |   9 +-
 t/invalid.t                      |  28 ++++
 t/random.combinations.t          |  60 ++++++++
 t/single.suit.t                  |  28 ++++
 11 files changed, 500 insertions(+), 143 deletions(-)
 create mode 100644 t/invalid.t
 create mode 100644 t/random.combinations.t
 create mode 100644 t/single.suit.t

- Log -----------------------------------------------------------------
commit 2a6c01ebd65be19e679eee9254cd184bef14d0ec
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Wed Nov 14 19:31:11 2012 +0400

    mass update
    
    * simplify post processing
    * full support of obsolete syntax
    * dequote phrase and local-part
    * normalize phrase and local-part quoting

diff --git a/lib/Email/Address/List.pm b/lib/Email/Address/List.pm
index 6d2078f..7977ce7 100644
--- a/lib/Email/Address/List.pm
+++ b/lib/Email/Address/List.pm
@@ -71,6 +71,7 @@ $RE{'quoted_pair'}    = qr/\\$RE{'text'}/;
 $RE{'atext'}          = qr/[^$RE{'CTL'}$RE{'special'}\s]/;
 $RE{'ctext'}          = qr/(?>[^()\\]+)/;
 $RE{'qtext'}          = qr/[^\\"]/;
+$RE{'dtext'}          = qr/[^\[\]\\]/;
 
 ($RE{'ccontent'}, $RE{'comment'}) = (q{})x2;
 for (1 .. $COMMENT_NEST_LEVEL) {
@@ -82,62 +83,53 @@ $RE{'cfws'}           = qr/$RE{'comment'}|\s+/;
 $RE{'qcontent'}       = qr/$RE{'qtext'}|$RE{'quoted_pair'}/;
 $RE{'quoted_string'}  = qr/$RE{'cfws'}*"$RE{'qcontent'}+"$RE{'cfws'}*/;
 
-# by the spec:
-# word            =   atom / quoted-string   = qr/$RE{'atom'}|$RE{'quoted_string'}/;
-# atom            =   [CFWS] 1*atext [CFWS]  = qr/$RE{'cfws'}*$RE{'atext'}+$RE{'cfws'}*/;
-# with some inlining:
-# word            = qr/$RE{'cfws'}* (?:$RE{'atom'} | "$RE{'qcontent'}+") $RE{'cfws'}*/x;
-# however:
-# phrase          =   1*word / obs-phrase
-# obs-phrase      =   word *(word / "." / CFWS)
-# after combining:
-# phrase          =   word *(word / "." / CFWS)
-
 $RE{'atom'}           = qr/$RE{'cfws'}*$RE{'atext'}+$RE{'cfws'}*/;
-$CRE{'atom'}           = qr/($RE{'cfws'}*)($RE{'atext'}+)($RE{'cfws'}*)/;
 
-$RE{'word'}           = qr/$RE{'cfws'}* (?: $RE{'atom'} |      "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
-$RE{'dword'}          = qr/$RE{'cfws'}* (?: $RE{'atom'} | \. | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
-$CRE{'dword'}         = qr/($RE{'cfws'}*) (?: ($RE{'atom'} | \.) | "($RE{'qcontent'}+)" ) ($RE{'cfws'}*)/x;
-$RE{'phrase'}         = qr/$RE{'word'} $RE{'dword'}*/x;
-$RE{'display_name'}   = $RE{'phrase'};
+$RE{'word'}           = qr/$RE{'cfws'}* (?: $RE{'atom'} | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
+$RE{'phrase'}         = qr/$RE{'word'}+/x;
+$RE{'display-name'}   = $RE{'phrase'};
 
 $RE{'dot_atom_text'}  = qr/$RE{'atext'}+(?:\.$RE{'atext'}+)*/;
 $RE{'dot_atom'}       = qr/$RE{'cfws'}*$RE{'dot_atom_text'}$RE{'cfws'}*/;
-$RE{'local_part'}     = qr/$RE{'dot_atom'}|$RE{'quoted_string'}/;
+$RE{'local-part'}     = qr/$RE{'dot_atom'}|$RE{'quoted_string'}/;
 
-$RE{'dtext'}          = qr/[^\[\]\\]/;
 $RE{'dcontent'}       = qr/$RE{'dtext'}|$RE{'quoted_pair'}/;
 $RE{'domain_literal'} = qr/$RE{'cfws'}*\[(?:\s*$RE{'dcontent'})*\s*\]$RE{'cfws'}*/;
 $RE{'domain'}         = qr/$RE{'dot_atom'}|$RE{'domain_literal'}/;
-$CRE{'domain'}        = qr/
-    ($RE{'cfws'}*)
-    ($RE{'dot_atom_text'}|\[(?:\s*$RE{'dcontent'})*\s*\])
-    ($RE{'cfws'}*)
-/x;
+$RE{'obs-domain'}     = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
 
-$RE{'addr_spec'}      = qr/$RE{'local_part'}\@$RE{'domain'}/;
-$CRE{'addr_spec'}     = qr/
-    ($RE{'cfws'}*)
-    ($RE{'dot_atom_text'}|"$RE{'qcontent'}+")
-    ($RE{'cfws'}*)
-    \@$CRE{'domain'}
+$RE{'addr-spec'}      = qr/$RE{'local-part'}\@$RE{'domain'}/;
+$RE{'angle-addr'}     = qr/$RE{'cfws'}* < $RE{'addr-spec'} > $RE{'cfws'}*/x;
+
+$RE{'name-addr'}      = qr/$RE{'display-name'}?$RE{'angle-addr'}/;
+$RE{'mailbox'}        = qr/(?:$RE{'name-addr'}|$RE{'addr-spec'})$RE{'comment'}*/;
+
+$CRE{'addr-spec'}      = qr/($RE{'local-part'})\@($RE{'domain'})/;
+$CRE{'mailbox'} = qr/
+    (?:
+        ($RE{'display-name'})?($RE{'cfws'}*)<$CRE{'addr-spec'}>($RE{'cfws'}*)
+        |$CRE{'addr-spec'}
+    )($RE{'comment'}*)
 /x;
-$RE{'obs-route'}      = qr/
+
+$RE{'dword'}            = qr/$RE{'cfws'}* (?: $RE{'atom'} | \. | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
+$RE{'obs-phrase'}       = qr/$RE{'word'} $RE{'dword'}*/x;
+$RE{'obs-display-name'} = $RE{'obs-phrase'};
+$RE{'obs-route'}        = qr/
     (?:$RE{'cfws'}|,)*
     \@$RE{'domain'}
     (?:,$RE{'cfws'}?(?:\@$RE{'domain'})?)*
     :
 /x;
-$RE{'angle_addr'}     = qr/$RE{'cfws'}* < $RE{'obs-route'}? $RE{'addr_spec'} > $RE{'cfws'}*/x;
-
-$RE{'name_addr'}      = qr/$RE{'display_name'}?$RE{'angle_addr'}/;
-$RE{'mailbox'}        = qr/(?:$RE{'name_addr'}|$RE{'addr_spec'})$RE{'comment'}*/;
-
-$CRE{'mailbox'} = qr/
+$RE{'obs-domain'}       = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
+$RE{'obs-local-part'}   = qr/$RE{'word'}(?:\.$RE{'word'})*/;
+$RE{'obs-addr-spec'}    = qr/$RE{'obs-local-part'}\@$RE{'obs-domain'}/;
+$CRE{'obs-addr-spec'}   = qr/($RE{'obs-local-part'})\@($RE{'obs-domain'})/;
+$CRE{'obs-mailbox'} = qr/
     (?:
-        ($RE{'display_name'})?($RE{'cfws'}*)< $RE{'obs-route'}? ($RE{'addr_spec'})>($RE{'cfws'}*)
-        |($RE{'addr_spec'})
+        ($RE{'obs-display-name'})?
+        ($RE{'cfws'}*)< $RE{'obs-route'}? $CRE{'obs-addr-spec'} >($RE{'cfws'}*)
+        |$CRE{'obs-addr-spec'}
     )($RE{'comment'}*)
 /x;
 
@@ -163,7 +155,7 @@ sub parse {
         # group           =   display-name ":" [group-list] ";" [CFWS]
         # group-list      =   mailbox-list / CFWS / obs-group-list
         # obs-group-list  =   1*([CFWS] ",") [CFWS])
-        if ( !$in_group && $line =~ s/^$RE{'display_name'}://o ) {
+        if ( !$in_group && $line =~ s/^$RE{'display-name'}://o ) {
             $in_group = 1; next;
         }
         if ( $in_group && $line =~ s/^;// ) {
@@ -186,6 +178,16 @@ sub parse {
             );
             next;
         }
+        elsif ( $line =~ s/^($CRE{'obs-mailbox'})//o ) {
+            my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
+                $1,$2,$3,$4,$5,$6,$7
+            );
+            push @res, Email::Address->new(
+                $phrase, "$user\@$host", join(' ', grep defined, @comments),
+                $original,
+            );
+            next;
+        }
 
         # if we got here then something unknown on our way
         # try to recorver
@@ -197,50 +199,46 @@ sub parse {
     return @res;
 }
 
+my $dequote = sub {
+    local $_ = shift;
+    s/^"//; s/"$//; s/\\(.)/$1/g;
+    return "$_";
+};
+my $quote = sub {
+    local $_ = shift;
+    s/([\\"])/\\$1/g;
+    return qq{"$_"};
+};
+
 sub _process_mailbox {
-    my $self = $_[0];
-    my ($original, $phrase) = ($_[1],$_[2]);
-    my $address = $_[4] || $_[6];
-    my @rest = ($_[3],$_[5],$_[7]);
+    my $self = shift;
+    my $original = shift;
+    my @rest = (@_);
 
     my @comments;
+    foreach ( grep defined, splice @rest ) {
+        s{ ($RE{'quoted_string'}) | ($RE{comment}) }
+         { $1? $1 : do { push @comments, $2; $comments[-1]=~/^\s|\s$/? ' ' : '' } }xgoe;
+        s/^\s+//; s/\s+$//;
+        next unless length;
 
-    if ( $phrase ) {
-        my @tmp = $phrase =~ /$CRE{'dword'}/go; # must match everything
-        $phrase = '';
-        while ( my ($lcomment, $text, $quoted, $rcomment) = splice @tmp, 0, 4 ) {
-            $phrase .= ' ' if $lcomment =~ /^\s|\s$/ && $phrase !~ /\s$/;
-            push @comments, $lcomment;
-            if (defined $text) {
-                $text =~ s{($RE{'comment'})}{
-                    push @comments, $1; $comments[-1]=~ /^\s|\s$/? ' ':''
-                }xgeo;
-                $text =~ s/\s+/ /g;
-                $phrase .= $text;
-            } else {
-                $quoted =~ s/\\(.)/$1/g;
-                $phrase .= $quoted;
-            }
-            $phrase .= ' ' if $rcomment =~ /^\s|\s$/ && $phrase !~ /\s$/;
-            push @comments, $rcomment;
-        }
+        push @rest, $_;
     }
-    push @comments, shift @rest;
+    my ($host, $user, $phrase) = reverse @rest;
 
-    my ($user, $host);
-    {
-        $address =~ /^$CRE{'addr_spec'}$/;
-        ($user, $host) = ($2, $5);
-        push @comments, $1, $3, $4, $6;
-    }
-    push @comments, splice @rest;
+    # deal with spaces out of quoted strings
+    s{ ($RE{'quoted_string'}) | \s+ }{ $1? $1 : ' ' }xgoe
+        foreach grep defined, $phrase;
+    s{ ($RE{'quoted_string'}) | \s+ }{ $1? $1 : '' }xgoe
+        foreach $user, $host;
 
-    for ( $phrase, $user, $host, @comments ) {
-        next unless defined $_;
-        s/^\s+//;
-        s/\s+$//;
-        $_ = undef unless length $_;
-    }
-    return $original, $phrase, $user, $host, grep defined, @comments;
+    # dequote
+    s{ ($RE{'quoted_string'}) }{ $dequote->($1) }xgoe
+        foreach grep defined, $phrase, $user;
+    $user = $quote->($user) unless $user =~ /^$RE{'dot_atom'}$/;
+
+    @comments = grep length, map { s/^\s+//; s/\s+$//; $_ } grep defined, @comments;
+    return $original, $phrase, $user, $host, @comments;
 }
 
+
diff --git a/t/data/RFC5233.single.obs.json b/t/data/RFC5233.single.obs.json
index 1d6f980..7810c3c 100644
--- a/t/data/RFC5233.single.obs.json
+++ b/t/data/RFC5233.single.obs.json
@@ -16,11 +16,49 @@
       "display-name" : "Mary Smith"
    },
    {
+      "domain" : "test.example",
+      "address" : "jdoe at test.example",
+      "mailbox" : "jdoe at test  . example",
+      "comments" : [],
+      "description" : "Appendix A.6.1.  Obsolete Addressing",
+      "display-name" : null
+   },
+   {
+      "domain" : "machine.example",
+      "address" : "jdoe at machine.example",
+      "mailbox" : "John Doe <jdoe at machine(comment).  example>",
+      "comments" : [
+         "(comment)"
+      ],
+      "description" : "Appendix A.6.3.  Obsolete White Space and Comments",
+      "display-name" : "John Doe"
+   },
+   {
       "domain" : "example.net",
       "address" : "mary at example.net",
       "mailbox" : "Mary Smith\n  \n             <mary at example.net>",
       "comments" : [],
       "description" : "Appendix A.6.3.  Obsolete White Space and Comments",
       "display-name" : "Mary Smith"
+   },
+   {
+      "domain" : "domain.com",
+      "address" : "baz.zoo at domain.com",
+      "mailbox" : "(c1) foo (c2) bar (c3) < (c4) baz (c5) . (c6) zoo (c7) @ (c8) domain (c9) . (c10) com (c11) >",
+      "comments" : [
+         "(c1)",
+         "(c2)",
+         "(c3)",
+         "(c4)",
+         "(c5)",
+         "(c6)",
+         "(c7)",
+         "(c8)",
+         "(c9)",
+         "(c10)",
+         "(c11)"
+      ],
+      "description" : "comments all over the place",
+      "display-name" : "foo bar"
    }
 ]
diff --git a/t/data/RFC5233.single.obs.txt b/t/data/RFC5233.single.obs.txt
index 593621c..e13b3ab 100644
--- a/t/data/RFC5233.single.obs.txt
+++ b/t/data/RFC5233.single.obs.txt
@@ -13,4 +13,7 @@ John Doe <jdoe at machine(comment).  example>
 # Appendix A.6.3.  Obsolete White Space and Comments
 Mary Smith
   
-             <mary at example.net>
\ No newline at end of file
+             <mary at example.net>
+
+# comments all over the place
+(c1) foo (c2) bar (c3) < (c4) baz (c5) . (c6) zoo (c7) @ (c8) domain (c9) . (c10) com (c11) >
\ No newline at end of file
diff --git a/t/data/RFC5233.single.valid.json b/t/data/RFC5233.single.valid.json
index 219ceeb..6c7e905 100644
--- a/t/data/RFC5233.single.valid.json
+++ b/t/data/RFC5233.single.valid.json
@@ -104,11 +104,19 @@
       "display-name" : "()<>[]:;@\\,.\""
    },
    {
-      "domain" : "x.example.com",
-      "address" : "fbaz at x.example.com",
-      "mailbox" : "Foo B. Baz <fbaz at x.example.com>",
-      "comments" : [],
-      "description" : "display-name with obs-phrase (with dots)",
-      "display-name" : "Foo B. Baz"
+      "domain" : "domain.com",
+      "address" : "baz at domain.com",
+      "mailbox" : "(c1)foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >\n",
+      "comments" : [
+         "(c1)",
+         "(c2)",
+         "(c3)",
+         "(c4)",
+         "(c5)",
+         "(c6)",
+         "(c7)"
+      ],
+      "description" : "comments with spaces in every possible possition",
+      "display-name" : "foo bar"
    }
 ]
diff --git a/t/data/RFC5233.single.valid.txt b/t/data/RFC5233.single.valid.txt
index bb7a8fa..9f2310d 100644
--- a/t/data/RFC5233.single.valid.txt
+++ b/t/data/RFC5233.single.valid.txt
@@ -31,10 +31,8 @@ Chris Jones <c@(Chris's host.)public.example>
 # Appendix A.1.2.  Different Types of Mailboxes
 John <jdoe at one.test> (my dear friend)
 
-
 # special in display-name with quoting
 "()<>[]:;@\\,.\"" <foo at bar.biz>
 
-# display-name with obs-phrase (with dots)
-Foo B. Baz <fbaz at x.example.com>
-
+# comments with spaces in every possible possition
+(c1)foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >
diff --git a/t/generate.pl b/t/generate.pl
index b986a19..eb53fde 100644
--- a/t/generate.pl
+++ b/t/generate.pl
@@ -25,8 +25,10 @@ sub process_file {
 
         my @parse;
         unless ( @parse = ($mailbox =~ /^($CRE{'mailbox'})$/) ) {
-            warn "Failed to parse $mailbox";
-            next;
+            unless ( @parse = ($mailbox =~ /^($CRE{'obs-mailbox'})$/) ) {
+                warn "Failed to parse $mailbox";
+                next;
+            }
         }
 
         my (undef, $display_name, $local_part, $domain, @comments)

commit 2763bb517f1671c72e8f6a8daa5752c617055514
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Wed Nov 14 20:06:47 2012 +0400

    more tests in data in suite

diff --git a/t/data/RFC5233.single.obs.json b/t/data/RFC5233.single.obs.json
index 7810c3c..15f3b04 100644
--- a/t/data/RFC5233.single.obs.json
+++ b/t/data/RFC5233.single.obs.json
@@ -1,6 +1,7 @@
 [
    {
       "domain" : "example.com",
+      "local-part" : "john.q.public",
       "address" : "john.q.public at example.com",
       "mailbox" : "Joe Q. Public <john.q.public at example.com>",
       "comments" : [],
@@ -9,6 +10,7 @@
    },
    {
       "domain" : "example.net",
+      "local-part" : "mary",
       "address" : "mary at example.net",
       "mailbox" : "Mary Smith <@node.test:mary at example.net>",
       "comments" : [],
@@ -17,6 +19,7 @@
    },
    {
       "domain" : "test.example",
+      "local-part" : "jdoe",
       "address" : "jdoe at test.example",
       "mailbox" : "jdoe at test  . example",
       "comments" : [],
@@ -25,6 +28,7 @@
    },
    {
       "domain" : "machine.example",
+      "local-part" : "jdoe",
       "address" : "jdoe at machine.example",
       "mailbox" : "John Doe <jdoe at machine(comment).  example>",
       "comments" : [
@@ -35,6 +39,7 @@
    },
    {
       "domain" : "example.net",
+      "local-part" : "mary",
       "address" : "mary at example.net",
       "mailbox" : "Mary Smith\n  \n             <mary at example.net>",
       "comments" : [],
@@ -43,6 +48,7 @@
    },
    {
       "domain" : "domain.com",
+      "local-part" : "baz.zoo",
       "address" : "baz.zoo at domain.com",
       "mailbox" : "(c1) foo (c2) bar (c3) < (c4) baz (c5) . (c6) zoo (c7) @ (c8) domain (c9) . (c10) com (c11) >",
       "comments" : [
diff --git a/t/data/RFC5233.single.valid.json b/t/data/RFC5233.single.valid.json
index 6c7e905..7b2efb1 100644
--- a/t/data/RFC5233.single.valid.json
+++ b/t/data/RFC5233.single.valid.json
@@ -1,6 +1,7 @@
 [
    {
       "domain" : "machine.example",
+      "local-part" : "jdoe",
       "address" : "jdoe at machine.example",
       "mailbox" : "John Doe <jdoe at machine.example>",
       "comments" : [],
@@ -9,6 +10,7 @@
    },
    {
       "domain" : "example.net",
+      "local-part" : "mary",
       "address" : "mary at example.net",
       "mailbox" : "Mary Smith <mary at example.net>",
       "comments" : [],
@@ -17,6 +19,7 @@
    },
    {
       "domain" : "example.com",
+      "local-part" : "john.q.public",
       "address" : "john.q.public at example.com",
       "mailbox" : "\"Joe Q. Public\" <john.q.public at example.com>",
       "comments" : [],
@@ -25,6 +28,7 @@
    },
    {
       "domain" : "x.test",
+      "local-part" : "mary",
       "address" : "mary at x.test",
       "mailbox" : "Mary Smith <mary at x.test>",
       "comments" : [],
@@ -33,6 +37,7 @@
    },
    {
       "domain" : "example.org",
+      "local-part" : "jdoe",
       "address" : "jdoe at example.org",
       "mailbox" : "jdoe at example.org",
       "comments" : [],
@@ -41,6 +46,7 @@
    },
    {
       "domain" : "y.test",
+      "local-part" : "one",
       "address" : "one at y.test",
       "mailbox" : "Who? <one at y.test>",
       "comments" : [],
@@ -49,6 +55,7 @@
    },
    {
       "domain" : "nil.test",
+      "local-part" : "boss",
       "address" : "boss at nil.test",
       "mailbox" : "<boss at nil.test>",
       "comments" : [],
@@ -57,6 +64,7 @@
    },
    {
       "domain" : "example.net",
+      "local-part" : "sysservices",
       "address" : "sysservices at example.net",
       "mailbox" : "\"Giant; \\\"Big\\\" Box\" <sysservices at example.net>",
       "comments" : [],
@@ -65,6 +73,7 @@
    },
    {
       "domain" : "silly.test",
+      "local-part" : "pete",
       "address" : "pete at silly.test",
       "mailbox" : "Pete(A nice \\) chap) <pete(his account)@silly.test(his host)>",
       "comments" : [
@@ -77,6 +86,7 @@
    },
    {
       "domain" : "public.example",
+      "local-part" : "c",
       "address" : "c at public.example",
       "mailbox" : "Chris Jones <c@(Chris's host.)public.example>",
       "comments" : [
@@ -87,6 +97,7 @@
    },
    {
       "domain" : "one.test",
+      "local-part" : "jdoe",
       "address" : "jdoe at one.test",
       "mailbox" : "John <jdoe at one.test> (my dear friend)",
       "comments" : [
@@ -97,6 +108,7 @@
    },
    {
       "domain" : "bar.biz",
+      "local-part" : "foo",
       "address" : "foo at bar.biz",
       "mailbox" : "\"()<>[]:;@\\\\,.\\\"\" <foo at bar.biz>",
       "comments" : [],
@@ -105,8 +117,9 @@
    },
    {
       "domain" : "domain.com",
+      "local-part" : "baz",
       "address" : "baz at domain.com",
-      "mailbox" : "(c1)foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >\n",
+      "mailbox" : " (c1) foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >",
       "comments" : [
          "(c1)",
          "(c2)",
@@ -118,5 +131,23 @@
       ],
       "description" : "comments with spaces in every possible possition",
       "display-name" : "foo bar"
+   },
+   {
+      "domain" : "bar.info",
+      "local-part" : "foo",
+      "address" : "foo at bar.info",
+      "mailbox" : "foo \"\\\\\" bar <foo at bar.info>",
+      "comments" : [],
+      "description" : "phrase with strange quoting",
+      "display-name" : "foo \"\\\" bar"
+   },
+   {
+      "domain" : "zoo.info",
+      "local-part" : "foo.bar",
+      "address" : "foo.bar at zoo.info",
+      "mailbox" : "\"foo.bar\"@zoo.info",
+      "comments" : [],
+      "description" : "local part with redundant quoting",
+      "display-name" : null
    }
 ]
diff --git a/t/data/RFC5233.single.valid.txt b/t/data/RFC5233.single.valid.txt
index 9f2310d..3009a0d 100644
--- a/t/data/RFC5233.single.valid.txt
+++ b/t/data/RFC5233.single.valid.txt
@@ -35,4 +35,12 @@ John <jdoe at one.test> (my dear friend)
 "()<>[]:;@\\,.\"" <foo at bar.biz>
 
 # comments with spaces in every possible possition
-(c1)foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >
+ (c1) foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >
+
+# phrase with strange quoting
+foo "\\" bar <foo at bar.info>
+
+# local part with redundant quoting
+"foo.bar"@zoo.info
+
+
diff --git a/t/generate.pl b/t/generate.pl
index eb53fde..5ce4fa4 100644
--- a/t/generate.pl
+++ b/t/generate.pl
@@ -37,6 +37,8 @@ sub process_file {
         $res{'display-name'} = $display_name;
         $res{'address'} = "$local_part\@$domain";
         $res{'domain'} = $domain;
+        $res{'local-part'} = $local_part;
+        $res{'local-part'} =~ s/\\(.)/$1/g if $res{'local-part'} =~ s/^"(.*)"$/$1/;
         $res{'comments'} = \@comments;
         push @list, \%res;
     }

commit 4341cba0fe0fb1207f914bf83f26e9bec1142bb4
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Sat Nov 17 00:16:52 2012 +0400

    another mass update
    
    * tests
    * anchor mailbox at the end with ;|,|$ otherwise we could match less
    * jump over comments and quoted strings in recover from error code
    * documentation

diff --git a/README b/README
index b51aaa0..a2742ce 100644
--- a/README
+++ b/README
@@ -2,46 +2,85 @@ NAME
     Email::Address::List - RFC close address list parsing
 
 DESCRIPTION
-    Parser for From, To, Cc, Bcc, Reply-To, Sender and previouse prefixed
-    with Resent- prefix (eg Resent-From).
+    Parser for From, To, Cc, Bcc, Reply-To, Sender and previous prefixed
+    with Resent- (eg Resent-From) headers.
 
 REASONING
-    Email::Address is good at parsing addresses out of text and this module
-    uses regexpes from the module to parse addresses.
-
-       mailbox         =   name-addr / addr-spec
-       name-addr       =   [display-name] angle-addr
-       angle-addr      =   [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
-       display-name    =   phrase
-
-       from            =   "From:" mailbox-list CRLF
-       sender          =   "Sender:" mailbox CRLF
-       reply-to        =   "Reply-To:" address-list CRLF
-
-       to              =   "To:" address-list CRLF
-       cc              =   "Cc:" address-list CRLF
-       bcc             =   "Bcc:" [address-list / CFWS] CRLF
-
-       resent-from     =   "Resent-From:" mailbox-list CRLF
-       resent-sender   =   "Resent-Sender:" mailbox CRLF
-       resent-to       =   "Resent-To:" address-list CRLF
-       resent-cc       =   "Resent-Cc:" address-list CRLF
-       resent-bcc      =   "Resent-Bcc:" [address-list / CFWS] CRLF
-
-       obs-from        =   "From" *WSP ":" mailbox-list CRLF
-       obs-sender      =   "Sender" *WSP ":" mailbox CRLF
-       obs-reply-to    =   "Reply-To" *WSP ":" address-list CRLF
-
-       obs-to          =   "To" *WSP ":" address-list CRLF
-       obs-cc          =   "Cc" *WSP ":" address-list CRLF
-       obs-bcc         =   "Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
-
-       obs-resent-from =   "Resent-From" *WSP ":" mailbox-list CRLF
-       obs-resent-send =   "Resent-Sender" *WSP ":" mailbox CRLF
-       obs-resent-date =   "Resent-Date" *WSP ":" date-time CRLF
-       obs-resent-to   =   "Resent-To" *WSP ":" address-list CRLF
-       obs-resent-cc   =   "Resent-Cc" *WSP ":" address-list CRLF
-       obs-resent-bcc  =   "Resent-Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
-       obs-resent-mid  =   "Resent-Message-ID" *WSP ":" msg-id CRLF
-       obs-resent-rply =   "Resent-Reply-To" *WSP ":" address-list CRLF
+    Email::Address is good at parsing addresses out of any text even
+    mentioned headers and this module is derived work from Email::Address.
+
+    However, mentioned headers are structured and contain lists of
+    addresses. Most of the time you want to parse it from start to end
+    keeping every bit even if it's a invalid input.
+
+METHODS
+  parse
+    A class method that takes a header value (w/o name and :) and a set of
+    named options. See below.
+
+    Returns list of hashes. Each hash at least has 'type' key that describes
+    the entry. Types:
+
+    mailbox
+        A mailbox entry with Email::Address object under value key.
+
+        If mailbox has obsolete parts then 'obsolete' is true.
+
+        If address (not display-name/phrase or comments, but
+        local-part at domain) contains not ASCII chars then 'not_ascii' is set
+        to true. According to RFC 5322 not ASCII chars are not allowed
+        within mailbox. However, there are no big problems if those are used
+        and actually RFC 6532 extends a few rules from 5322 with
+        UTF8-non-ascii. Either use the feature or just skip such addresses
+        with skip_not_ascii option.
+
+    group start
+        Some headers with mailboxes may contain groupped addresses. This
+        element is returned for position where group starts. Under value key
+        you find name of the group. NOTE that value is not post processed at
+        the moment, so it may contain spaces, comments, quoted strings and
+        other noise. Author willing to take patches and warns that this will
+        be changed at some point without additional notifications, so if you
+        need groups info then you better send a patch :)
+
+        Groups can not be nested, but one field may have multiple groups or
+        mix of addresses that are in a group and not in any.
+
+        See skip_groups option.
+
+    group end
+        Returned when a group ends.
+
+    comment
+        Obsolete syntax allows to use standalone comments between mailboxes
+        that can not be addressed to any mailbox. In such situations a
+        comment returned as an entry of this type. Comment itself is under
+        value.
+
+    unknown
+        Returned if parser met something that shouldn't be there. Parser
+        tries to recover by jumping over to next comma (or semicolon if
+        inside group) that is out quoted string or comment, so "foo, bar,
+        baz" string results in three unknown entries. Jumping over comments
+        and quoted strings means that parser is very sensitive to unbalanced
+        quotes and parens, but it's on purpose.
+
+    It can be controlled which elements are skipped, for example:
+
+        Email::Address::List->parse($line, skip_unknown => 1, ...);
+
+    skip_comments
+        Skips comments between mailboxes. Comments inside and next to a
+        mailbox are not skipped, but returned as part of mailbox entry.
+
+    skip_not_ascii
+        Skips mailboxes where address part has not ASCII characters.
+
+    skip_groups
+        Skips group starts and end elements, however emails within groups
+        are still returned.
+
+    skip_unknown
+        Skip anything that is not recognizable. It still tries to recover as
+        described earlier.
 
diff --git a/lib/Email/Address/List.pm b/lib/Email/Address/List.pm
index 7977ce7..ae65c75 100644
--- a/lib/Email/Address/List.pm
+++ b/lib/Email/Address/List.pm
@@ -20,6 +20,100 @@ L<Email::Address> is good at parsing addresses out of any text
 even mentioned headers and this module is derived work
 from Email::Address.
 
+However, mentioned headers are structured and contain lists
+of addresses. Most of the time you want to parse it from start
+to end keeping every bit even if it's a invalid input.
+
+=head1 METHODS
+
+=head2 parse
+
+A class method that takes a header value (w/o name and :) and
+a set of named options. See below.
+
+Returns list of hashes. Each hash at least has 'type' key that
+describes the entry. Types:
+
+=over 4
+
+=item mailbox
+
+A mailbox entry with L<Email::Address> object under value key.
+
+If mailbox has obsolete parts then 'obsolete' is true.
+
+If address (not display-name/phrase or comments, but
+local-part at domain) contains not ASCII chars then 'not_ascii' is
+set to true. According to RFC 5322 not ASCII chars are not
+allowed within mailbox. However, there are no big problems if
+those are used and actually RFC 6532 extends a few rules
+from 5322 with UTF8-non-ascii. Either use the feature or just
+skip such addresses with skip_not_ascii option.
+
+=item group start
+
+Some headers with mailboxes may contain groupped addresses. This
+element is returned for position where group starts. Under value
+key you find name of the group. B<NOTE> that value is not post
+processed at the moment, so it may contain spaces, comments,
+quoted strings and other noise. Author willing to take patches
+and warns that this will be changed at some point without additional
+notifications, so if you need groups info then you better send a
+patch :)
+
+Groups can not be nested, but one field may have multiple groups or
+mix of addresses that are in a group and not in any.
+
+See skip_groups option.
+
+=item group end
+
+Returned when a group ends.
+
+=item comment
+
+Obsolete syntax allows to use standalone comments between mailboxes
+that can not be addressed to any mailbox. In such situations a comment
+returned as an entry of this type. Comment itself is under value.
+
+=item unknown
+
+Returned if parser met something that shouldn't be there. Parser
+tries to recover by jumping over to next comma (or semicolon if inside
+group) that is out quoted string or comment, so "foo, bar, baz" string
+results in three unknown entries. Jumping over comments and quoted strings
+means that parser is very sensitive to unbalanced quotes and parens,
+but it's on purpose.
+
+=back
+
+It can be controlled which elements are skipped, for example:
+
+    Email::Address::List->parse($line, skip_unknown => 1, ...);
+
+=over 4
+
+=item skip_comments
+
+Skips comments between mailboxes. Comments inside and next to a mailbox
+are not skipped, but returned as part of mailbox entry.
+
+=item skip_not_ascii
+
+Skips mailboxes where address part has not ASCII characters.
+
+=item skip_groups
+
+Skips group starts and end elements, however emails within groups are
+still returned.
+
+=item skip_unknown
+
+Skip anything that is not recognizable. It still tries to recover as
+described earlier.
+
+=back
+
 =cut
 
 #   mailbox         =   name-addr / addr-spec
@@ -81,7 +175,7 @@ for (1 .. $COMMENT_NEST_LEVEL) {
 $RE{'cfws'}           = qr/$RE{'comment'}|\s+/;
 
 $RE{'qcontent'}       = qr/$RE{'qtext'}|$RE{'quoted_pair'}/;
-$RE{'quoted_string'}  = qr/$RE{'cfws'}*"$RE{'qcontent'}+"$RE{'cfws'}*/;
+$RE{'quoted-string'}  = qr/$RE{'cfws'}*"$RE{'qcontent'}+"$RE{'cfws'}*/;
 
 $RE{'atom'}           = qr/$RE{'cfws'}*$RE{'atext'}+$RE{'cfws'}*/;
 
@@ -91,12 +185,11 @@ $RE{'display-name'}   = $RE{'phrase'};
 
 $RE{'dot_atom_text'}  = qr/$RE{'atext'}+(?:\.$RE{'atext'}+)*/;
 $RE{'dot_atom'}       = qr/$RE{'cfws'}*$RE{'dot_atom_text'}$RE{'cfws'}*/;
-$RE{'local-part'}     = qr/$RE{'dot_atom'}|$RE{'quoted_string'}/;
+$RE{'local-part'}     = qr/$RE{'dot_atom'}|$RE{'quoted-string'}/;
 
 $RE{'dcontent'}       = qr/$RE{'dtext'}|$RE{'quoted_pair'}/;
 $RE{'domain_literal'} = qr/$RE{'cfws'}*\[(?:\s*$RE{'dcontent'})*\s*\]$RE{'cfws'}*/;
 $RE{'domain'}         = qr/$RE{'dot_atom'}|$RE{'domain_literal'}/;
-$RE{'obs-domain'}     = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
 
 $RE{'addr-spec'}      = qr/$RE{'local-part'}\@$RE{'domain'}/;
 $RE{'angle-addr'}     = qr/$RE{'cfws'}* < $RE{'addr-spec'} > $RE{'cfws'}*/x;
@@ -145,7 +238,11 @@ sub parse {
         # in obs- case we have number of optional comments/spaces/
         # address-list    =   (address *("," address)) / obs-addr-list
         # obs-addr-list   =   *([CFWS] ",") address *("," [address / CFWS]))
-        $line =~ s/^(?:$RE{'cfws'}?,)+//o;
+        if ( $line =~ s/^(?:($RE{'cfws'})?,)//o ) {
+            push @res, {type => 'comment', value => $1 }
+                if $1 && !$args{'skip_comments'} && $1 =~ /($RE{'comment'})/;
+            next;
+        }
         $line =~ s/^\s+//o;
 
         # now it's only comma separated address where address is:
@@ -155,10 +252,13 @@ sub parse {
         # group           =   display-name ":" [group-list] ";" [CFWS]
         # group-list      =   mailbox-list / CFWS / obs-group-list
         # obs-group-list  =   1*([CFWS] ",") [CFWS])
-        if ( !$in_group && $line =~ s/^$RE{'display-name'}://o ) {
+        if ( !$in_group && $line =~ s/^($RE{'display-name'})://o ) {
+            push @res, {type => 'group start', value => $1 }
+                unless $args{'skip_groups'};
             $in_group = 1; next;
         }
         if ( $in_group && $line =~ s/^;// ) {
+            push @res, {type => 'group end'} unless $args{'skip_groups'};
             $in_group = 0; next;
         }
 
@@ -168,33 +268,43 @@ sub parse {
 
         # so address-list is now comma separated list of mailboxes:
         # address-list    = (mailbox *("," mailbox))
-        if ( $line =~ s/^($CRE{'mailbox'})//o ) {
-            my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
-                $1,$2,$3,$4,$5,$6,$7
-            );
-            push @res, Email::Address->new(
-                $phrase, "$user\@$host", join(' ', grep defined, @comments),
-                $original,
-            );
-            next;
-        }
-        elsif ( $line =~ s/^($CRE{'obs-mailbox'})//o ) {
+        my $obsolete = 0;
+        if ( $line =~ s/^($CRE{'mailbox'})($RE{cfws}*)(?=,|;|$)//o
+            || ($line =~ s/^($CRE{'obs-mailbox'})($RE{cfws}*)(?=,|;|$)//o and $obsolete = 1)
+        ) {
             my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
-                $1,$2,$3,$4,$5,$6,$7
-            );
-            push @res, Email::Address->new(
-                $phrase, "$user\@$host", join(' ', grep defined, @comments),
-                $original,
+                $1,$2,$3,$4,$5,$6,$7,$8,$9
             );
+            my $not_ascii = "$user\@$host" =~ /\P{ASCII}/? 1 : 0;
+            next if $not_ascii && $args{skip_not_ascii};
+
+            push @res, {
+                type => 'mailbox',
+                value => Email::Address->new(
+                    $phrase, "$user\@$host", join(' ', @comments),
+                    $original,
+                ),
+                obsolete => $obsolete,
+                not_ascii => $not_ascii,
+            };
             next;
         }
 
         # if we got here then something unknown on our way
         # try to recorver
-        if ( $line =~ s/^(.+?)\s*(?=(;)|,|$)//o ) {
-            push @res, { type => 'unknown', value => $1 };
-            if ($2) { $in_group = 1 }
+        if ($in_group) {
+            if ( $line =~ s/^([^;,"\)]*(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^;,"\)]*)*)(?=;|,)//o ) {
+                push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
+                next;
+            }
+        } else {
+            if ( $line =~ s/^([^,"\)]*(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^,"\)]*)*)(?=,)//o ) {
+                push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
+                next;
+            }
         }
+        push @res, { type => 'unknown', value => $line } unless $args{'skip_unknown'};
+        last;
     }
     return @res;
 }
@@ -217,8 +327,8 @@ sub _process_mailbox {
 
     my @comments;
     foreach ( grep defined, splice @rest ) {
-        s{ ($RE{'quoted_string'}) | ($RE{comment}) }
-         { $1? $1 : do { push @comments, $2; $comments[-1]=~/^\s|\s$/? ' ' : '' } }xgoe;
+        s{ ($RE{'quoted-string'}) | ($RE{comment}) }
+         { $1? $1 : do { push @comments, $2; $comments[-1] =~ /^\s|\s$/? ' ' : '' } }xgoe;
         s/^\s+//; s/\s+$//;
         next unless length;
 
@@ -227,13 +337,13 @@ sub _process_mailbox {
     my ($host, $user, $phrase) = reverse @rest;
 
     # deal with spaces out of quoted strings
-    s{ ($RE{'quoted_string'}) | \s+ }{ $1? $1 : ' ' }xgoe
+    s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : ' ' }xgoe
         foreach grep defined, $phrase;
-    s{ ($RE{'quoted_string'}) | \s+ }{ $1? $1 : '' }xgoe
+    s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : '' }xgoe
         foreach $user, $host;
 
     # dequote
-    s{ ($RE{'quoted_string'}) }{ $dequote->($1) }xgoe
+    s{ ($RE{'quoted-string'}) }{ $dequote->($1) }xgoe
         foreach grep defined, $phrase, $user;
     $user = $quote->($user) unless $user =~ /^$RE{'dot_atom'}$/;
 
diff --git a/t/basics.t b/t/basics.t
index fe27ff4..0cdad23 100644
--- a/t/basics.t
+++ b/t/basics.t
@@ -2,10 +2,9 @@ use strict; use warnings;
 use Test::More tests => 3;
 use_ok 'Email::Address::List';
 
-use Scalar::Util qw(blessed);
 {
     my @addresses = Email::Address::List->parse(q{ruz at bestpractical.com});
     is scalar @addresses, 1;
-    is $addresses[0]->format, q{ruz at bestpractical.com};
+    is $addresses[0]{'value'}->format, q{ruz at bestpractical.com};
 }
 
diff --git a/t/generate.pl b/t/generate.pl
index 5ce4fa4..dfbc41c 100644
--- a/t/generate.pl
+++ b/t/generate.pl
@@ -1,7 +1,6 @@
 use strict; use warnings; use autodie; use lib 'lib/';
 use Email::Address::List;
 
-my $file = 't/data/RFC5233.single.valid.txt';
 foreach my $file (qw(t/data/RFC5233.single.valid.txt t/data/RFC5233.single.obs.txt)) {
     process_file($file);
 }
diff --git a/t/invalid.t b/t/invalid.t
new file mode 100644
index 0000000..4a00639
--- /dev/null
+++ b/t/invalid.t
@@ -0,0 +1,28 @@
+use strict; use warnings;
+use Test::More tests => 4;
+use_ok 'Email::Address::List';
+
+run_test('root', [{type => 'unknown', value => 'root'}]);
+run_test(
+    'boo at boo, root, foo at foo',
+    [
+        {type => 'mailbox', value => 'boo at boo', obsolete => 0, not_ascii => 0},
+        {type => 'unknown', value => 'root'},
+        {type => 'mailbox', value => 'foo at foo', obsolete => 0, not_ascii => 0},
+    ],
+);
+run_test(
+    '"Doe, John" foo at foo, root',
+    [
+        {type => 'unknown', value => '"Doe, John" foo at foo' },
+        {type => 'unknown', value => 'root'},
+    ],
+);
+
+sub run_test {
+    my $line = shift;
+    my @list = Email::Address::List->parse($line);
+    $_->{'value'} .= '' foreach grep defined $_->{'value'}, @list;
+    is_deeply( \@list, shift );
+}
+
diff --git a/t/random.combinations.t b/t/random.combinations.t
new file mode 100644
index 0000000..1b1b2a4
--- /dev/null
+++ b/t/random.combinations.t
@@ -0,0 +1,60 @@
+use strict;
+use warnings;
+
+use Test::More;
+use JSON ();
+
+use_ok('Email::Address::List');
+
+my @data;
+foreach my $file (qw(t/data/RFC5233.single.valid.json t/data/RFC5233.single.obs.json)) {
+    my $obsolete = $file =~ /\bobs\b/? 1 : 0;
+
+    open my $fh, '<', $file;
+    push @data, @{ JSON->new->decode( do { local $/; <$fh> } ) };
+    close $fh;
+}
+
+diag "srand is ". (my $seed = int rand( 2**16-1 ));
+srand($seed);
+
+for (1..100) {
+    my @list;
+    push @list, $data[ rand @data ] for 1..3;
+
+    my $line = join ', ', map $_->{'mailbox'}, @list;
+    note $line;
+
+    my @res = Email::Address::List->parse( $line );
+    is scalar @res, scalar @list;
+
+    for (my $i = 0; $i < @list; $i++) {
+        my $test = $list[$i];
+        my $v = $res[$i]{'value'};
+        is $v->phrase, $test->{'display-name'}, 'correct value';
+        is $v->address, $test->{'address'}, 'correct value';
+        is $v->comment, join( ' ', @{$test->{'comments'}} ), 'correct value';
+    }
+}
+
+for (1..100) {
+    my @list;
+    push @list, $data[ rand @data ] for 1..3;
+
+    my $line = join ",\n ,", '', (map $_->{'mailbox'}, @list), '';
+    note $line;
+
+    my @res = Email::Address::List->parse( $line );
+    is scalar @res, scalar @list;
+
+    for (my $i = 0; $i < @list; $i++) {
+        my $test = $list[$i];
+        my $v = $res[$i]{'value'};
+        is $v->phrase, $test->{'display-name'}, 'correct value';
+        is $v->address, $test->{'address'}, 'correct value';
+        is $v->comment, join( ' ', @{$test->{'comments'}} ), 'correct value';
+    }
+}
+
+done_testing;
+
diff --git a/t/single.suit.t b/t/single.suit.t
new file mode 100644
index 0000000..f00a9d9
--- /dev/null
+++ b/t/single.suit.t
@@ -0,0 +1,28 @@
+use strict;
+use warnings;
+
+use Test::More;
+use JSON ();
+
+use_ok('Email::Address::List');
+
+foreach my $file (qw(t/data/RFC5233.single.valid.json t/data/RFC5233.single.obs.json)) {
+    my $obsolete = $file =~ /\bobs\b/? 1 : 0;
+
+    open my $fh, '<', $file;
+    my $tests = JSON->new->decode( do { local $/; <$fh> } );
+    close $fh;
+
+    foreach my $test ( @$tests ) {
+        note $test->{'description'};
+        my @list = Email::Address::List->parse( $test->{'mailbox'} );
+        is scalar @list, 1, "one entry in result set" or do { use Data::Dumper; diag Dumper \@list };
+        is $list[0]{'type'}, 'mailbox', 'one mailbox';
+        my $v = $list[0]{'value'};
+        is $v->phrase, $test->{'display-name'}, 'correct value';
+        is $v->address, $test->{'address'}, 'correct value';
+        is $v->comment, join( ' ', @{$test->{'comments'}} ), 'correct value';
+    }
+}
+
+done_testing();

-----------------------------------------------------------------------



More information about the Bps-public-commit mailing list