[Bps-public-commit] email-address-list branch, master, updated. 4341cba0fe0fb1207f914bf83f26e9bec1142bb4
Ruslan Zakirov
ruz at bestpractical.com
Fri Nov 16 15:20:12 EST 2012
The branch, master has been updated
via 4341cba0fe0fb1207f914bf83f26e9bec1142bb4 (commit)
via 2763bb517f1671c72e8f6a8daa5752c617055514 (commit)
via 2a6c01ebd65be19e679eee9254cd184bef14d0ec (commit)
from 9bf352154206c4272a357cf241de42b1761cbf09 (commit)
Summary of changes:
README | 119 ++++++++++------
lib/Email/Address/List.pm | 286 +++++++++++++++++++++++++++------------
t/basics.t | 3 +-
t/data/RFC5233.single.obs.json | 44 ++++++
t/data/RFC5233.single.obs.txt | 5 +-
t/data/RFC5233.single.valid.json | 49 ++++++-
t/data/RFC5233.single.valid.txt | 12 +-
t/generate.pl | 9 +-
t/invalid.t | 28 ++++
t/random.combinations.t | 60 ++++++++
t/single.suit.t | 28 ++++
11 files changed, 500 insertions(+), 143 deletions(-)
create mode 100644 t/invalid.t
create mode 100644 t/random.combinations.t
create mode 100644 t/single.suit.t
- Log -----------------------------------------------------------------
commit 2a6c01ebd65be19e679eee9254cd184bef14d0ec
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date: Wed Nov 14 19:31:11 2012 +0400
mass update
* simplify post processing
* full support of obsolete syntax
* dequote phrase and local-part
* normalize phrase and local-part quoting
diff --git a/lib/Email/Address/List.pm b/lib/Email/Address/List.pm
index 6d2078f..7977ce7 100644
--- a/lib/Email/Address/List.pm
+++ b/lib/Email/Address/List.pm
@@ -71,6 +71,7 @@ $RE{'quoted_pair'} = qr/\\$RE{'text'}/;
$RE{'atext'} = qr/[^$RE{'CTL'}$RE{'special'}\s]/;
$RE{'ctext'} = qr/(?>[^()\\]+)/;
$RE{'qtext'} = qr/[^\\"]/;
+$RE{'dtext'} = qr/[^\[\]\\]/;
($RE{'ccontent'}, $RE{'comment'}) = (q{})x2;
for (1 .. $COMMENT_NEST_LEVEL) {
@@ -82,62 +83,53 @@ $RE{'cfws'} = qr/$RE{'comment'}|\s+/;
$RE{'qcontent'} = qr/$RE{'qtext'}|$RE{'quoted_pair'}/;
$RE{'quoted_string'} = qr/$RE{'cfws'}*"$RE{'qcontent'}+"$RE{'cfws'}*/;
-# by the spec:
-# word = atom / quoted-string = qr/$RE{'atom'}|$RE{'quoted_string'}/;
-# atom = [CFWS] 1*atext [CFWS] = qr/$RE{'cfws'}*$RE{'atext'}+$RE{'cfws'}*/;
-# with some inlining:
-# word = qr/$RE{'cfws'}* (?:$RE{'atom'} | "$RE{'qcontent'}+") $RE{'cfws'}*/x;
-# however:
-# phrase = 1*word / obs-phrase
-# obs-phrase = word *(word / "." / CFWS)
-# after combining:
-# phrase = word *(word / "." / CFWS)
-
$RE{'atom'} = qr/$RE{'cfws'}*$RE{'atext'}+$RE{'cfws'}*/;
-$CRE{'atom'} = qr/($RE{'cfws'}*)($RE{'atext'}+)($RE{'cfws'}*)/;
-$RE{'word'} = qr/$RE{'cfws'}* (?: $RE{'atom'} | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
-$RE{'dword'} = qr/$RE{'cfws'}* (?: $RE{'atom'} | \. | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
-$CRE{'dword'} = qr/($RE{'cfws'}*) (?: ($RE{'atom'} | \.) | "($RE{'qcontent'}+)" ) ($RE{'cfws'}*)/x;
-$RE{'phrase'} = qr/$RE{'word'} $RE{'dword'}*/x;
-$RE{'display_name'} = $RE{'phrase'};
+$RE{'word'} = qr/$RE{'cfws'}* (?: $RE{'atom'} | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
+$RE{'phrase'} = qr/$RE{'word'}+/x;
+$RE{'display-name'} = $RE{'phrase'};
$RE{'dot_atom_text'} = qr/$RE{'atext'}+(?:\.$RE{'atext'}+)*/;
$RE{'dot_atom'} = qr/$RE{'cfws'}*$RE{'dot_atom_text'}$RE{'cfws'}*/;
-$RE{'local_part'} = qr/$RE{'dot_atom'}|$RE{'quoted_string'}/;
+$RE{'local-part'} = qr/$RE{'dot_atom'}|$RE{'quoted_string'}/;
-$RE{'dtext'} = qr/[^\[\]\\]/;
$RE{'dcontent'} = qr/$RE{'dtext'}|$RE{'quoted_pair'}/;
$RE{'domain_literal'} = qr/$RE{'cfws'}*\[(?:\s*$RE{'dcontent'})*\s*\]$RE{'cfws'}*/;
$RE{'domain'} = qr/$RE{'dot_atom'}|$RE{'domain_literal'}/;
-$CRE{'domain'} = qr/
- ($RE{'cfws'}*)
- ($RE{'dot_atom_text'}|\[(?:\s*$RE{'dcontent'})*\s*\])
- ($RE{'cfws'}*)
-/x;
+$RE{'obs-domain'} = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
-$RE{'addr_spec'} = qr/$RE{'local_part'}\@$RE{'domain'}/;
-$CRE{'addr_spec'} = qr/
- ($RE{'cfws'}*)
- ($RE{'dot_atom_text'}|"$RE{'qcontent'}+")
- ($RE{'cfws'}*)
- \@$CRE{'domain'}
+$RE{'addr-spec'} = qr/$RE{'local-part'}\@$RE{'domain'}/;
+$RE{'angle-addr'} = qr/$RE{'cfws'}* < $RE{'addr-spec'} > $RE{'cfws'}*/x;
+
+$RE{'name-addr'} = qr/$RE{'display-name'}?$RE{'angle-addr'}/;
+$RE{'mailbox'} = qr/(?:$RE{'name-addr'}|$RE{'addr-spec'})$RE{'comment'}*/;
+
+$CRE{'addr-spec'} = qr/($RE{'local-part'})\@($RE{'domain'})/;
+$CRE{'mailbox'} = qr/
+ (?:
+ ($RE{'display-name'})?($RE{'cfws'}*)<$CRE{'addr-spec'}>($RE{'cfws'}*)
+ |$CRE{'addr-spec'}
+ )($RE{'comment'}*)
/x;
-$RE{'obs-route'} = qr/
+
+$RE{'dword'} = qr/$RE{'cfws'}* (?: $RE{'atom'} | \. | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
+$RE{'obs-phrase'} = qr/$RE{'word'} $RE{'dword'}*/x;
+$RE{'obs-display-name'} = $RE{'obs-phrase'};
+$RE{'obs-route'} = qr/
(?:$RE{'cfws'}|,)*
\@$RE{'domain'}
(?:,$RE{'cfws'}?(?:\@$RE{'domain'})?)*
:
/x;
-$RE{'angle_addr'} = qr/$RE{'cfws'}* < $RE{'obs-route'}? $RE{'addr_spec'} > $RE{'cfws'}*/x;
-
-$RE{'name_addr'} = qr/$RE{'display_name'}?$RE{'angle_addr'}/;
-$RE{'mailbox'} = qr/(?:$RE{'name_addr'}|$RE{'addr_spec'})$RE{'comment'}*/;
-
-$CRE{'mailbox'} = qr/
+$RE{'obs-domain'} = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
+$RE{'obs-local-part'} = qr/$RE{'word'}(?:\.$RE{'word'})*/;
+$RE{'obs-addr-spec'} = qr/$RE{'obs-local-part'}\@$RE{'obs-domain'}/;
+$CRE{'obs-addr-spec'} = qr/($RE{'obs-local-part'})\@($RE{'obs-domain'})/;
+$CRE{'obs-mailbox'} = qr/
(?:
- ($RE{'display_name'})?($RE{'cfws'}*)< $RE{'obs-route'}? ($RE{'addr_spec'})>($RE{'cfws'}*)
- |($RE{'addr_spec'})
+ ($RE{'obs-display-name'})?
+ ($RE{'cfws'}*)< $RE{'obs-route'}? $CRE{'obs-addr-spec'} >($RE{'cfws'}*)
+ |$CRE{'obs-addr-spec'}
)($RE{'comment'}*)
/x;
@@ -163,7 +155,7 @@ sub parse {
# group = display-name ":" [group-list] ";" [CFWS]
# group-list = mailbox-list / CFWS / obs-group-list
# obs-group-list = 1*([CFWS] ",") [CFWS])
- if ( !$in_group && $line =~ s/^$RE{'display_name'}://o ) {
+ if ( !$in_group && $line =~ s/^$RE{'display-name'}://o ) {
$in_group = 1; next;
}
if ( $in_group && $line =~ s/^;// ) {
@@ -186,6 +178,16 @@ sub parse {
);
next;
}
+ elsif ( $line =~ s/^($CRE{'obs-mailbox'})//o ) {
+ my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
+ $1,$2,$3,$4,$5,$6,$7
+ );
+ push @res, Email::Address->new(
+ $phrase, "$user\@$host", join(' ', grep defined, @comments),
+ $original,
+ );
+ next;
+ }
# if we got here then something unknown on our way
# try to recorver
@@ -197,50 +199,46 @@ sub parse {
return @res;
}
+my $dequote = sub {
+ local $_ = shift;
+ s/^"//; s/"$//; s/\\(.)/$1/g;
+ return "$_";
+};
+my $quote = sub {
+ local $_ = shift;
+ s/([\\"])/\\$1/g;
+ return qq{"$_"};
+};
+
sub _process_mailbox {
- my $self = $_[0];
- my ($original, $phrase) = ($_[1],$_[2]);
- my $address = $_[4] || $_[6];
- my @rest = ($_[3],$_[5],$_[7]);
+ my $self = shift;
+ my $original = shift;
+ my @rest = (@_);
my @comments;
+ foreach ( grep defined, splice @rest ) {
+ s{ ($RE{'quoted_string'}) | ($RE{comment}) }
+ { $1? $1 : do { push @comments, $2; $comments[-1]=~/^\s|\s$/? ' ' : '' } }xgoe;
+ s/^\s+//; s/\s+$//;
+ next unless length;
- if ( $phrase ) {
- my @tmp = $phrase =~ /$CRE{'dword'}/go; # must match everything
- $phrase = '';
- while ( my ($lcomment, $text, $quoted, $rcomment) = splice @tmp, 0, 4 ) {
- $phrase .= ' ' if $lcomment =~ /^\s|\s$/ && $phrase !~ /\s$/;
- push @comments, $lcomment;
- if (defined $text) {
- $text =~ s{($RE{'comment'})}{
- push @comments, $1; $comments[-1]=~ /^\s|\s$/? ' ':''
- }xgeo;
- $text =~ s/\s+/ /g;
- $phrase .= $text;
- } else {
- $quoted =~ s/\\(.)/$1/g;
- $phrase .= $quoted;
- }
- $phrase .= ' ' if $rcomment =~ /^\s|\s$/ && $phrase !~ /\s$/;
- push @comments, $rcomment;
- }
+ push @rest, $_;
}
- push @comments, shift @rest;
+ my ($host, $user, $phrase) = reverse @rest;
- my ($user, $host);
- {
- $address =~ /^$CRE{'addr_spec'}$/;
- ($user, $host) = ($2, $5);
- push @comments, $1, $3, $4, $6;
- }
- push @comments, splice @rest;
+ # deal with spaces out of quoted strings
+ s{ ($RE{'quoted_string'}) | \s+ }{ $1? $1 : ' ' }xgoe
+ foreach grep defined, $phrase;
+ s{ ($RE{'quoted_string'}) | \s+ }{ $1? $1 : '' }xgoe
+ foreach $user, $host;
- for ( $phrase, $user, $host, @comments ) {
- next unless defined $_;
- s/^\s+//;
- s/\s+$//;
- $_ = undef unless length $_;
- }
- return $original, $phrase, $user, $host, grep defined, @comments;
+ # dequote
+ s{ ($RE{'quoted_string'}) }{ $dequote->($1) }xgoe
+ foreach grep defined, $phrase, $user;
+ $user = $quote->($user) unless $user =~ /^$RE{'dot_atom'}$/;
+
+ @comments = grep length, map { s/^\s+//; s/\s+$//; $_ } grep defined, @comments;
+ return $original, $phrase, $user, $host, @comments;
}
+
diff --git a/t/data/RFC5233.single.obs.json b/t/data/RFC5233.single.obs.json
index 1d6f980..7810c3c 100644
--- a/t/data/RFC5233.single.obs.json
+++ b/t/data/RFC5233.single.obs.json
@@ -16,11 +16,49 @@
"display-name" : "Mary Smith"
},
{
+ "domain" : "test.example",
+ "address" : "jdoe at test.example",
+ "mailbox" : "jdoe at test . example",
+ "comments" : [],
+ "description" : "Appendix A.6.1. Obsolete Addressing",
+ "display-name" : null
+ },
+ {
+ "domain" : "machine.example",
+ "address" : "jdoe at machine.example",
+ "mailbox" : "John Doe <jdoe at machine(comment). example>",
+ "comments" : [
+ "(comment)"
+ ],
+ "description" : "Appendix A.6.3. Obsolete White Space and Comments",
+ "display-name" : "John Doe"
+ },
+ {
"domain" : "example.net",
"address" : "mary at example.net",
"mailbox" : "Mary Smith\n \n <mary at example.net>",
"comments" : [],
"description" : "Appendix A.6.3. Obsolete White Space and Comments",
"display-name" : "Mary Smith"
+ },
+ {
+ "domain" : "domain.com",
+ "address" : "baz.zoo at domain.com",
+ "mailbox" : "(c1) foo (c2) bar (c3) < (c4) baz (c5) . (c6) zoo (c7) @ (c8) domain (c9) . (c10) com (c11) >",
+ "comments" : [
+ "(c1)",
+ "(c2)",
+ "(c3)",
+ "(c4)",
+ "(c5)",
+ "(c6)",
+ "(c7)",
+ "(c8)",
+ "(c9)",
+ "(c10)",
+ "(c11)"
+ ],
+ "description" : "comments all over the place",
+ "display-name" : "foo bar"
}
]
diff --git a/t/data/RFC5233.single.obs.txt b/t/data/RFC5233.single.obs.txt
index 593621c..e13b3ab 100644
--- a/t/data/RFC5233.single.obs.txt
+++ b/t/data/RFC5233.single.obs.txt
@@ -13,4 +13,7 @@ John Doe <jdoe at machine(comment). example>
# Appendix A.6.3. Obsolete White Space and Comments
Mary Smith
- <mary at example.net>
\ No newline at end of file
+ <mary at example.net>
+
+# comments all over the place
+(c1) foo (c2) bar (c3) < (c4) baz (c5) . (c6) zoo (c7) @ (c8) domain (c9) . (c10) com (c11) >
\ No newline at end of file
diff --git a/t/data/RFC5233.single.valid.json b/t/data/RFC5233.single.valid.json
index 219ceeb..6c7e905 100644
--- a/t/data/RFC5233.single.valid.json
+++ b/t/data/RFC5233.single.valid.json
@@ -104,11 +104,19 @@
"display-name" : "()<>[]:;@\\,.\""
},
{
- "domain" : "x.example.com",
- "address" : "fbaz at x.example.com",
- "mailbox" : "Foo B. Baz <fbaz at x.example.com>",
- "comments" : [],
- "description" : "display-name with obs-phrase (with dots)",
- "display-name" : "Foo B. Baz"
+ "domain" : "domain.com",
+ "address" : "baz at domain.com",
+ "mailbox" : "(c1)foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >\n",
+ "comments" : [
+ "(c1)",
+ "(c2)",
+ "(c3)",
+ "(c4)",
+ "(c5)",
+ "(c6)",
+ "(c7)"
+ ],
+ "description" : "comments with spaces in every possible possition",
+ "display-name" : "foo bar"
}
]
diff --git a/t/data/RFC5233.single.valid.txt b/t/data/RFC5233.single.valid.txt
index bb7a8fa..9f2310d 100644
--- a/t/data/RFC5233.single.valid.txt
+++ b/t/data/RFC5233.single.valid.txt
@@ -31,10 +31,8 @@ Chris Jones <c@(Chris's host.)public.example>
# Appendix A.1.2. Different Types of Mailboxes
John <jdoe at one.test> (my dear friend)
-
# special in display-name with quoting
"()<>[]:;@\\,.\"" <foo at bar.biz>
-# display-name with obs-phrase (with dots)
-Foo B. Baz <fbaz at x.example.com>
-
+# comments with spaces in every possible possition
+(c1)foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >
diff --git a/t/generate.pl b/t/generate.pl
index b986a19..eb53fde 100644
--- a/t/generate.pl
+++ b/t/generate.pl
@@ -25,8 +25,10 @@ sub process_file {
my @parse;
unless ( @parse = ($mailbox =~ /^($CRE{'mailbox'})$/) ) {
- warn "Failed to parse $mailbox";
- next;
+ unless ( @parse = ($mailbox =~ /^($CRE{'obs-mailbox'})$/) ) {
+ warn "Failed to parse $mailbox";
+ next;
+ }
}
my (undef, $display_name, $local_part, $domain, @comments)
commit 2763bb517f1671c72e8f6a8daa5752c617055514
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date: Wed Nov 14 20:06:47 2012 +0400
more tests in data in suite
diff --git a/t/data/RFC5233.single.obs.json b/t/data/RFC5233.single.obs.json
index 7810c3c..15f3b04 100644
--- a/t/data/RFC5233.single.obs.json
+++ b/t/data/RFC5233.single.obs.json
@@ -1,6 +1,7 @@
[
{
"domain" : "example.com",
+ "local-part" : "john.q.public",
"address" : "john.q.public at example.com",
"mailbox" : "Joe Q. Public <john.q.public at example.com>",
"comments" : [],
@@ -9,6 +10,7 @@
},
{
"domain" : "example.net",
+ "local-part" : "mary",
"address" : "mary at example.net",
"mailbox" : "Mary Smith <@node.test:mary at example.net>",
"comments" : [],
@@ -17,6 +19,7 @@
},
{
"domain" : "test.example",
+ "local-part" : "jdoe",
"address" : "jdoe at test.example",
"mailbox" : "jdoe at test . example",
"comments" : [],
@@ -25,6 +28,7 @@
},
{
"domain" : "machine.example",
+ "local-part" : "jdoe",
"address" : "jdoe at machine.example",
"mailbox" : "John Doe <jdoe at machine(comment). example>",
"comments" : [
@@ -35,6 +39,7 @@
},
{
"domain" : "example.net",
+ "local-part" : "mary",
"address" : "mary at example.net",
"mailbox" : "Mary Smith\n \n <mary at example.net>",
"comments" : [],
@@ -43,6 +48,7 @@
},
{
"domain" : "domain.com",
+ "local-part" : "baz.zoo",
"address" : "baz.zoo at domain.com",
"mailbox" : "(c1) foo (c2) bar (c3) < (c4) baz (c5) . (c6) zoo (c7) @ (c8) domain (c9) . (c10) com (c11) >",
"comments" : [
diff --git a/t/data/RFC5233.single.valid.json b/t/data/RFC5233.single.valid.json
index 6c7e905..7b2efb1 100644
--- a/t/data/RFC5233.single.valid.json
+++ b/t/data/RFC5233.single.valid.json
@@ -1,6 +1,7 @@
[
{
"domain" : "machine.example",
+ "local-part" : "jdoe",
"address" : "jdoe at machine.example",
"mailbox" : "John Doe <jdoe at machine.example>",
"comments" : [],
@@ -9,6 +10,7 @@
},
{
"domain" : "example.net",
+ "local-part" : "mary",
"address" : "mary at example.net",
"mailbox" : "Mary Smith <mary at example.net>",
"comments" : [],
@@ -17,6 +19,7 @@
},
{
"domain" : "example.com",
+ "local-part" : "john.q.public",
"address" : "john.q.public at example.com",
"mailbox" : "\"Joe Q. Public\" <john.q.public at example.com>",
"comments" : [],
@@ -25,6 +28,7 @@
},
{
"domain" : "x.test",
+ "local-part" : "mary",
"address" : "mary at x.test",
"mailbox" : "Mary Smith <mary at x.test>",
"comments" : [],
@@ -33,6 +37,7 @@
},
{
"domain" : "example.org",
+ "local-part" : "jdoe",
"address" : "jdoe at example.org",
"mailbox" : "jdoe at example.org",
"comments" : [],
@@ -41,6 +46,7 @@
},
{
"domain" : "y.test",
+ "local-part" : "one",
"address" : "one at y.test",
"mailbox" : "Who? <one at y.test>",
"comments" : [],
@@ -49,6 +55,7 @@
},
{
"domain" : "nil.test",
+ "local-part" : "boss",
"address" : "boss at nil.test",
"mailbox" : "<boss at nil.test>",
"comments" : [],
@@ -57,6 +64,7 @@
},
{
"domain" : "example.net",
+ "local-part" : "sysservices",
"address" : "sysservices at example.net",
"mailbox" : "\"Giant; \\\"Big\\\" Box\" <sysservices at example.net>",
"comments" : [],
@@ -65,6 +73,7 @@
},
{
"domain" : "silly.test",
+ "local-part" : "pete",
"address" : "pete at silly.test",
"mailbox" : "Pete(A nice \\) chap) <pete(his account)@silly.test(his host)>",
"comments" : [
@@ -77,6 +86,7 @@
},
{
"domain" : "public.example",
+ "local-part" : "c",
"address" : "c at public.example",
"mailbox" : "Chris Jones <c@(Chris's host.)public.example>",
"comments" : [
@@ -87,6 +97,7 @@
},
{
"domain" : "one.test",
+ "local-part" : "jdoe",
"address" : "jdoe at one.test",
"mailbox" : "John <jdoe at one.test> (my dear friend)",
"comments" : [
@@ -97,6 +108,7 @@
},
{
"domain" : "bar.biz",
+ "local-part" : "foo",
"address" : "foo at bar.biz",
"mailbox" : "\"()<>[]:;@\\\\,.\\\"\" <foo at bar.biz>",
"comments" : [],
@@ -105,8 +117,9 @@
},
{
"domain" : "domain.com",
+ "local-part" : "baz",
"address" : "baz at domain.com",
- "mailbox" : "(c1)foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >\n",
+ "mailbox" : " (c1) foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >",
"comments" : [
"(c1)",
"(c2)",
@@ -118,5 +131,23 @@
],
"description" : "comments with spaces in every possible possition",
"display-name" : "foo bar"
+ },
+ {
+ "domain" : "bar.info",
+ "local-part" : "foo",
+ "address" : "foo at bar.info",
+ "mailbox" : "foo \"\\\\\" bar <foo at bar.info>",
+ "comments" : [],
+ "description" : "phrase with strange quoting",
+ "display-name" : "foo \"\\\" bar"
+ },
+ {
+ "domain" : "zoo.info",
+ "local-part" : "foo.bar",
+ "address" : "foo.bar at zoo.info",
+ "mailbox" : "\"foo.bar\"@zoo.info",
+ "comments" : [],
+ "description" : "local part with redundant quoting",
+ "display-name" : null
}
]
diff --git a/t/data/RFC5233.single.valid.txt b/t/data/RFC5233.single.valid.txt
index 9f2310d..3009a0d 100644
--- a/t/data/RFC5233.single.valid.txt
+++ b/t/data/RFC5233.single.valid.txt
@@ -35,4 +35,12 @@ John <jdoe at one.test> (my dear friend)
"()<>[]:;@\\,.\"" <foo at bar.biz>
# comments with spaces in every possible possition
-(c1)foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >
+ (c1) foo (c2) bar (c3) < (c4) baz (c5) @ (c6) domain.com (c7) >
+
+# phrase with strange quoting
+foo "\\" bar <foo at bar.info>
+
+# local part with redundant quoting
+"foo.bar"@zoo.info
+
+
diff --git a/t/generate.pl b/t/generate.pl
index eb53fde..5ce4fa4 100644
--- a/t/generate.pl
+++ b/t/generate.pl
@@ -37,6 +37,8 @@ sub process_file {
$res{'display-name'} = $display_name;
$res{'address'} = "$local_part\@$domain";
$res{'domain'} = $domain;
+ $res{'local-part'} = $local_part;
+ $res{'local-part'} =~ s/\\(.)/$1/g if $res{'local-part'} =~ s/^"(.*)"$/$1/;
$res{'comments'} = \@comments;
push @list, \%res;
}
commit 4341cba0fe0fb1207f914bf83f26e9bec1142bb4
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date: Sat Nov 17 00:16:52 2012 +0400
another mass update
* tests
* anchor mailbox at the end with ;|,|$ otherwise we could match less
* jump over comments and quoted strings in recover from error code
* documentation
diff --git a/README b/README
index b51aaa0..a2742ce 100644
--- a/README
+++ b/README
@@ -2,46 +2,85 @@ NAME
Email::Address::List - RFC close address list parsing
DESCRIPTION
- Parser for From, To, Cc, Bcc, Reply-To, Sender and previouse prefixed
- with Resent- prefix (eg Resent-From).
+ Parser for From, To, Cc, Bcc, Reply-To, Sender and previous prefixed
+ with Resent- (eg Resent-From) headers.
REASONING
- Email::Address is good at parsing addresses out of text and this module
- uses regexpes from the module to parse addresses.
-
- mailbox = name-addr / addr-spec
- name-addr = [display-name] angle-addr
- angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
- display-name = phrase
-
- from = "From:" mailbox-list CRLF
- sender = "Sender:" mailbox CRLF
- reply-to = "Reply-To:" address-list CRLF
-
- to = "To:" address-list CRLF
- cc = "Cc:" address-list CRLF
- bcc = "Bcc:" [address-list / CFWS] CRLF
-
- resent-from = "Resent-From:" mailbox-list CRLF
- resent-sender = "Resent-Sender:" mailbox CRLF
- resent-to = "Resent-To:" address-list CRLF
- resent-cc = "Resent-Cc:" address-list CRLF
- resent-bcc = "Resent-Bcc:" [address-list / CFWS] CRLF
-
- obs-from = "From" *WSP ":" mailbox-list CRLF
- obs-sender = "Sender" *WSP ":" mailbox CRLF
- obs-reply-to = "Reply-To" *WSP ":" address-list CRLF
-
- obs-to = "To" *WSP ":" address-list CRLF
- obs-cc = "Cc" *WSP ":" address-list CRLF
- obs-bcc = "Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
-
- obs-resent-from = "Resent-From" *WSP ":" mailbox-list CRLF
- obs-resent-send = "Resent-Sender" *WSP ":" mailbox CRLF
- obs-resent-date = "Resent-Date" *WSP ":" date-time CRLF
- obs-resent-to = "Resent-To" *WSP ":" address-list CRLF
- obs-resent-cc = "Resent-Cc" *WSP ":" address-list CRLF
- obs-resent-bcc = "Resent-Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
- obs-resent-mid = "Resent-Message-ID" *WSP ":" msg-id CRLF
- obs-resent-rply = "Resent-Reply-To" *WSP ":" address-list CRLF
+ Email::Address is good at parsing addresses out of any text even
+ mentioned headers and this module is derived work from Email::Address.
+
+ However, mentioned headers are structured and contain lists of
+ addresses. Most of the time you want to parse it from start to end
+ keeping every bit even if it's a invalid input.
+
+METHODS
+ parse
+ A class method that takes a header value (w/o name and :) and a set of
+ named options. See below.
+
+ Returns list of hashes. Each hash at least has 'type' key that describes
+ the entry. Types:
+
+ mailbox
+ A mailbox entry with Email::Address object under value key.
+
+ If mailbox has obsolete parts then 'obsolete' is true.
+
+ If address (not display-name/phrase or comments, but
+ local-part at domain) contains not ASCII chars then 'not_ascii' is set
+ to true. According to RFC 5322 not ASCII chars are not allowed
+ within mailbox. However, there are no big problems if those are used
+ and actually RFC 6532 extends a few rules from 5322 with
+ UTF8-non-ascii. Either use the feature or just skip such addresses
+ with skip_not_ascii option.
+
+ group start
+ Some headers with mailboxes may contain groupped addresses. This
+ element is returned for position where group starts. Under value key
+ you find name of the group. NOTE that value is not post processed at
+ the moment, so it may contain spaces, comments, quoted strings and
+ other noise. Author willing to take patches and warns that this will
+ be changed at some point without additional notifications, so if you
+ need groups info then you better send a patch :)
+
+ Groups can not be nested, but one field may have multiple groups or
+ mix of addresses that are in a group and not in any.
+
+ See skip_groups option.
+
+ group end
+ Returned when a group ends.
+
+ comment
+ Obsolete syntax allows to use standalone comments between mailboxes
+ that can not be addressed to any mailbox. In such situations a
+ comment returned as an entry of this type. Comment itself is under
+ value.
+
+ unknown
+ Returned if parser met something that shouldn't be there. Parser
+ tries to recover by jumping over to next comma (or semicolon if
+ inside group) that is out quoted string or comment, so "foo, bar,
+ baz" string results in three unknown entries. Jumping over comments
+ and quoted strings means that parser is very sensitive to unbalanced
+ quotes and parens, but it's on purpose.
+
+ It can be controlled which elements are skipped, for example:
+
+ Email::Address::List->parse($line, skip_unknown => 1, ...);
+
+ skip_comments
+ Skips comments between mailboxes. Comments inside and next to a
+ mailbox are not skipped, but returned as part of mailbox entry.
+
+ skip_not_ascii
+ Skips mailboxes where address part has not ASCII characters.
+
+ skip_groups
+ Skips group starts and end elements, however emails within groups
+ are still returned.
+
+ skip_unknown
+ Skip anything that is not recognizable. It still tries to recover as
+ described earlier.
diff --git a/lib/Email/Address/List.pm b/lib/Email/Address/List.pm
index 7977ce7..ae65c75 100644
--- a/lib/Email/Address/List.pm
+++ b/lib/Email/Address/List.pm
@@ -20,6 +20,100 @@ L<Email::Address> is good at parsing addresses out of any text
even mentioned headers and this module is derived work
from Email::Address.
+However, mentioned headers are structured and contain lists
+of addresses. Most of the time you want to parse it from start
+to end keeping every bit even if it's a invalid input.
+
+=head1 METHODS
+
+=head2 parse
+
+A class method that takes a header value (w/o name and :) and
+a set of named options. See below.
+
+Returns list of hashes. Each hash at least has 'type' key that
+describes the entry. Types:
+
+=over 4
+
+=item mailbox
+
+A mailbox entry with L<Email::Address> object under value key.
+
+If mailbox has obsolete parts then 'obsolete' is true.
+
+If address (not display-name/phrase or comments, but
+local-part at domain) contains not ASCII chars then 'not_ascii' is
+set to true. According to RFC 5322 not ASCII chars are not
+allowed within mailbox. However, there are no big problems if
+those are used and actually RFC 6532 extends a few rules
+from 5322 with UTF8-non-ascii. Either use the feature or just
+skip such addresses with skip_not_ascii option.
+
+=item group start
+
+Some headers with mailboxes may contain groupped addresses. This
+element is returned for position where group starts. Under value
+key you find name of the group. B<NOTE> that value is not post
+processed at the moment, so it may contain spaces, comments,
+quoted strings and other noise. Author willing to take patches
+and warns that this will be changed at some point without additional
+notifications, so if you need groups info then you better send a
+patch :)
+
+Groups can not be nested, but one field may have multiple groups or
+mix of addresses that are in a group and not in any.
+
+See skip_groups option.
+
+=item group end
+
+Returned when a group ends.
+
+=item comment
+
+Obsolete syntax allows to use standalone comments between mailboxes
+that can not be addressed to any mailbox. In such situations a comment
+returned as an entry of this type. Comment itself is under value.
+
+=item unknown
+
+Returned if parser met something that shouldn't be there. Parser
+tries to recover by jumping over to next comma (or semicolon if inside
+group) that is out quoted string or comment, so "foo, bar, baz" string
+results in three unknown entries. Jumping over comments and quoted strings
+means that parser is very sensitive to unbalanced quotes and parens,
+but it's on purpose.
+
+=back
+
+It can be controlled which elements are skipped, for example:
+
+ Email::Address::List->parse($line, skip_unknown => 1, ...);
+
+=over 4
+
+=item skip_comments
+
+Skips comments between mailboxes. Comments inside and next to a mailbox
+are not skipped, but returned as part of mailbox entry.
+
+=item skip_not_ascii
+
+Skips mailboxes where address part has not ASCII characters.
+
+=item skip_groups
+
+Skips group starts and end elements, however emails within groups are
+still returned.
+
+=item skip_unknown
+
+Skip anything that is not recognizable. It still tries to recover as
+described earlier.
+
+=back
+
=cut
# mailbox = name-addr / addr-spec
@@ -81,7 +175,7 @@ for (1 .. $COMMENT_NEST_LEVEL) {
$RE{'cfws'} = qr/$RE{'comment'}|\s+/;
$RE{'qcontent'} = qr/$RE{'qtext'}|$RE{'quoted_pair'}/;
-$RE{'quoted_string'} = qr/$RE{'cfws'}*"$RE{'qcontent'}+"$RE{'cfws'}*/;
+$RE{'quoted-string'} = qr/$RE{'cfws'}*"$RE{'qcontent'}+"$RE{'cfws'}*/;
$RE{'atom'} = qr/$RE{'cfws'}*$RE{'atext'}+$RE{'cfws'}*/;
@@ -91,12 +185,11 @@ $RE{'display-name'} = $RE{'phrase'};
$RE{'dot_atom_text'} = qr/$RE{'atext'}+(?:\.$RE{'atext'}+)*/;
$RE{'dot_atom'} = qr/$RE{'cfws'}*$RE{'dot_atom_text'}$RE{'cfws'}*/;
-$RE{'local-part'} = qr/$RE{'dot_atom'}|$RE{'quoted_string'}/;
+$RE{'local-part'} = qr/$RE{'dot_atom'}|$RE{'quoted-string'}/;
$RE{'dcontent'} = qr/$RE{'dtext'}|$RE{'quoted_pair'}/;
$RE{'domain_literal'} = qr/$RE{'cfws'}*\[(?:\s*$RE{'dcontent'})*\s*\]$RE{'cfws'}*/;
$RE{'domain'} = qr/$RE{'dot_atom'}|$RE{'domain_literal'}/;
-$RE{'obs-domain'} = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
$RE{'addr-spec'} = qr/$RE{'local-part'}\@$RE{'domain'}/;
$RE{'angle-addr'} = qr/$RE{'cfws'}* < $RE{'addr-spec'} > $RE{'cfws'}*/x;
@@ -145,7 +238,11 @@ sub parse {
# in obs- case we have number of optional comments/spaces/
# address-list = (address *("," address)) / obs-addr-list
# obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]))
- $line =~ s/^(?:$RE{'cfws'}?,)+//o;
+ if ( $line =~ s/^(?:($RE{'cfws'})?,)//o ) {
+ push @res, {type => 'comment', value => $1 }
+ if $1 && !$args{'skip_comments'} && $1 =~ /($RE{'comment'})/;
+ next;
+ }
$line =~ s/^\s+//o;
# now it's only comma separated address where address is:
@@ -155,10 +252,13 @@ sub parse {
# group = display-name ":" [group-list] ";" [CFWS]
# group-list = mailbox-list / CFWS / obs-group-list
# obs-group-list = 1*([CFWS] ",") [CFWS])
- if ( !$in_group && $line =~ s/^$RE{'display-name'}://o ) {
+ if ( !$in_group && $line =~ s/^($RE{'display-name'})://o ) {
+ push @res, {type => 'group start', value => $1 }
+ unless $args{'skip_groups'};
$in_group = 1; next;
}
if ( $in_group && $line =~ s/^;// ) {
+ push @res, {type => 'group end'} unless $args{'skip_groups'};
$in_group = 0; next;
}
@@ -168,33 +268,43 @@ sub parse {
# so address-list is now comma separated list of mailboxes:
# address-list = (mailbox *("," mailbox))
- if ( $line =~ s/^($CRE{'mailbox'})//o ) {
- my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
- $1,$2,$3,$4,$5,$6,$7
- );
- push @res, Email::Address->new(
- $phrase, "$user\@$host", join(' ', grep defined, @comments),
- $original,
- );
- next;
- }
- elsif ( $line =~ s/^($CRE{'obs-mailbox'})//o ) {
+ my $obsolete = 0;
+ if ( $line =~ s/^($CRE{'mailbox'})($RE{cfws}*)(?=,|;|$)//o
+ || ($line =~ s/^($CRE{'obs-mailbox'})($RE{cfws}*)(?=,|;|$)//o and $obsolete = 1)
+ ) {
my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
- $1,$2,$3,$4,$5,$6,$7
- );
- push @res, Email::Address->new(
- $phrase, "$user\@$host", join(' ', grep defined, @comments),
- $original,
+ $1,$2,$3,$4,$5,$6,$7,$8,$9
);
+ my $not_ascii = "$user\@$host" =~ /\P{ASCII}/? 1 : 0;
+ next if $not_ascii && $args{skip_not_ascii};
+
+ push @res, {
+ type => 'mailbox',
+ value => Email::Address->new(
+ $phrase, "$user\@$host", join(' ', @comments),
+ $original,
+ ),
+ obsolete => $obsolete,
+ not_ascii => $not_ascii,
+ };
next;
}
# if we got here then something unknown on our way
# try to recorver
- if ( $line =~ s/^(.+?)\s*(?=(;)|,|$)//o ) {
- push @res, { type => 'unknown', value => $1 };
- if ($2) { $in_group = 1 }
+ if ($in_group) {
+ if ( $line =~ s/^([^;,"\)]*(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^;,"\)]*)*)(?=;|,)//o ) {
+ push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
+ next;
+ }
+ } else {
+ if ( $line =~ s/^([^,"\)]*(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^,"\)]*)*)(?=,)//o ) {
+ push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
+ next;
+ }
}
+ push @res, { type => 'unknown', value => $line } unless $args{'skip_unknown'};
+ last;
}
return @res;
}
@@ -217,8 +327,8 @@ sub _process_mailbox {
my @comments;
foreach ( grep defined, splice @rest ) {
- s{ ($RE{'quoted_string'}) | ($RE{comment}) }
- { $1? $1 : do { push @comments, $2; $comments[-1]=~/^\s|\s$/? ' ' : '' } }xgoe;
+ s{ ($RE{'quoted-string'}) | ($RE{comment}) }
+ { $1? $1 : do { push @comments, $2; $comments[-1] =~ /^\s|\s$/? ' ' : '' } }xgoe;
s/^\s+//; s/\s+$//;
next unless length;
@@ -227,13 +337,13 @@ sub _process_mailbox {
my ($host, $user, $phrase) = reverse @rest;
# deal with spaces out of quoted strings
- s{ ($RE{'quoted_string'}) | \s+ }{ $1? $1 : ' ' }xgoe
+ s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : ' ' }xgoe
foreach grep defined, $phrase;
- s{ ($RE{'quoted_string'}) | \s+ }{ $1? $1 : '' }xgoe
+ s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : '' }xgoe
foreach $user, $host;
# dequote
- s{ ($RE{'quoted_string'}) }{ $dequote->($1) }xgoe
+ s{ ($RE{'quoted-string'}) }{ $dequote->($1) }xgoe
foreach grep defined, $phrase, $user;
$user = $quote->($user) unless $user =~ /^$RE{'dot_atom'}$/;
diff --git a/t/basics.t b/t/basics.t
index fe27ff4..0cdad23 100644
--- a/t/basics.t
+++ b/t/basics.t
@@ -2,10 +2,9 @@ use strict; use warnings;
use Test::More tests => 3;
use_ok 'Email::Address::List';
-use Scalar::Util qw(blessed);
{
my @addresses = Email::Address::List->parse(q{ruz at bestpractical.com});
is scalar @addresses, 1;
- is $addresses[0]->format, q{ruz at bestpractical.com};
+ is $addresses[0]{'value'}->format, q{ruz at bestpractical.com};
}
diff --git a/t/generate.pl b/t/generate.pl
index 5ce4fa4..dfbc41c 100644
--- a/t/generate.pl
+++ b/t/generate.pl
@@ -1,7 +1,6 @@
use strict; use warnings; use autodie; use lib 'lib/';
use Email::Address::List;
-my $file = 't/data/RFC5233.single.valid.txt';
foreach my $file (qw(t/data/RFC5233.single.valid.txt t/data/RFC5233.single.obs.txt)) {
process_file($file);
}
diff --git a/t/invalid.t b/t/invalid.t
new file mode 100644
index 0000000..4a00639
--- /dev/null
+++ b/t/invalid.t
@@ -0,0 +1,28 @@
+use strict; use warnings;
+use Test::More tests => 4;
+use_ok 'Email::Address::List';
+
+run_test('root', [{type => 'unknown', value => 'root'}]);
+run_test(
+ 'boo at boo, root, foo at foo',
+ [
+ {type => 'mailbox', value => 'boo at boo', obsolete => 0, not_ascii => 0},
+ {type => 'unknown', value => 'root'},
+ {type => 'mailbox', value => 'foo at foo', obsolete => 0, not_ascii => 0},
+ ],
+);
+run_test(
+ '"Doe, John" foo at foo, root',
+ [
+ {type => 'unknown', value => '"Doe, John" foo at foo' },
+ {type => 'unknown', value => 'root'},
+ ],
+);
+
+sub run_test {
+ my $line = shift;
+ my @list = Email::Address::List->parse($line);
+ $_->{'value'} .= '' foreach grep defined $_->{'value'}, @list;
+ is_deeply( \@list, shift );
+}
+
diff --git a/t/random.combinations.t b/t/random.combinations.t
new file mode 100644
index 0000000..1b1b2a4
--- /dev/null
+++ b/t/random.combinations.t
@@ -0,0 +1,60 @@
+use strict;
+use warnings;
+
+use Test::More;
+use JSON ();
+
+use_ok('Email::Address::List');
+
+my @data;
+foreach my $file (qw(t/data/RFC5233.single.valid.json t/data/RFC5233.single.obs.json)) {
+ my $obsolete = $file =~ /\bobs\b/? 1 : 0;
+
+ open my $fh, '<', $file;
+ push @data, @{ JSON->new->decode( do { local $/; <$fh> } ) };
+ close $fh;
+}
+
+diag "srand is ". (my $seed = int rand( 2**16-1 ));
+srand($seed);
+
+for (1..100) {
+ my @list;
+ push @list, $data[ rand @data ] for 1..3;
+
+ my $line = join ', ', map $_->{'mailbox'}, @list;
+ note $line;
+
+ my @res = Email::Address::List->parse( $line );
+ is scalar @res, scalar @list;
+
+ for (my $i = 0; $i < @list; $i++) {
+ my $test = $list[$i];
+ my $v = $res[$i]{'value'};
+ is $v->phrase, $test->{'display-name'}, 'correct value';
+ is $v->address, $test->{'address'}, 'correct value';
+ is $v->comment, join( ' ', @{$test->{'comments'}} ), 'correct value';
+ }
+}
+
+for (1..100) {
+ my @list;
+ push @list, $data[ rand @data ] for 1..3;
+
+ my $line = join ",\n ,", '', (map $_->{'mailbox'}, @list), '';
+ note $line;
+
+ my @res = Email::Address::List->parse( $line );
+ is scalar @res, scalar @list;
+
+ for (my $i = 0; $i < @list; $i++) {
+ my $test = $list[$i];
+ my $v = $res[$i]{'value'};
+ is $v->phrase, $test->{'display-name'}, 'correct value';
+ is $v->address, $test->{'address'}, 'correct value';
+ is $v->comment, join( ' ', @{$test->{'comments'}} ), 'correct value';
+ }
+}
+
+done_testing;
+
diff --git a/t/single.suit.t b/t/single.suit.t
new file mode 100644
index 0000000..f00a9d9
--- /dev/null
+++ b/t/single.suit.t
@@ -0,0 +1,28 @@
+use strict;
+use warnings;
+
+use Test::More;
+use JSON ();
+
+use_ok('Email::Address::List');
+
+foreach my $file (qw(t/data/RFC5233.single.valid.json t/data/RFC5233.single.obs.json)) {
+ my $obsolete = $file =~ /\bobs\b/? 1 : 0;
+
+ open my $fh, '<', $file;
+ my $tests = JSON->new->decode( do { local $/; <$fh> } );
+ close $fh;
+
+ foreach my $test ( @$tests ) {
+ note $test->{'description'};
+ my @list = Email::Address::List->parse( $test->{'mailbox'} );
+ is scalar @list, 1, "one entry in result set" or do { use Data::Dumper; diag Dumper \@list };
+ is $list[0]{'type'}, 'mailbox', 'one mailbox';
+ my $v = $list[0]{'value'};
+ is $v->phrase, $test->{'display-name'}, 'correct value';
+ is $v->address, $test->{'address'}, 'correct value';
+ is $v->comment, join( ' ', @{$test->{'comments'}} ), 'correct value';
+ }
+}
+
+done_testing();
-----------------------------------------------------------------------
More information about the Bps-public-commit
mailing list