Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- use strict;
- use warnings;
- use utf8;
- use 5.012;
# Split an e-mail address into its local part and its domain.
#
# The split happens at the LAST "@" in the string, so a local part that
# itself contains "@" stays intact.
#
# Params:
#   $email - the address string to split (may be undef).
# Returns:
#   A two-element list (local_part, domain).  An undef input yields
#   ('', '').  An input without any "@" yields ($email, ''), i.e. the whole
#   string is the local part and the domain is empty.
sub parse_email {
    my ($email) = @_;
    return ('', '') unless defined $email;

    my $pos = rindex($email, '@');

    # rindex returns -1 when there is no "@".  Passing that on to substr
    # would chop the last character off the local part and report the
    # entire string as the domain, so handle it explicitly.
    return ($email, '') if $pos < 0;

    return (substr($email, 0, $pos), substr($email, $pos + 1));
}
# Tell whether a mail domain is one of the Gmail domains.
#
# Params:
#   $domain - the domain part of an address (may be undef).
# Returns:
#   1 when the domain is 'gmail.com' or 'googlemail.com' (compared
#   case-insensitively), undef otherwise.
sub is_gmail {
    my ($domain) = @_;

    # Built once and kept across calls.
    state $gmail_domain = {
        'gmail.com'      => 1,
        'googlemail.com' => 1,
    };

    # Domain names are case-insensitive, so fold before the lookup; an
    # undef domain is looked up as '' and therefore never matches.
    return $gmail_domain->{ lc($domain // '') };
}
# Reduce a Gmail local part to a canonical form so that different
# spellings of the same mailbox compare equal.
#
# Params:
#   $account - the local part of a Gmail address (may be undef).
# Returns:
#   The lower-cased leading run of [a-z0-9.] with every "." removed, or ''
#   when $account is undef or does not start with such a character.
sub aggregated_gmail_account {
    my ($account) = @_;
    return '' unless defined $account;

    # Gmail does not distinguish upper and lower case.
    # The author's stated rule for a Gmail name is [a-z0-9.]{6,30}; anything
    # after that leading run ("+tag" and the like) is discarded.  The length
    # limits are not enforced here.
    my ($base) = lc($account) =~ /\A([a-z0-9.]+)/;

    # Test for undef rather than truth: a captured "0" is a genuine match
    # but evaluates to false in boolean context.
    return '' unless defined $base;

    # Gmail treats addresses with and without "." as the same mailbox.
    $base =~ s/\.//g;
    return $base;
}
# Tell whether a mail domain is the Yahoo! JAPAN mail domain.
#
# Params:
#   $domain - the domain part of an address (may be undef).
# Returns:
#   1 when the domain is 'yahoo.co.jp' (compared case-insensitively),
#   undef otherwise.
sub is_yahoomail {
    my ($domain) = @_;

    # Built once and kept across calls.
    state $yahoo_domain = {
        'yahoo.co.jp' => 1,
    };

    # Domain names are case-insensitive, so fold before the lookup; an
    # undef domain is looked up as '' and therefore never matches.
    return $yahoo_domain->{ lc($domain // '') };
}
# Reduce a yahoo.co.jp local part to its base name.
#
# Params:
#   $account - the local part of a yahoo.co.jp address (may be undef).
# Returns:
#   The lower-cased leading run of [a-z0-9_], or '' when $account is undef
#   or does not start with such a character.
sub aggregated_yahoo_account {
    my ($account) = @_;
    return '' unless defined $account;

    # Yahoo lets a user configure "safety addresses" of the form
    # <basename>-<freeword>.  Whether a main account and a safety-address
    # basename belong to the same person cannot be determined, but two or
    # more addresses sharing one basename can still be caught by keeping
    # only the part before the "-".
    my ($base) = lc($account) =~ /\A([a-z0-9_]+)/;

    # Test for undef rather than truth: a captured "0" is a genuine match
    # but evaluates to false in boolean context.
    return '' unless defined $base;

    return $base;
}
# Map an e-mail address to its canonical ("aggregated") form.
#
# Params:
#   $email - the full address.
# Returns:
#   For a Gmail domain: the normalised local part followed by '@gmail.com'.
#   For a Yahoo domain: the normalised local part followed by '@yahoo.co.jp'.
#   For any other domain: $email unchanged.
#   Note that when the local part normalises to the empty string the result
#   is the bare '@gmail.com' / '@yahoo.co.jp'.
sub aggregation {
    my ($address) = @_;
    my ($local, $host) = parse_email($address);

    return aggregated_gmail_account($local) . '@gmail.com'
        if is_gmail($host);

    return aggregated_yahoo_account($local) . '@yahoo.co.jp'
        if is_yahoomail($host);

    return $address;
}
# Main script.
#
# Reads the tab-separated file named by the first command-line argument.
# Each row is "person<TAB>email"; a row whose first column is the literal
# 'person' is treated as the header and skipped.  Gmail and Yahoo addresses
# are reduced to their canonical form and counted; every canonical address
# seen more than once is printed to STDOUT as "address<TAB>count".
# Progress and failures are reported on STDERR.
my $count = {};
my $lines = 0;

my $path = $ARGV[0];
die "usage: $0 <tsv-file>\n" unless defined $path;

open my $fh, "<", $path or die "cannot open $path: $!\n";
while (my $line = <$fh>) {
    # Strip the line terminator.  CRLF is tolerated so that a trailing "\r"
    # does not end up inside the domain and defeat the domain lookup.
    $line =~ s/\r?\n\z//;
    ++$lines;
    print STDERR "done $lines\n" if $lines % 100000 == 1;

    my ($person, $email) = split /\t/, $line;

    # Blank or single-column rows carry no address to examine.
    next unless defined $person && defined $email;
    next if $person eq 'person';    # header

    my ($account, $domain) = parse_email($email);

    my ($normalized, $canonical_domain);
    if (is_gmail($domain)) {
        $normalized       = aggregated_gmail_account($account);
        $canonical_domain = 'gmail.com';
    }
    elsif (is_yahoomail($domain)) {
        $normalized       = aggregated_yahoo_account($account);
        $canonical_domain = 'yahoo.co.jp';
    }
    else {
        # Only Gmail and Yahoo addresses are aggregated.
        next;
    }

    # An empty result means the local part could not be normalised.
    unless (length $normalized) {
        print STDERR "failed. $person, $email\n";
        next;
    }

    $count->{ $normalized . '@' . $canonical_domain }++;
}
close $fh;
print STDERR "end read\n";

# Sorted so that repeated runs over the same input give identical output.
for my $email (sort keys %$count) {
    my $c = $count->{$email};
    next if $c <= 1;
    printf "%s\t%d\n", $email, $c;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement