Advertisement
Guest User

Untitled

a guest
Feb 22nd, 2019
541
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.64 KB | None | 0 0
  1. use strict;
  2. use warnings;
  3. use utf8;
  4. use 5.012;
  5.  
  # Split an e-mail address into its account (local part) and its domain.
  #
  # The split is made at the LAST "@" in the string, so an account that itself
  # contains "@" keeps it.
  #
  # Parameters:
  #   $email - the address to split (may be undef).
  # Returns:
  #   A two-element list (account, domain).
  #   - No "@" present: ($email, '') -- the whole string is the account.
  #   - undef input:    ('', '').
  sub parse_email {
      my $email = shift;
      return ('', '') unless defined $email;

      my $pos = rindex($email, '@');

      # rindex yields -1 when "@" is absent. Feeding that to substr would chop
      # the last character off the account and return the whole string as the
      # domain, so handle it explicitly.
      return ($email, '') if $pos < 0;

      return (substr($email, 0, $pos), substr($email, $pos + 1));
  }
  12.  
  # Tell whether a domain is one of the Gmail domains listed below.
  #
  # Parameters:
  #   $domain - domain part of an address (may be undef).
  # Returns:
  #   1 when the domain is listed, undef otherwise.
  sub is_gmail {
      my $domain = shift;

      # Built once and kept across calls.
      state $gmail_domain = {
          'gmail.com'      => 1,
          'googlemail.com' => 1,
      };

      # Domain names are case-insensitive, so compare in lower case; an undef
      # domain is looked up as '' to avoid an "uninitialized" warning.
      return $gmail_domain->{ lc($domain // '') };
  }
  21.  
  # Reduce a Gmail account name to a canonical form so that aliases of the
  # same mailbox compare equal.
  #
  # Parameters:
  #   $account - local part of a Gmail address (may be undef).
  # Returns:
  #   The lower-cased leading run of [a-z0-9.] with every "." removed, or ""
  #   when the account does not start with such a character (or is undef).
  sub aggregated_gmail_account {
      my $account = shift;
      return "" unless defined $account;

      # Gmail does not distinguish upper and lower case.
      # Gmail account names consist of [a-z0-9.] (6-30 characters; the length
      # is not checked here). Everything from the first other character on,
      # e.g. a "+tag" suffix, is discarded.
      my ($base) = (lc($account) =~ /^([a-z0-9\.]+)/);

      # Test for "matched at all", not for truth: the string "0" is false in
      # Perl but is still a successful match.
      return "" unless defined $base;

      # Gmail treats addresses with and without "." as the same mailbox.
      $base =~ tr/.//d;
      return $base;
  }
  37.  
  # Tell whether a domain is one of the Yahoo! Mail domains listed below.
  #
  # Parameters:
  #   $domain - domain part of an address (may be undef).
  # Returns:
  #   1 when the domain is listed, undef otherwise.
  sub is_yahoomail {
      my $domain = shift;

      # Built once and kept across calls.
      state $yahoo_domain = {
          'yahoo.co.jp' => 1,
      };

      # Domain names are case-insensitive, so compare in lower case; an undef
      # domain is looked up as '' to avoid an "uninitialized" warning.
      return $yahoo_domain->{ lc($domain // '') };
  }
  45.  
  # Reduce a Yahoo! Mail account name to its base name.
  #
  # Parameters:
  #   $account - local part of a Yahoo! address (may be undef).
  # Returns:
  #   The lower-cased leading run of [a-z0-9_], or "" when the account does
  #   not start with such a character (or is undef).
  sub aggregated_yahoo_account {
      my $account = shift;
      return "" unless defined $account;

      # Yahoo lets users create "safety addresses" of the form
      # <basename>-<freeword>.
      # The main account cannot be matched to a safety-address basename, but
      # two or more safety addresses sharing one basename can be caught.
      my ($base) = (lc($account) =~ /^([a-z0-9_]+)/);

      # Test for "matched at all", not for truth: the string "0" is false in
      # Perl but is still a successful match.
      return "" unless defined $base;

      return $base;
  }
  57.  
  # Map an e-mail address to the canonical address used for de-duplication.
  #
  # Gmail and Yahoo! addresses are rewritten to "<normalised account>@gmail.com"
  # and "<normalised account>@yahoo.co.jp" respectively; every other address is
  # returned unchanged.
  #
  # Parameters:
  #   $email - the address to canonicalise.
  # Returns:
  #   The canonical address, or $email itself when the domain is not handled
  #   or the account part could not be normalised.
  sub aggregation {
      my $email = shift;

      my ($account, $domain) = parse_email($email);

      if (is_gmail($domain)) {
          my $base = aggregated_gmail_account($account);

          # An empty result means normalisation failed. Returning a bare
          # "@gmail.com" would merge every such address into one bogus key, so
          # fall back to the address as given.
          return length($base) ? $base . '@gmail.com' : $email;
      }

      if (is_yahoomail($domain)) {
          my $base = aggregated_yahoo_account($account);

          # Same fallback as above for accounts that cannot be normalised.
          return length($base) ? $base . '@yahoo.co.jp' : $email;
      }

      return $email;
  }
  73.  
  # ---------------------------------------------------------------------------
  # Main: read a tab-separated "person<TAB>email" file named by the first
  # command-line argument, canonicalise every Gmail / Yahoo! address, and print
  # "<canonical address><TAB><count>" for each canonical address that occurs
  # more than once. Progress and failures are reported on STDERR.
  # ---------------------------------------------------------------------------
  my $count = {};    # canonical address => number of occurrences
  my $lines = 0;     # number of input lines read so far

  my $path = $ARGV[0];
  die "usage: $0 <tsv-file>\n" unless defined $path;

  open my $fh, "<", $path or die "cannot open $path: $!\n";
  while (my $line = <$fh>) {
      chomp $line;
      ++$lines;
      print STDERR "done $lines\n" if $lines % 100000 == 1;

      my ($person, $email) = split "\t", $line;

      # Blank lines and lines without a tab carry no address to inspect.
      next unless defined $person && defined $email;
      next if $person eq 'person';    # header

      my ($account, $domain) = parse_email($email);

      my $aggregated;
      if (is_gmail($domain)) {
          my $base = aggregated_gmail_account($account);
          if (length $base) {
              $aggregated = $base . '@gmail.com';
          }
          else {
              print STDERR "failed. $person, $email\n";
          }
      }
      elsif (is_yahoomail($domain)) {
          my $base = aggregated_yahoo_account($account);
          if (length $base) {
              $aggregated = $base . '@yahoo.co.jp';
          }
          else {
              print STDERR "failed. $person, $email\n";
          }
      }

      # Addresses of other providers, and ones that failed above, are not counted.
      next unless $aggregated;

      $count->{$aggregated}++;
  }
  close $fh;

  print STDERR "end read\n";

  # Report only canonical addresses seen more than once; sorted so that the
  # output does not depend on hash ordering.
  for my $email (sort keys %$count) {
      my $c = $count->{$email};
      next if $c <= 1;
      printf "%s\t%d\n", $email, $c;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement