Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl -w
- use strict;
- use Parse::MediaWikiDump;
- use Text::MediawikiFormat as => 'wiki2html';
- use Time::Piece;
- use Unicode::Normalize;
- no warnings 'utf8';
# Fetch the titles of every page in the "Broadway musicals" category.
my @musicals_tmp = category_members('Broadway musicals');

# Build two lookups from the letters-and-spaces-only form of each title:
#   %musicals      : lowercased title          => 1
#   %hash_musicals : word2num() of the title   => lowercased title
# When two titles share a word2num() value, the later one wins.
my %musicals;
my %hash_musicals;
for my $title (@musicals_tmp) {
    # Delete everything except ASCII letters and spaces. $title aliases the
    # array element, so @musicals_tmp itself is cleaned in place.
    $title =~ tr/A-Za-z //cd;

    my $lowered = lc $title;
    $musicals{$lowered} = 1;
    $hash_musicals{ word2num($title) } = $lowered;
}
# Read in famous names.
#
# Each line of FamousNames.txt is either "Name" or "Name<TAB>Score"; a line
# without a score is treated as score 100. Only entries scoring at least 90
# and consisting of exactly two space-separated words (first and last name)
# are kept. Everything is lowercased.
#
# Results:
#   @names         : "first last" strings, in file order
#   %last_to_first : last name => array ref of first names seen with it
my @names;
my %last_to_first;
my $names_file = 'FamousNames.txt';

# Three-argument open with a lexical handle: the mode can never be taken
# from the file name, and the handle is not a package-global bareword.
open(my $names_fh, '<', $names_file)
    or die "Cannot open $names_file: $!";

while (my $line = <$names_fh>) {
    chomp $line;
    $line =~ s/\r\z//;    # tolerate CRLF line endings

    my $name  = lc $line;
    my $score = 100;
    if ($name =~ /^(.*)\t(.*)$/) {
        ($name, $score) = ($1, $2);
    }
    next unless $score >= 90;

    # Accept only "first last"; anything else leaves $firstname undefined.
    my ($firstname, $lastname) = $name =~ /^([^ ]+) ([^ ]+)$/;
    next unless $firstname;

    push(@names, "$firstname $lastname");
    push(@{ $last_to_first{$lastname} }, $firstname);
}
close($names_fh);
# Keep only the people whose first name is itself the title of a musical.
my @musical_names = grep { $musicals{ (split(/ /, $_))[0] } } @names;

# For each of those people, look at everyone else sharing their last name and
# report the ones whose first name has the same word2num() value as a musical
# title without being that title verbatim (i.e. an anagram of it).
for my $person (@musical_names) {
    my ($first, $last) = split(/ /, $person);

    # All first names recorded for this last name.
    my $same_surname = $last_to_first{$last};

    for my $candidate (@$same_surname) {
        next if $candidate eq $first;

        # Look the signature up once; it is needed for both tests and the output.
        my $anagram_of = $hash_musicals{ word2num($candidate) };
        next unless $anagram_of;
        next if $anagram_of eq $candidate;

        print "$first and $candidate $last => " . $anagram_of . "\n";
    }
}
- ######
- # SUBS
- ######
# Return the titles of all main-namespace pages in an English Wikipedia
# category, fetched through the MediaWiki API.
#
# Parameters:
#   $category - category name without the "Category:" prefix; spaces are
#               converted to underscores.
# Returns:
#   A list of page titles with diacritics removed (via remove_diacritics) and
#   any trailing parenthesised qualifier, e.g. " (musical)", stripped.
# Dies:
#   If a request fails or the API reports an error.
sub category_members {
    use JSON;
    use LWP::Simple;

    my ($category) = @_;
    my @category_members;
    $category =~ s/ /_/g;

    # Percent-encode a query-string value (as UTF-8 bytes) so characters such
    # as "|" or "&" inside continuation tokens cannot corrupt the URL.
    my $escape = sub {
        my ($text) = @_;
        utf8::encode($text);
        $text =~ s/([^A-Za-z0-9_.~-])/sprintf('%%%02X', ord($1))/ge;
        return $text;
    };

    my $base_url = "https://en.wikipedia.org/w/api.php?action=query&list=categorymembers"
                 . "&cmtitle=Category:" . $escape->($category)
                 . "&cmlimit=500&format=json&cmnamespace=0&cmtype=page";

    # Continuation parameters handed back by the previous response; empty on
    # the first request and again once the last batch has been read.
    my %continue;
    do {
        my $url = $base_url;
        for my $param (sort keys %continue) {
            $url .= "&$param=" . $escape->($continue{$param});
        }

        my $json = get($url);
        die "Failed to fetch $url\n" unless defined $json;

        my $href = decode_json($json);
        if (ref($href->{'error'}) eq 'HASH') {
            my $info = $href->{'error'}->{'info'};
            $info = 'unknown error' unless defined $info;
            die "Wikipedia API error: $info\n";
        }

        my $query   = $href->{'query'} || {};
        my $members = $query->{'categorymembers'} || [];
        foreach my $member (@$members) {
            my $title = $member->{'title'};
            $title = remove_diacritics($title);
            if ($title =~ /^(.*) \([^\)]+\)$/) { $title = $1; }
            push(@category_members, $title);
        }

        # Decide whether to continue. Current MediaWiki returns a top-level
        # "continue" object whose members must all be echoed back; the legacy
        # "query-continue" format is still honoured as a fallback.
        %continue = ();
        my $legacy = $href->{'query-continue'};
        if (ref($href->{'continue'}) eq 'HASH') {
            %continue = %{ $href->{'continue'} };
        }
        elsif (ref($legacy) eq 'HASH'
            && ref($legacy->{'categorymembers'}) eq 'HASH'
            && $legacy->{'categorymembers'}->{'cmcontinue'})
        {
            %continue = (cmcontinue => $legacy->{'categorymembers'}->{'cmcontinue'});
        }

        sleep(1) if %continue;    # Be nice to Wikipedia
    } while (%continue);

    return @category_members;
}
- # Remove diacritics (from a title)
# Strip diacritics from a string (used on page titles).
#
# The text is put into Unicode canonical decomposition (NFD), which separates
# base characters from their combining marks, and then every character with
# the Mark property is deleted.
sub remove_diacritics {
    my ($text) = @_;
    (my $stripped = NFD($text)) =~ s/\p{M}//g;
    return $stripped;
}
# Turn a word into an order-independent signature: two words get the same
# signature exactly when they are anagrams of each other (ignoring case and
# any character that is not an ASCII letter).
#
# Each letter A-Z is mapped to a distinct prime and the primes are multiplied
# together. The product is computed with Math::BigInt because long inputs
# overflow native integers; as a floating-point number the product would lose
# digits, letting different letter sets share a value and letting true
# anagrams differ through rounding.
#
# Parameters:
#   the word or phrase to hash.
# Returns:
#   The product as a decimal string (e.g. "6" for "ab"); "1" when the input
#   contains no letters.
sub word2num {
    use Math::BigInt;

    my ($word) = @_;

    # Convert the word to ALL CAPS and remove any non-alphas
    my $letters = uc $word;
    $letters =~ s/[^A-Z]//g;

    # Kept inside the sub: the script calls word2num before execution would
    # reach a file-level initialisation placed next to this definition.
    my %prime_for = qw(A 2 B 3 C 5 D 7 E 11 F 13 G 17 H 19 I 23 J 29 K 31 L 37 M 41 N 43 O 47 P 53 Q 59 R 61 S 67 T 71 U 73 V 79 W 83 X 89 Y 97 Z 101);

    my $product = Math::BigInt->new(1);
    foreach my $letter (split(//, $letters)) {
        $product->bmul($prime_for{$letter});
    }
    return $product->bstr();
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement