Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/env perl
- # Post-processing for http://data.stackexchange.com/stackoverflow/query/72948/tags-with-similar-names
- # Usage: $0 <QueryResults.csv >hyphens.md 3>plurals.md
- use warnings;
- use strict;
- use IO::Handle;
- use List::Util qw(max min);
- open PLURALS, ">&3" or die $!;
- autoflush STDOUT 1; binmode(STDOUT, ":utf8");
- autoflush PLURALS 1; binmode(PLURALS, ":utf8");
- my %names = ();
- while (<STDIN>) {
- next if $. == 1 && !/^"/;
- unless (/^"([0-9]+)", # count 1
- (?:"[0-9]*","[0-9]*",)? # wiki 1
- "[^\"]*\|([^\"]+)", # tag 1
- "([0-9]+)", # count 2
- (?:"[0-9]*","[0-9]*",)? # wiki 2
- "[^\"]*\|([^\"]+)" # tag 2
- \r?$/x) {
- print STDERR "Ignoring line $.: $_";
- next;
- }
- my ($count1, $tag1, $count2, $tag2) = ($1, $2, $3, $4);
- my $key = $tag1; $key =~ s/s?-|s$//g;
- $names{$key}{$tag1} = $count1;
- $names{$key}{$tag2} = $count2;
- }
- foreach my $key (map {$_->[1]} sort {$b->[0] <=> $a->[0]}
- map {[max(values %{$names{$_}}), $_]} keys %names) {
- my $line = join("\xa0\xa0\xa0\xa0",
- sort {"$a$b" =~ /\(([0-9]+)\).*\(([0-9]+)\)/;
- $2 <=> $1}
- map {"[tag:$_] ($names{$key}{$_})"}
- keys %{$names{$key}}) . " \n";
- my %subbuckets;
- foreach my $name (keys %{$names{$key}}) {
- $name =~ s/-//g;
- ++$subbuckets{$name};
- }
- if (keys(%subbuckets) == 1) {
- print $line;
- } else {
- print PLURALS $line;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement