Advertisement
stackexchange-gilles

sede-query-72948-to-markdown

Aug 18th, 2012
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 1.57 KB | None | 0 0
  #! /usr/bin/env perl
  # Post-processing for http://data.stackexchange.com/stackoverflow/query/72948/tags-with-similar-names
  # Usage: $0 <QueryResults.csv >hyphens.md 3>plurals.md
  #
  # Reads the query's CSV export on standard input.  Every data row names two
  # tags together with their usage counts.  Tags are grouped into buckets by a
  # normalised form of the first tag of each row, and each bucket becomes one
  # Markdown line listing its tags, most used first.
  #
  # Buckets whose tag names are identical once hyphens are removed are written
  # to standard output; all other buckets are written to file descriptor 3.
  use warnings;
  use strict;
  use IO::Handle;
  use List::Util qw(max min);

  # Separator between the tags on an output line: four no-break spaces.
  my $TAG_SEPARATOR = "\xa0\xa0\xa0\xa0";

  # One data row.  Captures: count 1, tag 1, count 2, tag 2.  Each tag column
  # holds some text, a "|", and then the tag name; only the part after the
  # last "|" is captured.
  my $ROW_RE = qr/^"([0-9]+)",                # count 1
                  (?:"[0-9]*","[0-9]*",)?    # wiki 1 (optional column pair)
                  "[^\"]*\|([^\"]+)",        # tag 1
                  "([0-9]+)",                # count 2
                  (?:"[0-9]*","[0-9]*",)?    # wiki 2 (optional column pair)
                  "[^\"]*\|([^\"]+)"         # tag 2
                  \r?$/x;

  # File descriptor 3 has to be supplied by the caller (3>plurals.md).
  open(my $plurals_fh, '>&', 3)
      or die "Cannot write to file descriptor 3 (missing 3>plurals.md?): $!\n";

  # Decode the input and encode every output stream, so that non-ASCII tag
  # names pass through unchanged instead of being encoded a second time.
  binmode(STDIN, ':encoding(UTF-8)') or die "Cannot set encoding on STDIN: $!\n";
  for my $out_fh (\*STDOUT, \*STDERR, $plurals_fh) {
      binmode($out_fh, ':encoding(UTF-8)')
          or die "Cannot set encoding on an output stream: $!\n";
  }
  STDOUT->autoflush(1);
  $plurals_fh->autoflush(1);

  # %names maps bucket key => { tag name => usage count }.
  my %names;
  while (my $row = <STDIN>) {
      # The first line is a header unless it already looks like a data row.
      next if $. == 1 && $row !~ /^"/;

      my @fields = $row =~ $ROW_RE;
      unless (@fields) {
          print STDERR "Ignoring line $.: $row";
          next;
      }
      my ($count1, $tag1, $count2, $tag2) = @fields;

      # The bucket key is the first tag without hyphens (including an "s"
      # directly before a hyphen) and without a trailing "s".
      (my $key = $tag1) =~ s/s?-|s$//g;
      $names{$key}{$tag1} = $count1;
      $names{$key}{$tag2} = $count2;
  }

  # Buckets ordered by their most used tag, highest count first.  Ties are
  # broken alphabetically so that the output does not depend on hash order.
  my @bucket_keys =
      map  { $_->[1] }
      sort { $b->[0] <=> $a->[0] or $a->[1] cmp $b->[1] }
      map  { [max(values %{ $names{$_} }), $_] }
      keys %names;

  for my $key (@bucket_keys) {
      my $count_of = $names{$key};

      # Tags of this bucket, most used first, ties broken alphabetically.
      my @tags = sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
                 keys %$count_of;

      # The two trailing spaces are a Markdown hard line break.
      my $line = join($TAG_SEPARATOR,
                      map { "[tag:$_] ($count_of->{$_})" } @tags) . "  \n";

      # Collect the distinct names that remain once hyphens are ignored.
      my %dehyphenated;
      for my $tag (@tags) {
          (my $plain = $tag) =~ s/-//g;
          $dehyphenated{$plain} = 1;
      }

      # One distinct name: the tags differ only in hyphenation.
      my $target_fh = keys(%dehyphenated) == 1 ? \*STDOUT : $plurals_fh;
      print {$target_fh} $line or die "Cannot write output: $!\n";
  }

  # Buffered write errors may only become visible when the handle is closed.
  close($plurals_fh) or die "Cannot close file descriptor 3: $!\n";
  close(STDOUT)      or die "Cannot close standard output: $!\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement