# forvo-zip2dsl.py

#!/usr/bin/env python3

import sys
import argparse
import zipfile
import re
from urllib.parse import unquote
from collections import defaultdict

'''
Changelog
  2023-02-20
    - assorted minor code cleanups
  2022-05-12
    - minor fix: escape ~<>{}#@^ everywhere, not just in headwords
  2022-04-17
    - contents_lang is now optional: there is very little reason to set it
      to anything other than English anyway;
    - two new command line options have been introduced:
      '-s <sub_file>' loads a (zip-specific) set of regex>>replacement pairs;
         for examples of how this works, see the comment at the bottom;
      '-p' normalizes headwords ending in a period or a comma:
         'some phrase[.,]' gets merged into 'some phrase' *IF* the latter
         already exists;
    - file lists are now accepted as well as actual .zip files,
      mainly so that I don't have to keep around all those huge .ZIPs

  2022-04-05
    numerous minor fixes: handle URL-encoded sequences, weed out all kinds of
    control characters and other non-printable garbage, trim whitespace
'''


class CustomFormatter(
    argparse.ArgumentDefaultsHelpFormatter,
    argparse.RawDescriptionHelpFormatter,
):
    """Help formatter combining two stock argparse formatters.

    Defaults are appended to argument help texts, while the description
    and the epilog are printed verbatim (no re-wrapping).
    """
# Command-line interface.  With allow_abbrev=False long options have to be
# spelled out in full (no unique-prefix matching).
parser = argparse.ArgumentParser(
    description='DSL file generator for Forvo/LinguaLibre datasets',
    epilog=(
        'usage examples:\n'
        "  %(prog)s -v -p zh.zip 'Forvo Mandarin' Chinese >ForvoMandarin.dsl\n"
        "  %(prog)s -v -p -s ForvoDutch.sub nl.zip 'Forvo Dutch' Dutch"
        ' >ForvoDutch.dsl\n'
    ),
    formatter_class=CustomFormatter,
    allow_abbrev=False,
)

# Positional arguments; only contents_lang may be omitted.
parser.add_argument(
    'input_file',
    help='an actual zip file or a text file listing its contents',
)
parser.add_argument('dict_name', help='dictionary name (#NAME)')
parser.add_argument('index_lang', help='#INDEX_LANGUAGE')
parser.add_argument(
    'contents_lang',
    nargs='?',
    default='English',
    help='#CONTENTS_LANGUAGE',
)

# Options.
parser.add_argument(
    '-v', '--verbose',
    action='store_true',
    help='verbose mode',
)
parser.add_argument(
    '-s', '--sub-file',
    action='append',
    metavar='SUB_FILE',
    help=(
        'load pattern>>replacement pairs from SUB_FILE; you can stack '
        'together as many .sub files as you want: '
        '-s file1.sub -s file2.sub...'
    ),
)
parser.add_argument(
    '-p', '--merge-period',
    action='store_true',
    help=(
        "normalize headwords ending in a period or a comma: "
        "'some phrase[.,]' gets merged into 'some phrase' *IF* the latter "
        "already exists; this does not affect single-word abbreviations "
        "such as etc./ca./ibid."
    ),
)

args = parser.parse_args()


class TextSub:
    """Ordered chain of case-insensitive regex substitutions.

    Each constructor argument is either a ``str`` -- the path of a .sub
    file to load -- or a ``(pattern, replacement)`` pair.  Calling the
    instance runs every substitution over a string, in the order in which
    the substitutions were added, and returns the result.
    """

    def __init__(self, *args):
        # List of (compiled pattern, replacement) tuples, in application order.
        self.subs = []
        for a in args:
            if isinstance(a, str):
                self.load_file(a)
            else:
                pat, repl = a
                self.add_sub(pat, repl, re.IGNORECASE)

    def add_sub(self, pat, repl, flags=0):
        """Compile *pat* with *flags* and append it, paired with *repl*."""
        self.subs.append((re.compile(pat, flags), repl))

    def load_file(self, filename):
        """Append every ``pattern>>replacement`` line found in *filename*.

        Whitespace-only lines and lines starting with '#' are skipped.
        Each remaining line is split at its first '>>'; the pattern is
        compiled with re.IGNORECASE.

        Raises:
            ValueError: a line contains no '>>' separator.
        """
        # .sub files routinely contain non-ASCII characters, so the encoding
        # must not depend on the locale; 'utf-8-sig' additionally drops a
        # leading BOM, which would otherwise hide a '#' on the first line
        # or end up inside the first pattern.
        with open(filename, encoding='utf-8-sig') as fp:
            for line in fp:
                if line.startswith('#') or line.isspace():
                    continue
                pat, sep, repl = line.rstrip('\r\n').partition('>>')
                if sep != '>>':
                    raise ValueError(f'{filename}: malformed regex>>repl '
                                     f'line: {line}')
                self.add_sub(pat, repl, re.IGNORECASE)

    def __call__(self, s):
        """Return *s* after applying all substitutions in order."""
        for pat, repl in self.subs:
            s = pat.sub(repl, s)
        return s

# Headword normalizer built from all -s files, in command-line order;
# None when no -s option was given.
normalizer = TextSub(*args.sub_file) if args.sub_file else None


# Cleanup table: a no-break space becomes a plain space, and the listed
# control / invisible formatting code points are deleted.
del_garbage = str.maketrans(
    '\N{NO-BREAK SPACE}',
    '\N{SPACE}',
    ''.join(map(chr, (
        0x0010, 0x007f, 0x0080, 0x0085, 0x0086, 0x0088,
        0x008a, 0x008d, 0x008f, 0x0090, 0x0092, 0x0093,
        0x200b, 0x200c, 0x200d, 0x200e, 0x200f, 0x202a,
        0x202b, 0x202c, 0x202d, 0x202e, 0x206f, 0xfeff,
    ))),
)
# Headword escaping: each of these characters gets a backslash prefix.
# {} and <> are included because of {{ comment }} and <<ref>>.
esc_headword = str.maketrans({ch: f'\\{ch}' for ch in '\\[](){}<>#@~^'})
# Body escaping is the same set minus the parentheses.
esc_body = {
    code: repl
    for code, repl in esc_headword.items()
    if code not in (ord('('), ord(')'))
}


def get_zip_contents(in_file):
    """Yield archive member names from a zip file or from a text listing.

    Args:
        in_file: path to the input.  A name ending in '.zip' (in any
            letter case) is opened as a zip archive; anything else is read
            as a text file holding one member name per line.

    Yields:
        Member names as strings.  For a text listing the line terminator
        is stripped and empty lines are skipped.
    """
    if in_file.lower().endswith('.zip'):
        with zipfile.ZipFile(in_file) as zip_ref:
            yield from zip_ref.namelist()
    else:
        # Decode independently of the locale; 'utf-8-sig' also removes a
        # leading BOM that would otherwise become part of the first name.
        with open(in_file, encoding='utf-8-sig') as fp:
            for line in fp:
                name = line.rstrip('\r\n')
                # A blank line is not a member name; yielding '' would make
                # consumers that index into the name fail.
                if name:
                    yield name

# Group the sound clips by headword:
#   words[headword] -> list of (member name, username) tuples.
# The headword is the file's base name without extension; the username is
# the name of the directory that directly contains the file.
words = defaultdict(list)
nclips = 0
for filename in get_zip_contents(args.input_file):
    # Skip directory entries.  The emptiness check keeps an empty name
    # (e.g. a blank line in a text listing) from raising IndexError.
    if not filename or filename.endswith('/'):
        continue
    # NOTE: rindex() raises ValueError when the name contains no '.', or no
    # '/' before that '.'.
    ext_idx = filename.rindex('.')
    word_idx = filename.rindex('/', 0, ext_idx)
    # rfind() returns -1 when there is no further slash, so the slice below
    # then simply starts at index 0.
    username_idx = filename.rfind('/', 0, word_idx)
    username = filename[username_idx+1:word_idx]
    # Decode %XX sequences, clean up unwanted characters, trim whitespace.
    word = unquote(filename[word_idx+1:ext_idx]).translate(del_garbage).strip()
    if normalizer:
        word = normalizer(word)
    if word:
        words[word].append( (filename, username) )
        nclips += 1
    else:
        print(f'{args.dict_name}: empty headword: {filename}', file=sys.stderr)

if args.merge_period:
    # Candidates are headwords ending in an ASCII comma, an ideographic
    # full stop or a fullwidth comma, plus headwords ending in an ASCII
    # period that contain a space.  They are visited in reverse sorted
    # order so that chains such as
    #   blah.. -> blah. -> blah
    # collapse one step at a time.
    period_comma = sorted(
        (hw for hw in words
         if hw[-1] in ',。，' or (hw[-1] == '.' and ' ' in hw)),
        reverse=True,
    )
    nmerged = 0
    for hw in period_comma:
        stem = hw[:-1]
        # Merge only when the headword without its last character exists.
        if stem in words:
            words[stem].extend(words.pop(hw))
            nmerged += 1

if args.verbose:
    # nmerged is only defined when merging was requested, so it is read
    # solely inside that branch.
    if args.merge_period:
        merged_msg = f' ({nmerged:,} merged)'
    else:
        merged_msg = ''
    summary = (f'{args.dict_name}: {nclips:,} sound clips in '
               f'{len(words):,} articles{merged_msg}')
    print(summary, file=sys.stderr)

# Re-key the clips by escaped headword.
escaped = {}
for raw_headword, clip_list in words.items():
    escaped[raw_headword.translate(esc_headword)] = clip_list

# DSL header: a BOM followed by the three directives and one empty line.
print(
    f'\ufeff#NAME "{args.dict_name}"',
    f'#INDEX_LANGUAGE "{args.index_lang}"',
    f'#CONTENTS_LANGUAGE "{args.contents_lang}"',
    '',
    sep='\n',
)

# One article per headword, in sorted order; within an article the clips
# are ordered by lowercased username.
for headword in sorted(escaped):
    print(headword)
    by_user = sorted(escaped[headword], key=lambda clip: clip[1].lower())
    for path, user in by_user:
        sound = path.translate(esc_body)
        credit = user.translate(esc_body)
        print(f'\t[m1][s]{sound}[/s] [c slategray][i]{credit}[/i][/c][/m]')


r'''
### .sub file examples ###
# syntax:
pattern1>>replacement1
pattern2>>replacement2
...
# empty lines and lines beginning with '#' are ignored;
# if you need to put '#' at the very beginning of your pattern,
# just use [#] or \# instead; also, make sure the pattern neither contains two
# '>' characters in a row nor ends with a trailing '>' (use [>] as appropriate);
# the 'replacement' part may contain arbitrary text


### normalize-whitespace.sub ###
# collapse all consecutive whitespace into a single space;
# note that the following line ends with a SPACE character:
\s+>>
### end of normalize-whitespace.sub ###


### ForvoDutch.sub ###
\s+>>
(?<!^),$>>
 \.$>>.
,([a-z])>>, \1

# 's, 't, zo'n, z'n
[’ʼ´`‘"]([stn])\b>>'\1
# d'r, Lieven D'hulst, rouge de l'ouest
\b([dl])[’ʼ´`‘"]>>\1'
ĳ>>ij
### end of ForvoDutch.sub ###


### ForvoEnglish.sub ###
(?<!^),$>>
 \.$>>.
([a-z]),([a-z])>>\1, \2
ﬁ>>fi
ﬂ>>fl
[’ʼ´`‘"′]([dmst]|ll|re|ve|tis|twas|em)\b>>'\1
# o'clock / O'Shea / y'all / y'know
\b([oy])’(?=[a-z])>>\1'
# ma'am / ma'
\b(ma)’\b>>\1'
# '90s
’([0-9]0s)\b>>'\1
# bangin'
([a-z]{2,}in)’(?![a-z])>>\1'
# e'er / ne'er / o'er
\b(e|o|ne)’(er)\b>>\1'\2
# nine days' wonder, farmers' market
# this sometimes produces false positives:
#([a-z]{2,}s)’(?=\s+[a-z])>>\1'
# and this does not allow for more than one match per line:
^([^‘]*?[a-z]{2,}s)’(?=\s+[a-z])>>\1'

#\bhis ‘helpful hints' about\b>>his ‘helpful hints’ about
#\bhumbling of the ‘queen of the sciences' is\b>>humbling of the ‘queen of the sciences’ is
#\bart of ‘righteous' slander\b>>art of ‘righteous’ slander

"it is the principle of the thing\.’$>>"it is the principle of the thing."
\bain’tcha\b>>ain'tcha
\banderson \.paak\b>>anderson paak
\bdeandre’ bembry\b>>deandre' bembry
\bde’ath\b>>de'ath
\bd’(amalfi|entrecasteaux|urbervilles)\b>>d'\1
\bentr’acte\b>>entr'acte
\bg’night\b>>g'night
\bi knew you were right all along \. never doubted you for second\.$>>i knew you were right all along. never doubted you for a second.
\bjohn o’ groats\b>>john o' groats
\blupita nyong’o\b>>lupita nyong'o
\bm’culloch\b>>m'culloch
\boshkosh b’gosh\b>>oshkosh b'gosh
\bsweet *['’]n low\b>>sweet'n low
^‘'tis cypher lies beneath\b>>'tis cypher lies beneath
### end of ForvoEnglish.sub ###


### ForvoFrench.sub ###
\b([cdjlmnstç]|qu|jusqu|lorsqu|puisqu|presqu|quelqu|aujourd)[’ʼ´`‘"]>>\1'
# WTF is i’humidité or i'asie
\bi['’´]([aeiouéèh])>>l'\1
[’`‘]>>'
‐>>-
 , >>,
\. \. \.>>...
 \.$>>.
(?<!^),$>>
 t-il>>-t-il
# doux d'Espagne
\bd ([ae])>>d'\1
# y a-t-il
\by-a-t>>y a-t
\baujourd hui\b>>aujourd'hui
\bemmenez moi\b>>emmenez-moi
\bl '>>l'
^l >>il
^a (l'|la|bientôt|très bientôt|demain|plus|mesure|marée basse|quoi bon|partir|bas|bout|ce soir|consommer|mon|son|point|vous|nous|toi|vos|tes|table|quelle heure|combien|gauche)\b>>à \1
# convert back: a l'air (sympa) / à l'air libre
^à l'air$>>a l'air
# convert back: a l'intention / à l'intention de (qqn)
^à l'intention$>>a l'intention
\btu a\b>>tu as
\bca\b(?!\.$)>>ça
\bapr[eé]s\b>>après
\b(baptist|berg|boni|brugui|carr|carri|cimeti|courri|cuisini|elzi|f|ferr|ferri|fremi|fr|fresni|fureti|gibeci|goug|gravi|grenouilli|laudoni|lecl|lotbini|lumi|massi|meissoni|messali|orni|rivi|salonni|truch|vassi|vell)ere\b>>\1ère
\b(bouch|bussi|chevali|coug|deshouli|eygali|favi|goug|houli|humi|joncqui|mazi|palli|perri|peyri|savenni|serreudi)eres\b>>\1ères
\bgrandm[eè]re\b>>grand-mère
vayssìere>>vayssière
\btaillefere\b>>tailleferre
\byvresse legere\b>>ivresse légère
\betre\b>>être
saint germain des pres>>saint-germain-des-prés
quelque-chose>>quelque chose
a-peu-près>>à-peu-près
\ba (paris|cannes|castelreng|bouzy|croire|grignoter|présent)\b>>à \1
\b(parler|demander) a\b>>\1 à
\ba l('école|'arrache|'aise|a cave)\b>>à l\1
\b[aà] [eé]milie\b>>à émilie
\bà beau mentir>>a beau mentir
\btout a fait\b>>tout à fait
\bquelle dommage\b>>quel dommage
\b(gratin|ann|chamois|cuv|gicl|sens)ee\b>>\1ée
\bmusee>>musée
\brandonee\b>>randonnée
\bepee\b>>épée
\babimee\b>>abîmée
\bmarne la vallee\b>>marne-la-vallée
\bnous chargons\b>>nous chargeons
\bje mangais\b>>je mangeais
\bombragaient\b>>ombrageaient
\bcontinuerent\b>>continuèrent
\bangoul[eè]me\b>>angoulême
\b(cr|3|6)eme\b>>\1ème
\b(deux|trois|quatr)iemes\b>>\1ièmes
\bpraticant\b>>pratiquant
\bsamuel sorbiére\b>>samuel sorbière
\ben règle génére\b>>en règle générale
\breglelmantee\b>>réglementée
\bmille tonneres\b>>mille tonnerres
",original=">>""
,([a-zéèêàç])>>, \1

\bqu'est-ce qu-on fait\b>>qu'est-ce qu'on fait
\bqeulqu'un\b>>quelqu'un
\bje serre ia main\b>>je serre la main
\bsi'l vous plaît\b>>s'il vous plaît
# what on earth is this
\babédamebondiou l'étiant pourtant\b>>abédamebondiou i'étiant pourtant
\bau volant　de\b>>au volant de
\btenir de quelq'un\b>>tenir de quelqu'un
\bdirac´h\b>>dirac'h
\btiens, voila justement ma'ame baptieret\b>>tiens, voilà justement madame baptieret
\bvoila\b>>voilà
\bj'agis toujours a mon gré\b>>j'agis toujours à mon gré
\bquelle date sommes-nous aujourd'hui ？>>quelle date sommes-nous aujourd'hui ?
\bil peut y a voir plusieurs\b>>il peut y avoir plusieurs
\bil abattit l'arbre a coups de hache\b>>il abattit l'arbre à coups de hache
\bplein de vie et trés amical avec l'homme\b>>plein de vie et très amical avec l'homme
\bavoir été bercé un peu prés du mur\b>>avoir été bercé un peu près du mur
\bnous avons pris un autobus et nous sommes allés a quelque kilomètres d'alger\b>>nous avons pris un autobus et nous sommes allés à quelques kilomètres d'alger
\bavoir une idée derrière une tête\b>>avoir une idée derrière la tête
\bil n'y a a pas lieu de\b>>il n'y a pas lieu de
\bappellation originale controlée\b>>appellation originale contrôlée
\bune chaîne d'information en continu}} qui\b>>une chaîne d'information en continu qui
\bceux-la\b>>ceux-là
\bcelui-la\b>>celui-là
\bgressoney saint-jean\b>>gressoney-saint-jean
\bgressoney-la trinité\b>>gressoney-la-trinité
\bmom petit coeur\b>>mon petit coeur
### end of ForvoFrench.sub ###
'''