Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- import sys
- import argparse
- import zipfile
- import re
- from urllib.parse import unquote
- from collections import defaultdict
- '''
- Changelog
- 2023-02-20
- - assorted minor code cleanups
- 2022-05-12
- - minor fix: escape ~<>{}#@^ everywhere, not just in headwords
- 2022-04-17
- - contents_lang is now optional: there is very little reason to set it
- to anything other than English anyway;
- - two new command line options have been introduced:
- '-s <sub_file>' loads a (zip-specific) set of regex>>replacement pairs;
- for examples of how this works, see the comment at the bottom;
- '-p' normalizes headwords ending in a period or a comma:
- 'some phrase[.,]' gets merged into 'some phrase' *IF* the latter
- already exists;
- - file lists are now accepted as well as actual .zip files,
- mainly so that I don't have to keep around all those huge .ZIPs
- 2022-04-05
- numerous minor fixes: handle URL-encoded sequences, weed out all kinds of
- control characters and other non-printable garbage, trim whitespace
- '''
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    """Help formatter mixing two stock argparse formatters.

    Inherits from both ArgumentDefaultsHelpFormatter and
    RawDescriptionHelpFormatter; it adds no behavior of its own.
    """
# Command-line interface. The epilog is kept in its own variable so the
# ArgumentParser() call stays short; it is shown verbatim at the end of --help.
_usage_examples = '''usage examples:
%(prog)s -v -p zh.zip 'Forvo Mandarin' Chinese >ForvoMandarin.dsl
%(prog)s -v -p -s ForvoDutch.sub nl.zip 'Forvo Dutch' Dutch >ForvoDutch.dsl
'''

parser = argparse.ArgumentParser(
    description='DSL file generator for Forvo/LinguaLibre datasets',
    epilog=_usage_examples,
    formatter_class=CustomFormatter,
    allow_abbrev=False,
)

# Required positionals, in command-line order.
for _arg_name, _arg_help in (
    ('input_file', 'an actual zip file or a text file listing its contents'),
    ('dict_name', 'dictionary name (#NAME)'),
    ('index_lang', '#INDEX_LANGUAGE'),
):
    parser.add_argument(_arg_name, help=_arg_help)

# Optional fourth positional; falls back to 'English' when omitted.
parser.add_argument(
    'contents_lang',
    nargs='?',
    default='English',
    help='#CONTENTS_LANGUAGE',
)

parser.add_argument(
    '-v', '--verbose',
    action='store_true',
    help='verbose mode',
)

# action='append' collects every -s occurrence into a list
# (args.sub_file stays None when the option is never given).
parser.add_argument(
    '-s', '--sub-file',
    metavar='SUB_FILE',
    action='append',
    help='load pattern>>replacement pairs from SUB_FILE; '
         'you can stack together as many .sub files as you want: '
         '-s file1.sub -s file2.sub...',
)

parser.add_argument(
    '-p', '--merge-period',
    action='store_true',
    help="normalize headwords ending in a period or a comma: "
         "'some phrase[.,]' gets merged into 'some phrase' *IF* the latter "
         "already exists; this does not affect single-word abbreviations "
         "such as etc./ca./ibid.",
)

# Parsed at import time: the rest of the script reads this module-level `args`.
args = parser.parse_args()
class TextSub:
    """An ordered chain of regex substitutions applied to a string.

    Each positional constructor argument is either

    * a ``str`` -- treated as the name of a .sub file and passed to
      :meth:`load_file`, or
    * a ``(pattern, replacement)`` pair -- compiled case-insensitively.

    Calling the instance runs every substitution, in the order it was
    added, and returns the resulting string.
    """

    def __init__(self, *args):
        # List of (compiled pattern, replacement) tuples, in application order.
        self.subs = []
        for a in args:
            if isinstance(a, str):
                self.load_file(a)
            else:
                pat, repl = a
                self.add_sub(pat, repl, re.IGNORECASE)

    def add_sub(self, pat, repl, flags=0):
        """Compile *pat* with *flags* and append it with *repl* to the chain.

        Raises:
            re.error: if *pat* is not a valid regular expression.
        """
        self.subs.append((re.compile(pat, flags), repl))

    def load_file(self, filename):
        """Append every ``pattern>>replacement`` line of *filename*.

        Lines starting with '#' and whitespace-only lines are skipped.
        Every other line is split at its first '>>'; the pattern is
        compiled with ``re.IGNORECASE``.

        Raises:
            ValueError: if a non-comment line contains no '>>' separator.
        """
        # .sub files routinely contain non-ASCII characters (typographic
        # apostrophes, accented letters), so decode them explicitly instead
        # of relying on the locale default.  'utf-8-sig' also drops a
        # leading BOM, which would otherwise become part of the first line
        # and stop a leading comment from being recognised.
        with open(filename, encoding='utf-8-sig') as fp:
            for line in fp:
                if line.startswith('#') or line.isspace():
                    continue
                pat, sep, repl = line.rstrip('\r\n').partition('>>')
                if sep != '>>':
                    raise ValueError(f'{filename}: malformed regex>>repl '
                                     f'line: {line}')
                self.add_sub(pat, repl, re.IGNORECASE)

    def __call__(self, s):
        """Return *s* after applying every substitution in order."""
        for pat, repl in self.subs:
            s = pat.sub(repl, s)
        return s
# Build the headword normalizer from the -s files, if any were given;
# None means "no normalization".
normalizer = TextSub(*args.sub_file) if args.sub_file else None
# Characters deleted outright from headwords.
_junk_chars = (
    '\u0010\u007f\u0080\u0085\u0086\u0088\u008a\u008d\u008f\u0090\u0092\u0093'
    '\u200b\u200c\u200d\u200e\u200f\u202a\u202b\u202c\u202d\u202e\u206f\ufeff'
)
# Translation table: U+00A0 becomes a plain space, junk characters vanish.
del_garbage = str.maketrans('\u00a0', ' ', _junk_chars)


def _backslash_table(chars):
    """Return a translation table that prefixes each of *chars* with '\\'."""
    return str.maketrans({ch: '\\' + ch for ch in chars})


esc_headword = _backslash_table(r'\[](){}<>#@~^')
# {} and <> need to be escaped too: {{ comment }}, <<ref>>
# (unlike headwords, parentheses are left alone here)
esc_body = _backslash_table(r'\[]{}<>#@~^')
def get_zip_contents(in_file):
    """Yield the member names of an archive, or the lines of a listing file.

    If *in_file* ends in '.zip' (case-insensitively) it is opened as a zip
    archive and every name from ``namelist()`` is yielded.  Otherwise it is
    read as a text file with one name per line; trailing CR/LF characters
    are stripped from each line.
    """
    if in_file.lower().endswith('.zip'):
        with zipfile.ZipFile(in_file) as zip_ref:
            yield from zip_ref.namelist()
    else:
        # Listing files hold arbitrary (often non-ASCII) headwords, so decode
        # them explicitly rather than with the locale default; 'utf-8-sig'
        # also discards a leading BOM that would otherwise corrupt the first
        # name.
        with open(in_file, encoding='utf-8-sig') as fp:
            for line in fp:
                yield line.rstrip('\r\n')
# Maps each cleaned-up headword to the list of (archive path, username)
# pairs of its sound clips; nclips counts every clip that was kept.
words = defaultdict(list)
nclips = 0
for filename in get_zip_contents(args.input_file):
    # Skip directory entries, and blank lines that a listing file may
    # contain (indexing an empty string would raise IndexError).
    if not filename or filename.endswith('/'):
        continue
    # Expected layout: .../<username>/<word>.<ext>
    # NOTE: rindex() raises ValueError for an entry with no '.' or with no
    # '/' before its extension.
    ext_idx = filename.rindex('.')
    word_idx = filename.rindex('/', 0, ext_idx)
    # even if there is no further slash, this still works as expected:
    # rfind() returns -1, so the username starts at index 0
    username_idx = filename.rfind('/', 0, word_idx)
    username = filename[username_idx+1:word_idx]
    # URL-decode the word, drop garbage characters, trim whitespace.
    word = unquote(filename[word_idx+1:ext_idx]).translate(del_garbage).strip()
    if normalizer:
        word = normalizer(word)
    if word:
        words[word].append((filename, username))
        nclips += 1
    else:
        print(f'{args.dict_name}: empty headword: {filename}', file=sys.stderr)
if args.merge_period:
    # Candidates: headwords ending in a comma-like character or an
    # ideographic full stop, plus multi-word headwords (containing a space)
    # ending in '.'.
    # Reverse-sorted to account for weird scenarios such as
    # blah.. -> blah. -> blah
    period_comma = sorted(
        (hw for hw in words
         if hw[-1] in ',。,' or (hw[-1] == '.' and ' ' in hw)),
        reverse=True,
    )
    nmerged = 0
    for hw in period_comma:
        stem = hw[:-1]
        # Merge only when the headword without its final character
        # already exists as an article of its own.
        if stem in words:
            words[stem].extend(words.pop(hw))
            nmerged += 1
# In verbose mode, report clip/article totals on stderr.
if args.verbose:
    if args.merge_period:
        merged_msg = f' ({nmerged:,} merged)'
    else:
        merged_msg = ''
    summary = (f'{args.dict_name}: {nclips:,} sound clips in '
               f'{len(words):,} articles{merged_msg}')
    print(summary, file=sys.stderr)
# Escape DSL metacharacters in every headword before sorting and printing.
escaped = {}
for raw_headword, clip_list in words.items():
    escaped[raw_headword.translate(esc_headword)] = clip_list

# File header: BOM followed by the three DSL directives; print() appends one
# more newline, leaving a blank line before the first article.
dsl_header = (
    '\ufeff'
    f'#NAME "{args.dict_name}"\n'
    f'#INDEX_LANGUAGE "{args.index_lang}"\n'
    f'#CONTENTS_LANGUAGE "{args.contents_lang}"\n'
)
print(dsl_header)


def _username_key(clip):
    """Sort key: the clip's username, compared case-insensitively."""
    return clip[1].lower()


# One article per headword, in sorted headword order; within an article the
# clips are ordered by username.
for headword in sorted(escaped):
    print(headword)
    snds = escaped[headword]
    for filename, username in sorted(snds, key=_username_key):
        sound_tag = f'[m1][s]{filename.translate(esc_body)}[/s]'
        user_tag = f'[c slategray][i]{username.translate(esc_body)}[/i][/c][/m]'
        print(f'\t{sound_tag} {user_tag}')
- '''
- ### .sub file examples ###
- # syntax:
- pattern1>>replacement1
- pattern2>>replacement2
- ...
- # empty lines and lines beginning with '#' are ignored;
- # if you need to put '#' at the very beginning of your pattern,
- # just use [#] or \# instead; also, make sure the pattern neither contains two
- # '>' characters in a row nor ends with a trailing '>' (use [>] as appropriate);
- # the 'replacement' part may contain arbitrary text
- ### normalize-whitespace.sub ###
- # collapse all consecutive whitespace into a single space;
- # note that the following line ends with a SPACE character:
- \s+>>
- ### end of normalize-whitespace.sub ###
- ### ForvoDutch.sub ###
- \s+>>
- (?<!^),$>>
- \.$>>.
- ,([a-z])>>, \1
- # 's, 't, zo'n, z'n
- [’ʼ´`‘"]([stn])\b>>'\1
- # d'r, Lieven D'hulst, rouge de l'ouest
- \b([dl])[’ʼ´`‘"]>>\1'
- ij>>ij
- ### end of ForvoDutch.sub ###
- ### ForvoEnglish.sub ###
- (?<!^),$>>
- \.$>>.
- ([a-z]),([a-z])>>\1, \2
- fi>>fi
- fl>>fl
- [’ʼ´`‘"′]([dmst]|ll|re|ve|tis|twas|em)\b>>'\1
- # o'clock / O'Shea / y'all / y'know
- \b([oy])’(?=[a-z])>>\1'
- # ma'am / ma'
- \b(ma)’\b>>\1'
- # '90s
- ’([0-9]0s)\b>>'\1
- # bangin'
- ([a-z]{2,}in)’(?![a-z])>>\1'
- # e'er / ne'er / o'er
- \b(e|o|ne)’(er)\b>>\1'\2
- # nine days' wonder, farmers' market
- # this sometimes produces false positives:
- #([a-z]{2,}s)’(?=\s+[a-z])>>\1'
- # and this does not allow for more than one match per line:
- ^([^‘]*?[a-z]{2,}s)’(?=\s+[a-z])>>\1'
- #\bhis ‘helpful hints' about\b>>his ‘helpful hints’ about
- #\bhumbling of the ‘queen of the sciences' is\b>>humbling of the ‘queen of the sciences’ is
- #\bart of ‘righteous' slander\b>>art of ‘righteous’ slander
- "it is the principle of the thing\.’$>>"it is the principle of the thing."
- \bain’tcha\b>>ain'tcha
- \banderson \.paak\b>>anderson paak
- \bdeandre’ bembry\b>>deandre' bembry
- \bde’ath\b>>de'ath
- \bd’(amalfi|entrecasteaux|urbervilles)\b>>d'\1
- \bentr’acte\b>>entr'acte
- \bg’night\b>>g'night
- \bi knew you were right all along \. never doubted you for second\.$>>i knew you were right all along. never doubted you for a second.
- \bjohn o’ groats\b>>john o' groats
- \blupita nyong’o\b>>lupita nyong'o
- \bm’culloch\b>>m'culloch
- \boshkosh b’gosh\b>>oshkosh b'gosh
- \bsweet *['’]n low\b>>sweet'n low
- ^‘'tis cypher lies beneath\b>>'tis cypher lies beneath
- ### end of ForvoEnglish.sub ###
- ### ForvoFrench.sub ###
- \b([cdjlmnstç]|qu|jusqu|lorsqu|puisqu|presqu|quelqu|aujourd)[’ʼ´`‘"]>>\1'
- # WTF is i’humidité or i'asie
- \bi['’´]([aeiouéèh])>>l'\1
- [’`‘]>>'
- ‐>>-
- , >>,
- \. \. \.>>...
- \.$>>.
- (?<!^),$>>
- t-il>>-t-il
- # doux d'Espagne
- \bd ([ae])>>d'\1
- # y a-t-il
- \by-a-t>>y a-t
- \baujourd hui\b>>aujourd'hui
- \bemmenez moi\b>>emmenez-moi
- \bl '>>l'
- ^l >>il
- ^a (l'|la|bientôt|très bientôt|demain|plus|mesure|marée basse|quoi bon|partir|bas|bout|ce soir|consommer|mon|son|point|vous|nous|toi|vos|tes|table|quelle heure|combien|gauche)\b>>à \1
- # convert back: a l'air (sympa) / à l'air libre
- ^à l'air$>>a l'air
- # convert back: a l'intention / à l'intention de (qqn)
- ^à l'intention$>>a l'intention
- \btu a\b>>tu as
- \bca\b(?!\.$)>>ça
- \bapr[eé]s\b>>après
- \b(baptist|berg|boni|brugui|carr|carri|cimeti|courri|cuisini|elzi|f|ferr|ferri|fremi|fr|fresni|fureti|gibeci|goug|gravi|grenouilli|laudoni|lecl|lotbini|lumi|massi|meissoni|messali|orni|rivi|salonni|truch|vassi|vell)ere\b>>\1ère
- \b(bouch|bussi|chevali|coug|deshouli|eygali|favi|goug|houli|humi|joncqui|mazi|palli|perri|peyri|savenni|serreudi)eres\b>>\1ères
- \bgrandm[eè]re\b>>grand-mère
- vayssìere>>vayssière
- \btaillefere\b>>tailleferre
- \byvresse legere\b>>ivresse légère
- \betre\b>>être
- saint germain des pres>>saint-germain-des-prés
- quelque-chose>>quelque chose
- a-peu-près>>à-peu-près
- \ba (paris|cannes|castelreng|bouzy|croire|grignoter|présent)\b>>à \1
- \b(parler|demander) a\b>>\1 à
- \ba l('école|'arrache|'aise|a cave)\b>>à l\1
- \b[aà] [eé]milie\b>>à émilie
- \bà beau mentir>>a beau mentir
- \btout a fait\b>>tout à fait
- \bquelle dommage\b>>quel dommage
- \b(gratin|ann|chamois|cuv|gicl|sens)ee\b>>\1ée
- \bmusee>>musée
- \brandonee\b>>randonnée
- \bepee\b>>épée
- \babimee\b>>abîmée
- \bmarne la vallee\b>>marne-la-vallée
- \bnous chargons\b>>nous chargeons
- \bje mangais\b>>je mangeais
- \bombragaient\b>>ombrageaient
- \bcontinuerent\b>>continuèrent
- \bangoul[eè]me\b>>angoulême
- \b(cr|3|6)eme\b>>\1ème
- \b(deux|trois|quatr)iemes\b>>\1ièmes
- \bpraticant\b>>pratiquant
- \bsamuel sorbiére\b>>samuel sorbière
- \ben règle génére\b>>en règle générale
- \breglelmantee\b>>réglementée
- \bmille tonneres\b>>mille tonnerres
- ",original=">>""
- ,([a-zéèêàç])>>, \1
- \bqu'est-ce qu-on fait\b>>qu'est-ce qu'on fait
- \bqeulqu'un\b>>quelqu'un
- \bje serre ia main\b>>je serre la main
- \bsi'l vous plaît\b>>s'il vous plaît
- # what on earth is this
- \babédamebondiou l'étiant pourtant\b>>abédamebondiou i'étiant pourtant
- \bau volant de\b>>au volant de
- \btenir de quelq'un\b>>tenir de quelqu'un
- \bdirac´h\b>>dirac'h
- \btiens, voila justement ma'ame baptieret\b>>tiens, voilà justement madame baptieret
- \bvoila\b>>voilà
- \bj'agis toujours a mon gré\b>>j'agis toujours à mon gré
- \bquelle date sommes-nous aujourd'hui ?>>quelle date sommes-nous aujourd'hui ?
- \bil peut y a voir plusieurs\b>>il peut y avoir plusieurs
- \bil abattit l'arbre a coups de hache\b>>il abattit l'arbre à coups de hache
- \bplein de vie et trés amical avec l'homme\b>>plein de vie et très amical avec l'homme
- \bavoir été bercé un peu prés du mur\b>>avoir été bercé un peu près du mur
- \bnous avons pris un autobus et nous sommes allés a quelque kilomètres d'alger\b>>nous avons pris un autobus et nous sommes allés à quelques kilomètres d'alger
- \bavoir une idée derrière une tête\b>>avoir une idée derrière la tête
- \bil n'y a a pas lieu de\b>>il n'y a pas lieu de
- \bappellation originale controlée\b>>appellation originale contrôlée
- \bune chaîne d'information en continu}} qui\b>>une chaîne d'information en continu qui
- \bceux-la\b>>ceux-là
- \bcelui-la\b>>celui-là
- \bgressoney saint-jean\b>>gressoney-saint-jean
- \bgressoney-la trinité\b>>gressoney-la-trinité
- \bmom petit coeur\b>>mon petit coeur
- ### end of ForvoFrench.sub ###
- '''
Add Comment
Please, Sign In to add comment