Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- #Prune the database dump of all articles starting with Wikipedia namespace.
- #Usage: pv enwiki-latest-pages-articles.xml.bz2 | bunzip2 | ../prune-wiki | bzip2 > out.xml.bz2
- # pbzip2 .bz2 files don't seem to work with wikitaxi
- #File must end in .xml.bz2
- #Get the dump:
- #wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
- import sys, re
- #try cutting out <ref> to </ref> ???
- #Cutting out File:|MediaWiki:|Help: saves only 2%
- #test various cuts before publishing.
- #Note what's cut in the readme.txt
- #List of articles in namespace:
- #http://en.wikipedia.org/w/index.php?title=Special%3ASearch&redirs=1&search=a&fulltext=Search&ns8=1&title=Special%3ASearch&advanced=1&fulltext=Advanced+search
- #Help: help editing wikipedia (how to start a page)
- #MediaWiki: various website scripts?
- #File: Information about files
- #Stats:
- #Original: 6.8 GB
- #Wikipedia:|MediaWiki:|Help:|File: 5.5 GB
- #Wikipedia:|Help:|MediaWiki: 5.6 GB
- #Wikipedia: 5.6 GB
# Regular expression matched against the START of each page title; pages whose
# title matches are cut from the output. Separate alternatives with '|'.
bad_titles = 'Wikipedia:'  # <-- Best default cut
#bad_titles+='|Help:|MediaWiki:'
#bad_titles+='|File:'

# A <title> line at any indentation; group 1 is everything after the tag.
_TITLE_RE = re.compile(r'\s*<title>(.*)')


def prune_lines(lines, pattern=bad_titles):
    """Yield the input lines, skipping pages whose title matches *pattern*.

    lines   -- iterable of text lines (newlines included), e.g. sys.stdin
    pattern -- regex anchored at the start of the title text

    Lines are yielded unmodified, so the caller must write them out without
    appending another newline.

    NOTE: the keep/drop state only changes on a <title> line. Lines that come
    before a title (for instance an enclosing tag on its own line) therefore
    follow the state set by the PREVIOUS title, and everything before the
    first title is kept.
    """
    is_bad = re.compile(pattern).match  # compiled once, not once per line
    output = True
    for line in lines:
        found = _TITLE_RE.match(line)
        if found:
            output = not is_bad(found.group(1))
        if output:
            yield line


def main():
    """Filter stdin to stdout, reporting the active cut pattern on stderr."""
    sys.stderr.write('Cutting ' + bad_titles + '\n')
    write = sys.stdout.write
    for line in prune_lines(sys.stdin):
        # write() instead of print: each line already ends with its own
        # newline, and print would add a second one after every line.
        write(line)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement