Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # encoding: utf-8
- """
- split_ln.py
- Created by Neal Caren on 2012-05-14.
- neal.caren@unc.edu
- Edited by Alex Hanna on 2015-01-29
- alex.hanna@gmail.com
- Takes a downloaded plain text LexisNexis file and converts it into a CSV file or set of flat files.
- """
- import argparse, csv, os, re, sys
- from datetime import datetime
- parser = argparse.ArgumentParser(description='Parse Lexis-Nexis files into different outputs.')
- parser.add_argument('files', metavar='file', type=str, nargs='+', help='Lexis-Nexis files to be parsed.')
- parser.add_argument('--output_dir', dest='output', action='store', help='Directory in which to store the output.')
- parser.add_argument('--sep', dest='sep', const='sep', default='csv', action='store_const',
- help = 'Flag to store output in separate files.')
- args = parser.parse_args()
- if args.output:
- if not os.path.isdir(args.output):
- print("Not a valid directory.")
- sys.exit(-1)
- else:
- args.output = "."
- ## set permanent columns
- header = ['SEARCH_ID', 'PUBLICATION', 'DATE', 'TITLE', 'EDITION']
- if args.sep == 'csv':
- ## use today as a hash to store
- today_str = datetime.today().strftime('%Y-%m-%d')
- outname = "%s/lexis-nexis_%s.csv" % (args.output, today_str)
- # setup the output file
- outfile = open(outname,'wb')
- writer = csv.writer(outfile)
- for fn in args.files:
- print('Processing %s' % fn)
- header_written = False
- # read the file
- lnraw = open(fn).read()
- # silly hack to find the end of the documents
- workfile = re.sub(' Copyright .*?\\r\\n','ENDOFILE',lnraw)
- # clean up crud at the beginning of the file
- workfile = workfile.replace('\xef\xbb\xbf\r\n','')
- # split the file into a list of documents
- workfile = workfile.split('ENDOFILE')
- # remove blank rows
- workfile = [f for f in workfile if len(f.split('\r\n\r\n')) > 2]
- # Figure out what metadata is being reported
- meta_list = list(set(re.findall('\\n([A-Z][A-Z-]*?):',lnraw)))
- # Keep only the commonly occuring metadata
- meta_list = [m for m in meta_list if float(lnraw.count(m)) / len(workfile) > .20]
- if args.sep == 'csv':
- header.extend(meta_list)
- header.append('TEXT')
- ## write header if this hasn't been done
- ## TK: Not sure how to deal with the case where metadata changes
- ## between different input files
- if not header_written:
- writer.writerow(header)
- header_written = True
- ## Begin loop over each article
- for f in workfile:
- # Split into lines, and clean up the hard returns at the end of each line.
- # Also removes blank lines that the occasional copyright lines
- filessplit = [row.replace('\r\n', ' ').strip() for row in f.split('\r\n\r\n') if len(row) > 0 and 'All Rights Reserved' not in row]
- ## make metadata dict
- meta_dict = {k : '' for k in header}
- doc_id = filessplit[0].strip().split(' ')[0]
- pub = filessplit[1].strip()
- date_ed = filessplit[2].strip()
- title = filessplit[3].strip()
- ## format date into YYYY-MM-DD
- da = date_ed.replace(',', '').split()
- date = datetime.strptime(" ".join(da[0:3]), "%B %d %Y")
- date = date.strftime("%Y-%m-%d")
- ## format edition
- ## TK: maybe remove?
- ed = date_ed.replace(date,'').split(' ')[-1].lstrip()
- ## if edition is a time or day, skip it
- if 'GMT' in ed or 'day' in ed:
- ed = ''
- ## Edit the text and other information
- paragraphs = []
- for line in filessplit[5:]:
- ## find out if this line is part of the main text
- if len(line) > 0 and line[:2] != ' ' and line != line.upper() and len(re.findall('^[A-Z][A-Z-]*?:',line)) == 0 and title not in line:
- ## remove new lines
- line = re.sub(r'\s+', ' ', line)
- ## not sure what this does
- line = line.replace('","','" , "')
- ## add to paragraph array
- paragraphs.append(line)
- else:
- metacheck = re.findall('^([A-Z][A-Z-]*?):', line)
- if len(metacheck) > 0:
- if metacheck[0] in meta_list:
- meta_dict[metacheck[0]] = line.replace(metacheck[0] + ': ','')
- ## put everything in the metadata dictionary
- meta_dict['PUBLICATION'] = pub
- meta_dict['SEARCH_ID'] = doc_id
- meta_dict['DATE'] = date
- meta_dict['TITLE'] = title
- meta_dict['EDITION'] = ed
- if args.sep == 'csv':
- ## add the text to the dict to write
- meta_dict['TEXT'] = " ".join(paragraphs)
- # Output the results to a single csv file
- writer.writerow( [ meta_dict[x] for x in header ] )
- else:
- ## otherwise, store as separate files
- ## put each piece of meta info on a single line
- out = "%s/%s_%s.txt" % (args.output, doc_id, date)
- fh = open(out, 'w')
- ## write title and date first for separate files
- fh.write('TITLE: %s\n' % meta_dict['TITLE'])
- fh.write('DATE: %s\n' % meta_dict['DATE'])
- ## then write the rest
- for k,v in meta_dict.iteritems():
- if k not in ['TITLE', 'DICT']:
- fh.write('%s: %s\n' % (k,v))
- ## write the text last
- fh.write("\n\n".join(paragraphs) + "\n")
- fh.close()
- print('Wrote %s' % doc_id)
- if args.sep == 'csv':
- outfile.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement