Advertisement
Guest User

Untitled

a guest
Jan 29th, 2015
193
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.58 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3. """
  4. split_ln.py
  5.  
  6. Created by Neal Caren on 2012-05-14.
  7. neal.caren@unc.edu
  8.  
  9. Edited by Alex Hanna on 2015-01-29
  10. alex.hanna@gmail.com
  11.  
  12. Takes a downloaded plain text LexisNexis file and converts it into a CSV file or set of flat files.
  13.  
  14. """
  15.  
  16. import argparse, csv, os, re, sys
  17. from datetime import datetime
  18.  
  19. parser = argparse.ArgumentParser(description='Parse Lexis-Nexis files into different outputs.')
  20. parser.add_argument('files', metavar='file', type=str, nargs='+', help='Lexis-Nexis files to be parsed.')
  21. parser.add_argument('--output_dir', dest='output', action='store', help='Directory in which to store the output.')
  22. parser.add_argument('--sep', dest='sep', const='sep', default='csv', action='store_const',
  23. help = 'Flag to store output in separate files.')
  24.  
  25. args = parser.parse_args()
  26.  
  27. if args.output:
  28. if not os.path.isdir(args.output):
  29. print("Not a valid directory.")
  30. sys.exit(-1)
  31. else:
  32. args.output = "."
  33.  
  34. ## set permanent columns
  35. header = ['SEARCH_ID', 'PUBLICATION', 'DATE', 'TITLE', 'EDITION']
  36.  
  37. if args.sep == 'csv':
  38. ## use today as a hash to store
  39. today_str = datetime.today().strftime('%Y-%m-%d')
  40. outname = "%s/lexis-nexis_%s.csv" % (args.output, today_str)
  41.  
  42. # setup the output file
  43. outfile = open(outname,'wb')
  44. writer = csv.writer(outfile)
  45.  
  46. for fn in args.files:
  47. print('Processing %s' % fn)
  48. header_written = False
  49.  
  50. # read the file
  51. lnraw = open(fn).read()
  52.  
  53. # silly hack to find the end of the documents
  54. workfile = re.sub(' Copyright .*?\\r\\n','ENDOFILE',lnraw)
  55.  
  56. # clean up crud at the beginning of the file
  57. workfile = workfile.replace('\xef\xbb\xbf\r\n','')
  58.  
  59. # split the file into a list of documents
  60. workfile = workfile.split('ENDOFILE')
  61.  
  62. # remove blank rows
  63. workfile = [f for f in workfile if len(f.split('\r\n\r\n')) > 2]
  64.  
  65. # Figure out what metadata is being reported
  66. meta_list = list(set(re.findall('\\n([A-Z][A-Z-]*?):',lnraw)))
  67.  
  68. # Keep only the commonly occuring metadata
  69. meta_list = [m for m in meta_list if float(lnraw.count(m)) / len(workfile) > .20]
  70.  
  71. if args.sep == 'csv':
  72. header.extend(meta_list)
  73. header.append('TEXT')
  74.  
  75. ## write header if this hasn't been done
  76. ## TK: Not sure how to deal with the case where metadata changes
  77. ## between different input files
  78. if not header_written:
  79. writer.writerow(header)
  80. header_written = True
  81.  
  82. ## Begin loop over each article
  83. for f in workfile:
  84. # Split into lines, and clean up the hard returns at the end of each line.
  85. # Also removes blank lines that the occasional copyright lines
  86. filessplit = [row.replace('\r\n', ' ').strip() for row in f.split('\r\n\r\n') if len(row) > 0 and 'All Rights Reserved' not in row]
  87.  
  88. ## make metadata dict
  89. meta_dict = {k : '' for k in header}
  90.  
  91. doc_id = filessplit[0].strip().split(' ')[0]
  92. pub = filessplit[1].strip()
  93. date_ed = filessplit[2].strip()
  94. title = filessplit[3].strip()
  95.  
  96. ## format date into YYYY-MM-DD
  97. da = date_ed.replace(',', '').split()
  98. date = datetime.strptime(" ".join(da[0:3]), "%B %d %Y")
  99. date = date.strftime("%Y-%m-%d")
  100.  
  101. ## format edition
  102. ## TK: maybe remove?
  103. ed = date_ed.replace(date,'').split(' ')[-1].lstrip()
  104.  
  105. ## if edition is a time or day, skip it
  106. if 'GMT' in ed or 'day' in ed:
  107. ed = ''
  108.  
  109. ## Edit the text and other information
  110. paragraphs = []
  111. for line in filessplit[5:]:
  112. ## find out if this line is part of the main text
  113. if len(line) > 0 and line[:2] != ' ' and line != line.upper() and len(re.findall('^[A-Z][A-Z-]*?:',line)) == 0 and title not in line:
  114. ## remove new lines
  115. line = re.sub(r'\s+', ' ', line)
  116.  
  117. ## not sure what this does
  118. line = line.replace('","','" , "')
  119.  
  120. ## add to paragraph array
  121. paragraphs.append(line)
  122. else:
  123. metacheck = re.findall('^([A-Z][A-Z-]*?):', line)
  124. if len(metacheck) > 0:
  125. if metacheck[0] in meta_list:
  126. meta_dict[metacheck[0]] = line.replace(metacheck[0] + ': ','')
  127.  
  128. ## put everything in the metadata dictionary
  129. meta_dict['PUBLICATION'] = pub
  130. meta_dict['SEARCH_ID'] = doc_id
  131. meta_dict['DATE'] = date
  132. meta_dict['TITLE'] = title
  133. meta_dict['EDITION'] = ed
  134.  
  135. if args.sep == 'csv':
  136. ## add the text to the dict to write
  137. meta_dict['TEXT'] = " ".join(paragraphs)
  138.  
  139. # Output the results to a single csv file
  140. writer.writerow( [ meta_dict[x] for x in header ] )
  141. else:
  142. ## otherwise, store as separate files
  143. ## put each piece of meta info on a single line
  144. out = "%s/%s_%s.txt" % (args.output, doc_id, date)
  145. fh = open(out, 'w')
  146.  
  147. ## write title and date first for separate files
  148. fh.write('TITLE: %s\n' % meta_dict['TITLE'])
  149. fh.write('DATE: %s\n' % meta_dict['DATE'])
  150.  
  151. ## then write the rest
  152. for k,v in meta_dict.iteritems():
  153. if k not in ['TITLE', 'DICT']:
  154. fh.write('%s: %s\n' % (k,v))
  155.  
  156. ## write the text last
  157. fh.write("\n\n".join(paragraphs) + "\n")
  158.  
  159. fh.close()
  160.  
  161. print('Wrote %s' % doc_id)
  162.  
  163. if args.sep == 'csv':
  164. outfile.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement