Advertisement
Guest User

Untitled

a guest
Jan 24th, 2017
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.92 KB | None | 0 0
  1. #!/usr/bin/python3
  2.  
  3. import sys, os, string
  4. from nltk.tokenize import WordPunctTokenizer
  5.  
  # Script body: convert brat standoff annotations into a token/label column file.

# Label written for tokens that are not covered by any entity annotation.
OUTSIDE_LABEL = 'O'

# Suffix identifying brat annotation files; the document text is expected in
# a sibling file with the same base name and a '.txt' suffix.
ANN_SUFFIX = '.ann'


def parse_text_bound_annotations(ann_lines):
    """Collect the entity spans declared in a brat ``.ann`` file.

    Only lines whose id starts with ``T`` are read. Other annotation kinds
    (relations, attributes, notes, ...) do not have the
    ``ID<TAB>TYPE START END<TAB>TEXT`` layout and are skipped, as are blank
    lines. A discontinuous span (``START END;START END``) contributes one
    entry per fragment.

    Args:
        ann_lines: Iterable of lines from the ``.ann`` file.

    Returns:
        A dict mapping each integer start offset to an
        ``(entity_type, end_offset)`` tuple. When two annotations share a
        start offset, the one appearing later in the file wins.

    Raises:
        ValueError: If a ``T`` line carries offsets that are not two integers.
    """
    entity_spans = {}
    for raw_line in ann_lines:
        fields = raw_line.strip().split('\t')
        if len(fields) < 2 or not fields[0].startswith('T'):
            continue
        entity_type, _, offsets = fields[1].partition(' ')
        for fragment in offsets.split(';'):
            start, end = (int(offset) for offset in fragment.split())
            entity_spans[start] = (entity_type, end)
    return entity_spans


def label_tokens(content, token_spans, entity_spans):
    """Yield ``(token_text, label)`` for every token span of *content*.

    A token that starts exactly at an annotation's start offset opens that
    entity, and following tokens keep its type until one starts at or beyond
    the entity's end offset. Comparing offsets, rather than requiring a token
    to end exactly on the annotation boundary, keeps a label from leaking
    onto the rest of the document when tokenization and annotation
    boundaries do not line up.

    Args:
        content: Full document text.
        token_spans: Iterable of ``(start, end)`` character offsets.
        entity_spans: Mapping as returned by ``parse_text_bound_annotations``.

    Yields:
        ``(content[start:end], label)`` tuples, where the label is the
        entity type or ``OUTSIDE_LABEL``.
    """
    active_type = OUTSIDE_LABEL
    active_end = None
    for start, end in token_spans:
        if active_end is not None and start >= active_end:
            active_type, active_end = OUTSIDE_LABEL, None
        if start in entity_spans:
            active_type, active_end = entity_spans[start]
        yield content[start:end], active_type


def convert_brat_dir(brat_data_dirname, col_file, tokenizer, basename_match=None):
    """Write one ``TOKEN<TAB>LABEL`` line per token of every annotated document.

    Walks *brat_data_dirname* recursively. Each non-empty ``.ann`` file is
    paired with the ``.txt`` file of the same base name in the same
    directory.

    Args:
        brat_data_dirname: Root directory to search for ``.ann`` files.
        col_file: Writable text file object receiving the output lines.
        tokenizer: Object providing ``span_tokenize(text)`` that returns
            ``(start, end)`` offsets.
        basename_match: If given, only the document with exactly this base
            name is converted.
    """
    for dirpath, _dirnames, filenames in os.walk(brat_data_dirname):
        for filename in filenames:
            if not filename.endswith(ANN_SUFFIX):
                continue
            # Strip only the suffix so names containing dots stay intact.
            basename = filename[:-len(ANN_SUFFIX)]
            if basename_match is not None and basename != basename_match:
                continue

            print("===> Processing %s" % basename)

            # Join onto dirpath (not the walk root) so files found in
            # subdirectories resolve to paths that actually exist.
            ann_path = os.path.join(dirpath, filename)
            if os.stat(ann_path).st_size == 0:
                continue
            doc_path = os.path.join(dirpath, '%s.txt' % basename)
            if not os.path.isfile(doc_path):
                print("Skipping %s: no matching .txt file" % ann_path,
                      file=sys.stderr)
                continue

            # brat offsets count characters of the UTF-8 text; newline=''
            # disables newline translation so '\r\n' documents keep the
            # offsets the annotations were made against.
            with open(doc_path, 'r', encoding='utf-8', newline='') as doc_file, \
                    open(ann_path, 'r', encoding='utf-8') as ann_file:
                entity_spans = parse_text_bound_annotations(ann_file)
                content = doc_file.read()

            token_spans = tokenizer.span_tokenize(content)
            for token, label in label_tokens(content, token_spans, entity_spans):
                col_file.write("%s\t%s\n" % (token, label))


def main(argv=None):
    """Run the converter: ``BRATDIR COLFILE [BASENAME]``.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 1 when required arguments are missing,
        otherwise 0.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        print("Usage: %s BRATDIR COLFILE [BASENAME]" % argv[0])
        return 1

    brat_data_dirname = argv[1]
    col_filename = argv[2]
    basename_match = argv[3] if len(argv) > 3 else None

    tokenizer = WordPunctTokenizer()

    with open(col_filename, 'w', encoding='utf-8') as col_file:
        convert_brat_dir(brat_data_dirname, col_file, tokenizer, basename_match)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement