Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re

# Maps each speaker name to the list of words attributed to that speaker.
# Populated by main() when the file is run as a script.
speaker_words = {}

# A line that opens a new speaker turn: word characters, a colon, then the
# (possibly empty) spoken text.
speaker_pattern = re.compile(r'^(\w+?):(.*)$')


def collect_speaker_words(lines):
    """Group the words of a transcript by the speaker who said them.

    A line of the form ``Name: text`` switches the current speaker to
    ``Name``; the text after the colon, and every following line that does
    not itself match that form, is attributed to that speaker.  Lines seen
    before the first speaker line are ignored.

    Note that *any* line starting with word characters followed by a colon
    is treated as a new speaker turn.

    Args:
        lines: An iterable of strings, one transcript line each (an open
            text file works).

    Returns:
        A dict mapping each speaker name to the list of words attributed
        to that speaker, in the order they were encountered.
    """
    words_by_speaker = {}
    current_speaker = None
    for raw_line in lines:
        text = raw_line.strip()
        match = speaker_pattern.match(text)
        if match is not None:
            current_speaker = match.group(1)
            text = match.group(2).strip()
            # Register the speaker even if nothing follows the colon.
            words_by_speaker.setdefault(current_speaker, [])
        if current_speaker is None:
            # No speaker has been introduced yet; nothing to attribute to.
            continue
        # Split on single spaces only and drop empty pieces, so runs of
        # spaces do not produce empty "words".
        # You may want to do some sort of punctuation filtering too.
        words_by_speaker[current_speaker].extend(
            piece.strip() for piece in text.split(' ') if piece.strip()
        )
    return words_by_speaker


def main():
    """Read ``transcript.txt`` and print the per-speaker word lists."""
    with open("transcript.txt", "r") as f:
        speaker_words.update(collect_speaker_words(f))
    # print() with a single argument behaves the same on Python 2 and 3.
    print(speaker_words)


if __name__ == "__main__":
    main()
Add Comment
Please, Sign In to add comment