Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- This is a foo bar sentence .
- And this is the first txtfile in the corpus .
- Counter({('i', 's', '</w>'): 2, ('t', 'h', 'e', '</w>'): 2, ('.', '</w>'): 2, ('T', 'h', 'i', 's', '</w>'): 1, ('f', 'i', 'r', 's', 't', '</w>'): 1, ('t', 'x', 't', 'f', 'i', 'l', 'e', '</w>'): 1, ('f', 'o', 'o', '</w>'): 1, ('t', 'h', 'i', 's', '</w>'): 1, ('s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '</w>'): 1, ('A', 'n', 'd', '</w>'): 1, ('b', 'a', 'r', '</w>'): 1, ('c', 'o', 'r', 'p', 'u', 's', '</w>'): 1, ('a', '</w>'): 1, ('i', 'n', '</w>'): 1})
- $ echo -e """This is a foo bar sentence .\nAnd this is the first txtfile in the corpus .""" > test.txt
- $ cat test.txt
- This is a foo bar sentence .
- And this is the first txtfile in the corpus .
- $ python
- >>> from collections import Counter
- >>> open('test.txt').read().split()
- ['This', 'is', 'a', 'foo', 'bar', 'sentence', '.', 'And', 'this', 'is', 'the', 'first', 'txtfile', 'in', 'the', 'corpus', '.']
- >>> Counter(open('test.txt').read().split())
- Counter({'is': 2, '.': 2, 'the': 2, 'a': 1, 'And': 1, 'bar': 1, 'sentence': 1, 'This': 1, 'txtfile': 1, 'this': 1, 'in': 1, 'foo': 1, 'corpus': 1, 'first': 1})
- >>> Counter(map(lambda x: tuple(list(x)+['</w>']), open('test.txt').read().split()))
- Counter({('i', 's', '</w>'): 2, ('t', 'h', 'e', '</w>'): 2, ('.', '</w>'): 2, ('T', 'h', 'i', 's', '</w>'): 1, ('f', 'i', 'r', 's', 't', '</w>'): 1, ('t', 'x', 't', 'f', 'i', 'l', 'e', '</w>'): 1, ('f', 'o', 'o', '</w>'): 1, ('t', 'h', 'i', 's', '</w>'): 1, ('s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '</w>'): 1, ('A', 'n', 'd', '</w>'): 1, ('b', 'a', 'r', '</w>'): 1, ('c', 'o', 'r', 'p', 'u', 's', '</w>'): 1, ('a', '</w>'): 1, ('i', 'n', '</w>'): 1})
- >>> x = Counter()
- >>> for line in open('test.txt'):
- ...     for word in line.split():
- ...         x[word]+=1
- ...
- >>> x = Counter({tuple(list(k)+['</w>']):v for k,v in x.items()})
- >>> x
- Counter({('i', 's', '</w>'): 2, ('t', 'h', 'e', '</w>'): 2, ('.', '</w>'): 2, ('T', 'h', 'i', 's', '</w>'): 1, ('t', 'x', 't', 'f', 'i', 'l', 'e', '</w>'): 1, ('f', 'o', 'o', '</w>'): 1, ('t', 'h', 'i', 's', '</w>'): 1, ('s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '</w>'): 1, ('f', 'i', 'r', 's', 't', '</w>'): 1, ('b', 'a', 'r', '</w>'): 1, ('c', 'o', 'r', 'p', 'u', 's', '</w>'): 1, ('a', '</w>'): 1, ('i', 'n', '</w>'): 1, ('A', 'n', 'd', '</w>'): 1})
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement