Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import profile
- import xml.etree.ElementTree as ET
- from lxml import etree
- import re
- import gc
- import codecs
- import operator
# Module-level scratch containers.
# `llist` is not referenced anywhere else in the visible script.
llist = set()
# Token -> occurrence count; filled in by main().
wdict = {}
def main():
    """Tally token frequencies from ``mygraph2.csv`` into ``mygraph3.csv``.

    Input: a UTF-8, tab-separated file read from the current directory.
    Only the first two fields of every line are used; each is stripped of
    surrounding whitespace and lower-cased before being counted.  Lines
    with fewer than two tab-separated fields (blank lines, for example)
    are skipped instead of aborting the run with ``IndexError``.

    Output: a UTF-8 file with one ``index<TAB>token<TAB>count`` line per
    distinct token, ordered by ascending count.  ``index`` is the
    zero-based position in that ordering.

    Side effects: prints a progress line every million input lines and
    rebinds the module-level ``wdict`` to the freshly computed tally, so
    code reading ``wdict`` after the call sees the counts.  Every call
    starts from an empty tally, so repeated calls do not double-count.
    """
    global wdict

    counts = {}
    # The input is opened (and fully read) before the output is created,
    # so a missing or unreadable input no longer leaves a truncated
    # output file behind.  The large read buffer is kept as in the
    # original call.
    with codecs.open('mygraph2.csv', 'r', 'utf-8', 'strict', 20000000) as src:
        for lnum, line in enumerate(src, 1):
            if lnum % 1000000 == 0:
                # Parenthesised single argument plus floor division keep
                # this line valid on both Python 2 and Python 3.
                print('Line number: %i mln' % (lnum // 1000000))
            words = line.split('\t')
            if len(words) < 2:
                # Not a two-column record; nothing to count.
                continue
            for raw in (words[0], words[1]):
                token = raw.strip().lower()
                counts[token] = counts.get(token, 0) + 1

    # Publish the tally under the historical global name.
    wdict = counts

    # sorted() is stable, so tokens with equal counts keep the order in
    # which the dict yields them.
    swd = sorted(counts.items(), key=operator.itemgetter(1))
    with codecs.open('mygraph3.csv', 'w', 'utf-8') as out:
        for ind, (token, count) in enumerate(swd):
            out.write(u'%d\t%s\t%d\n' % (ind, token, count))
# Run the tally only when this file is executed as a script, so that
# importing the module does not trigger the file processing.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment