Guest User

Untitled

a guest
Feb 25th, 2013
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.96 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. import profile
  4. import xml.etree.ElementTree as ET
  5. from lxml import etree
  6. import re
  7. import gc
  8. import codecs
  9. import operator
  10.  
  11. llist = set()
  12.  
  13. wdict = dict()
  14.  
  15.  
  16. def main():
  17.     f = codecs.open('mygraph3.csv', 'w', 'utf-8')
  18.     appends = -1
  19.  
  20.     f.flush()
  21.     lnum = 0
  22.  
  23.     for line in codecs.open('mygraph2.csv','r', 'utf-8', 'strict', 20000000):
  24.         lnum += 1
  25.         if (lnum % 1000000) == 0: print 'Line number: %i mln'%(lnum/1000000)
  26.         words = line.split('\t')
  27.         words0 = words[0].strip().lower()
  28.         words1 = words[1].strip().lower()
  29.  
  30.  
  31.         if words0 in wdict:
  32.             wdict[words0] += 1
  33.         else:
  34.             wdict[words0] = 1
  35.            
  36.         if words1 in wdict:
  37.             wdict[words1] += 1
  38.         else:
  39.             wdict[words1] = 1
  40.            
  41.  
  42.     swd = sorted(wdict.iteritems(), key=operator.itemgetter(1))
  43.  
  44.     ind = 0
  45.  
  46.     for elem in swd:
  47.         f.write(str(ind))
  48.         f.write('\t')
  49.         f.write(elem[0])
  50.         f.write('\t')
  51.         f.write(str(elem[1]))
  52.         f.write('\n')
  53.         ind+=1
  54.  
  55.     f.close()
  56.    
  57.    
  58. main()
Advertisement
Add Comment
Please, Sign In to add comment