# Untitled

# -*- coding: utf-8 -*-

import profile
import xml.etree.ElementTree as ET
from lxml import etree
import re
import gc
import codecs
import operator

llist = set()

# Maps each normalised word to the number of times it was seen in the first
# two columns of the input. Kept at module level so the tallies filled in by
# main() remain available afterwards.
wdict = dict()


def _count_words(path, counts):
    """Tally the first two tab-separated fields of every line in *path*.

    Each of the two fields is stripped and lower-cased, then its entry in
    *counts* (a dict, mutated in place) is incremented. Lines that contain
    fewer than two fields -- blank lines, for example -- are skipped rather
    than aborting the run with an IndexError.

    Returns the number of lines that were skipped.
    """
    skipped = 0
    # The explicit 20000000-byte buffer size is kept from the original call.
    with codecs.open(path, 'r', 'utf-8', 'strict', 20000000) as src:
        for lnum, line in enumerate(src, 1):
            if lnum % 1000000 == 0:
                # Floor division keeps the integer result on Python 2 and 3.
                print('Line number: %i mln' % (lnum // 1000000))
            fields = line.split('\t')
            if len(fields) < 2:
                skipped += 1
                continue
            # Only the first two columns are counted; extras are ignored.
            for field in fields[:2]:
                word = field.strip().lower()
                counts[word] = counts.get(word, 0) + 1
    return skipped


def _write_counts(path, counts):
    """Write *counts* to *path* as ``index<TAB>word<TAB>count`` rows.

    Rows are ordered by ascending count; words with equal counts are ordered
    by the word itself so that the assigned indices are reproducible.
    """
    ordered = sorted(counts.items(), key=operator.itemgetter(1, 0))
    with codecs.open(path, 'w', 'utf-8') as out:
        for ind, (word, count) in enumerate(ordered):
            out.write(u'%d\t%s\t%d\n' % (ind, word, count))


def main():
    """Count words in 'mygraph2.csv' and write the ranking to 'mygraph3.csv'.

    Reads the UTF-8 input line by line, accumulates occurrence counts for
    the first two columns into the module-level ``wdict``, and writes one
    row per distinct word to the output file. The output file is opened
    only after the input has been read, so a failure while reading does not
    truncate an existing result.

    Raises whatever ``codecs.open`` / decoding raises, e.g. IOError for a
    missing input file or UnicodeDecodeError for invalid UTF-8.
    """
    skipped = _count_words('mygraph2.csv', wdict)
    if skipped:
        print('Skipped %i malformed line(s)' % skipped)
    _write_counts('mygraph3.csv', wdict)


# Runs unconditionally: there is no __name__ guard, so importing this module
# also triggers the full read/count/write pass.
main()