Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on May 22nd, 2012  |  syntax: None  |  size: 3.60 KB  |  hits: 7  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. import re, math
  2.  
  3. # Debug flag: True=print intermediate data structures
  4. DEBUG = True
  5.  
  6. # hardcoded (for simplicity) tuple of documents to index
  7. documents = ('D1.txt','D2.txt','D3.txt')
  8.  
  9. index={}    # dictionary key: (document, term), value: frequencies/weights
  10. maxterm={}  # dictionary key: document, value: maximum number of time any term appears
  11. docfreq={}  # dictionary key: term, value: number of documents term appears in
  12. vocab=[]
  13. vocabSet = set()
  14.  
  15. def add_Term_Weights(the_index):
  16.         for ( document, term ) in sorted( index.keys() ):
  17.                 #tf=(termCountInADocument/largestTermCountInADocument)
  18.                 largest_term_count_in_a_document = maxterm[max(maxterm, key=maxterm.get)]
  19.                 term_frequency = float( float(index[ ( document, term ) ]) / float(largest_term_count_in_a_document) )
  20.                 if DEBUG:
  21.                         print 'TF====doc:%s, term:%s, docfreq:%s, largestTermCountInADoc:%d, termFreq:%f' % \
  22.                         (document, term, index[ ( document, term ) ],largest_term_count_in_a_document, term_frequency)
  23.        
  24.                 #idf=log base 2[(totalNumberofDocs)/(numberOfDocumentsContainingTerm)]
  25.                 inverse_document_frequency = math.log( float(len(documents)) / float(docfreq[term]), 2 )
  26.                 if DEBUG:
  27.                         print 'IDF====total#ofDocs:%d, term:%s, docfreq:%s, inverseDocumentFreq:%f' % \
  28.                         (len(documents), term, index[ ( document, term ) ],inverse_document_frequency)
  29.  
  30.                 term_weight = round((term_frequency * inverse_document_frequency), 2)
  31.                 if DEBUG:
  32.                         print 'term:%s ---- TERM_WEIGHT=%f' % \
  33.                         (term, term_weight)
  34.                         print '\n'
  35.                 the_index[document,term] = term_weight
  36.        
  37. def docFreqCount():
  38.     global vocabSet
  39.     numberOfDocsContainingTerm = 0
  40.     for doc in documents:
  41.         for term in vocabSet:
  42.             if( index.has_key( (doc, term) ) ):
  43.                 if not docfreq.has_key( term.lower() ):
  44.                     docfreq[term.lower()] = 1
  45.                 else:
  46.                     print docfreq[term]
  47.                     docfreq[term.lower()] += 1
  48.  
  49.  
  50. for document in documents:
  51.     if not maxterm.has_key( ( document) ):
  52.         maxterm[ document ] = 0
  53.     for line in open( document, 'r'):
  54.         terms = re.split('\W+', line)
  55.         for term in terms:
  56.             if term != '':      # Ignore null results from split at start/end of line
  57.                 vocab.append(term.lower())
  58.                 if not index.has_key( ( document, term.lower() ) ):
  59.                     index[ ( document, term.lower() ) ] = 0
  60.                 index[ ( document, term.lower() ) ] += 1
  61.                 # check if this is the highest maxterm found in document
  62.                 if ( index[ ( document, term.lower() ) ] > maxterm[ document ]):
  63.                     maxterm[ document ] = index[ ( document, term.lower() ) ]
  64.                
  65.                 #if index.has_key( document, term.lower() ):
  66.                 '''    
  67.                 if not docfreq.has_key( term.lower() ):
  68.                     docfreq[term.lower()] = 1
  69.                 else:
  70.                     print docfreq[term]
  71.                     docfreq[term.lower()] += 1
  72.                 '''
  73.  
  74.                    
  75. vocabSet = set(vocab)
  76. print vocabSet
  77. docFreqCount()
  78. print docfreq
  79.                    
  80. if DEBUG:
  81.     print '\nRaw term count:'
  82.     for ( document, term ) in sorted( index.keys() ):
  83.         print document, term, index[ ( document, term ) ]
  84.                    
  85.                    
  86. add_Term_Weights(index)
  87.  
  88. if DEBUG:
  89.     print '\nMaxterm Dict:'
  90.     print maxterm
  91.     print '\nDocFreq Dict:'
  92.     print docfreq
  93.     print max(maxterm)
  94.     #get maxterm in all docs
  95.     print maxterm[max(maxterm, key=maxterm.get)]
  96.     print '\nTerm Weights:\n'
  97.     for ( document, term ) in sorted( index.keys() ):
  98.         print document, term, index[ ( document, term ) ]