Advertisement
Guest User

Untitled

a guest
Nov 10th, 2018
124
0
Never
Not a member of Pastebin yet? Sign up; it unlocks many cool features!
Python 3.08 KB | None | 0 0
  1. import os
  2. from collections import defaultdict
  3. from math import log, log10
  4.  
  5. from os.path import isfile, join
  6.  
  7. import psycopg2
  8.  
  9.  
  10. def main():
  11.     allFilesAmount = calculateFilesAmount()
  12.     wordToFilesAmountMap = createWordToNumberOfFilesWhereItOccursMap()
  13.  
  14.     for k in wordToFilesAmountMap.keys():
  15.         fileToWeightMap = calculateWeights(k, wordToFilesAmountMap[k], allFilesAmount)
  16.         saveOnDB(k, fileToWeightMap)
  17.  
  18.  
  19. def saveOnDB(word, fileToWeightMap):
  20.     dbConnection = None
  21.     try:
  22.         dbConnection = psycopg2.connect(host="localhost", database="pjn", user="pjn", password="pjn")
  23.         cur = dbConnection.cursor()
  24.         for k in fileToWeightMap:
  25.             cur.execute("INSERT INTO weights (word, weight, filename) VALUES (%s,%s,%s)", [word, fileToWeightMap[k], k])
  26.             dbConnection.commit()
  27.     except (Exception, psycopg2.DatabaseError) as error:
  28.         print(error)
  29.     finally:
  30.         if dbConnection is not None:
  31.             dbConnection.close()
  32.  
  33.  
  34. def calculateWeights(wordToCalculate: str, numberOfFilesWhereWordExists: int, allFilesAmount: int) -> defaultdict:
  35.     idf = log10(float(allFilesAmount / numberOfFilesWhereWordExists))
  36.     fileToWeightMap = defaultdict()
  37.  
  38.     articlesPath = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\'
  39.     allFiles = [f for f in os.listdir(articlesPath) if isfile(join(articlesPath, f))]
  40.     for fileName in allFiles:
  41.         allWords = 0
  42.         weight = 0
  43.         givenWordOccurrences = 0
  44.  
  45.         with open(articlesPath + fileName, 'r', encoding="utf8") as file:
  46.             for word in file.read().split():
  47.                 lowerWord = word.lower()
  48.                 allWords = allWords + 1
  49.                 if lowerWord == wordToCalculate:
  50.                     givenWordOccurrences = givenWordOccurrences + 1
  51.  
  52.         if idf > 0 and allWords > 0:
  53.             weight = float(givenWordOccurrences / allWords) * idf
  54.  
  55.         if weight > 0:
  56.             fileToWeightMap[fileName] = weight
  57.  
  58.     return fileToWeightMap
  59.  
  60.  
  61. def calculateFilesAmount() -> int:
  62.     dirPath = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\'
  63.     return len([name for name in os.listdir(dirPath) if os.path.isfile(dirPath + name)])
  64.  
  65.  
  66. def createWordToNumberOfFilesWhereItOccursMap() -> defaultdict:
  67.     articlesPath = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\'
  68.     allFiles = [f for f in os.listdir(articlesPath) if isfile(join(articlesPath, f))]
  69.     wordToNumberOfFilesWhereItOccursMap = defaultdict()
  70.     for fileName in allFiles:
  71.         distinctWordsSet = set()
  72.         with open(articlesPath + fileName, 'r', encoding="utf8") as file:
  73.             for word in file.read().split():
  74.                 lowerWord = word.lower()
  75.                 if lowerWord not in distinctWordsSet:
  76.                     distinctWordsSet.add(lowerWord)
  77.  
  78.             for word in distinctWordsSet:
  79.                 wordToNumberOfFilesWhereItOccursMap[word] = wordToNumberOfFilesWhereItOccursMap.get(word, 0) + 1
  80.  
  81.     return wordToNumberOfFilesWhereItOccursMap
  82.  
  83.  
  84. if __name__ == "__main__":
  85.     main()
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement