Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- from collections import defaultdict
- from math import log, log10
- from os.path import isfile, join
- import psycopg2
def main():
    """Index the article corpus and persist the weights of every word.

    Counts the article files, builds the map of word -> number of files
    containing it, then, word by word, computes the per-file weights and
    stores them in the database.
    """
    totalFiles = calculateFilesAmount()
    documentFrequencies = createWordToNumberOfFilesWhereItOccursMap()
    for term, filesWithTerm in documentFrequencies.items():
        weightsByFile = calculateWeights(term, filesWithTerm, totalFiles)
        saveOnDB(term, weightsByFile)
def saveOnDB(word, fileToWeightMap):
    """Insert one ``weights`` row per file for the given word.

    This is a best-effort operation: database errors are printed and
    swallowed so the caller can carry on with the next word. Nothing is
    committed when an error occurs, because the connection is closed
    without a commit.

    Args:
        word: The term the weights belong to.
        fileToWeightMap: Mapping of file name -> weight of ``word`` in that
            file. When empty, the function returns without connecting.
    """
    # Nothing to insert: skip the connection round-trip altogether.
    if not fileToWeightMap:
        return

    rows = [(word, weight, fileName) for fileName, weight in fileToWeightMap.items()]
    dbConnection = None
    try:
        # NOTE(review): connection settings are hard-coded here; consider
        # moving them to configuration.
        dbConnection = psycopg2.connect(host="localhost", database="pjn", user="pjn", password="pjn")
        # The cursor context manager closes the cursor even if an insert fails.
        with dbConnection.cursor() as cur:
            cur.executemany(
                "INSERT INTO weights (word, weight, filename) VALUES (%s,%s,%s)",
                rows,
            )
        dbConnection.commit()
    except psycopg2.Error as error:
        # psycopg2.Error is the base class of every psycopg2 exception.
        print(error)
    finally:
        if dbConnection is not None:
            dbConnection.close()
def calculateWeights(
    wordToCalculate: str,
    numberOfFilesWhereWordExists: int,
    allFilesAmount: int,
    articlesPath: str = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\',
) -> defaultdict:
    """Compute the TF-IDF weight of a word for every article containing it.

    The weight of the word in a file is
    ``(occurrences / total words in file) * log10(allFilesAmount /
    numberOfFilesWhereWordExists)``. Words in the files are lower-cased
    before being compared with ``wordToCalculate``.

    Args:
        wordToCalculate: The (lower-case) word to weigh.
        numberOfFilesWhereWordExists: Number of files that contain the word.
        allFilesAmount: Total number of files in the corpus.
        articlesPath: Directory holding the article files; defaults to the
            original hard-coded location.

    Returns:
        Mapping of file name -> weight, containing only files whose weight
        is strictly positive. Empty when either count is not positive or
        when the inverse document frequency is not positive.
    """
    # Created without a default factory, so it behaves like a plain dict;
    # kept as defaultdict to honour the declared return type.
    fileToWeightMap = defaultdict()

    # Guard against division by zero and log10 of a non-positive number.
    if numberOfFilesWhereWordExists <= 0 or allFilesAmount <= 0:
        return fileToWeightMap

    idf = log10(allFilesAmount / numberOfFilesWhereWordExists)
    # With a non-positive idf no weight can be positive, so skip reading
    # the corpus entirely.
    if idf <= 0:
        return fileToWeightMap

    for fileName in os.listdir(articlesPath):
        filePath = join(articlesPath, fileName)
        if not isfile(filePath):
            continue
        with open(filePath, 'r', encoding="utf8") as file:
            words = file.read().split()
        if not words:
            continue
        givenWordOccurrences = sum(1 for word in words if word.lower() == wordToCalculate)
        if givenWordOccurrences:
            fileToWeightMap[fileName] = givenWordOccurrences / len(words) * idf
    return fileToWeightMap
def calculateFilesAmount(
    dirPath: str = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\',
) -> int:
    """Return the number of regular files directly inside ``dirPath``.

    Sub-directories are not counted and are not descended into.

    Args:
        dirPath: Directory to inspect; defaults to the original hard-coded
            articles location.

    Returns:
        The count of directory entries that are regular files.
    """
    # os.path.join works whether or not dirPath ends with a separator,
    # unlike plain string concatenation.
    return sum(
        1
        for name in os.listdir(dirPath)
        if os.path.isfile(os.path.join(dirPath, name))
    )
def createWordToNumberOfFilesWhereItOccursMap(
    articlesPath: str = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\',
) -> defaultdict:
    """Count, for every word, the number of article files that contain it.

    Each file is read as UTF-8 and split on whitespace; words are
    lower-cased, and a word is counted at most once per file.

    Args:
        articlesPath: Directory holding the article files; defaults to the
            original hard-coded location.

    Returns:
        Mapping of lower-cased word -> number of files in which it occurs.
    """
    # Created without a default factory, so it behaves like a plain dict;
    # kept as defaultdict to honour the declared return type.
    wordToNumberOfFilesWhereItOccursMap = defaultdict()
    for fileName in os.listdir(articlesPath):
        filePath = join(articlesPath, fileName)
        if not isfile(filePath):
            continue
        with open(filePath, 'r', encoding="utf8") as file:
            # A set de-duplicates on its own; no membership test is needed.
            distinctWordsSet = {word.lower() for word in file.read().split()}
        for word in distinctWordsSet:
            wordToNumberOfFilesWhereItOccursMap[word] = wordToNumberOfFilesWhereItOccursMap.get(word, 0) + 1
    return wordToNumberOfFilesWhereItOccursMap
# Script entry point: run the indexer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement