# Untitled

import os
from collections import defaultdict
from math import log, log10

from os.path import isfile, join

import psycopg2


def main():
    """Compute the weight of every word in every article and store the results.

    Counts the article files, builds the word -> number-of-files map, then for
    each word calculates its per-file weights and hands them to ``saveOnDB``.
    """
    totalFiles = calculateFilesAmount()
    documentFrequencies = createWordToNumberOfFilesWhereItOccursMap()

    # One weight calculation and one database save per distinct word.
    for term, filesContainingTerm in documentFrequencies.items():
        weightsByFile = calculateWeights(term, filesContainingTerm, totalFiles)
        saveOnDB(term, weightsByFile)


def saveOnDB(word, fileToWeightMap):
    """Insert the weights of ``word`` into the ``weights`` database table.

    One row ``(word, weight, filename)`` is inserted for every entry of
    ``fileToWeightMap``. Errors are printed rather than raised, so a failure
    for one word does not stop the caller from processing the next one.

    Args:
        word: The word the weights belong to.
        fileToWeightMap: Mapping of file name -> weight of ``word`` in that file.

    Returns:
        None.
    """
    if not fileToWeightMap:
        # Nothing to insert, so do not open a database connection at all.
        return

    dbConnection = None
    try:
        dbConnection = psycopg2.connect(host="localhost", database="pjn", user="pjn", password="pjn")
        # The cursor is closed automatically when the ``with`` block exits.
        with dbConnection.cursor() as cur:
            for fileName, weight in fileToWeightMap.items():
                cur.execute(
                    "INSERT INTO weights (word, weight, filename) VALUES (%s,%s,%s)",
                    [word, weight, fileName],
                )
        # Commit once for the whole word instead of once per row: the rows of
        # a word are stored together or not at all.
        dbConnection.commit()
    except Exception as error:
        # Best-effort: report the problem and carry on (psycopg2 errors are
        # subclasses of Exception, so they are covered here as well).
        print(error)
    finally:
        if dbConnection is not None:
            dbConnection.close()


def calculateWeights(wordToCalculate: str, numberOfFilesWhereWordExists: int, allFilesAmount: int,
                     articlesPath: str = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\') -> defaultdict:
    """Calculate the TF-IDF weight of a word in every article file.

    The weight of the word in a file is
    ``(occurrences in file / words in file) * log10(allFilesAmount / numberOfFilesWhereWordExists)``.
    Words of a file are its whitespace-separated tokens, lower-cased before
    being compared with ``wordToCalculate``.

    Args:
        wordToCalculate: The word to weigh; compared against lower-cased tokens.
        numberOfFilesWhereWordExists: Number of files that contain the word.
        allFilesAmount: Total number of article files.
        articlesPath: Directory holding the article files. Defaults to the
            directory this script was written for.

    Returns:
        A mapping of file name -> weight containing only the files in which
        the weight is greater than zero. It is empty when either count is not
        positive or when the inverse document frequency is not positive.
    """
    fileToWeightMap = defaultdict()

    # Guard against division by zero and log10 of a non-positive number.
    if numberOfFilesWhereWordExists <= 0 or allFilesAmount <= 0:
        return fileToWeightMap

    idf = log10(allFilesAmount / numberOfFilesWhereWordExists)
    if idf <= 0:
        # Every weight would be zero, so there is no need to read any file.
        return fileToWeightMap

    for fileName in os.listdir(articlesPath):
        # join() works whether or not articlesPath ends with a separator.
        filePath = join(articlesPath, fileName)
        if not isfile(filePath):
            continue

        with open(filePath, 'r', encoding="utf8") as file:
            words = file.read().split()

        if not words:
            continue

        givenWordOccurrences = sum(1 for word in words if word.lower() == wordToCalculate)
        weight = givenWordOccurrences / len(words) * idf
        if weight > 0:
            fileToWeightMap[fileName] = weight

    return fileToWeightMap


def calculateFilesAmount(dirPath: str = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\') -> int:
    """Count the regular files located directly in a directory.

    Args:
        dirPath: Directory to inspect. Defaults to the articles directory
            this script was written for.

    Returns:
        The number of directory entries that are files; sub-directories are
        not counted.
    """
    # os.path.join works whether or not dirPath ends with a separator.
    return sum(1 for name in os.listdir(dirPath) if os.path.isfile(os.path.join(dirPath, name)))


def createWordToNumberOfFilesWhereItOccursMap(
        articlesPath: str = 'D:\\Programowanie\\PJN\\PJNIndexerAndBrowserBackend\\articles\\') -> defaultdict:
    """Count, for every word, the number of article files it occurs in.

    Words are the whitespace-separated tokens of each file, lower-cased. A
    word is counted at most once per file, however often it appears there.

    Args:
        articlesPath: Directory holding the article files. Defaults to the
            directory this script was written for.

    Returns:
        A mapping of lower-cased word -> number of files containing it. The
        mapping is created without a default factory, so looking up a word
        that occurs in no file raises ``KeyError``.
    """
    wordToNumberOfFilesWhereItOccursMap = defaultdict()

    for fileName in os.listdir(articlesPath):
        # join() works whether or not articlesPath ends with a separator.
        filePath = join(articlesPath, fileName)
        if not isfile(filePath):
            continue

        with open(filePath, 'r', encoding="utf8") as file:
            # The set removes repeated occurrences within the same file.
            distinctWordsSet = {word.lower() for word in file.read().split()}

        for word in distinctWordsSet:
            wordToNumberOfFilesWhereItOccursMap[word] = wordToNumberOfFilesWhereItOccursMap.get(word, 0) + 1

    return wordToNumberOfFilesWhereItOccursMap


# Run the indexing only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()