#CleaningText

#The code below is for a text file containing just one item. I am not sure how to tweak this to make it run for listfile.txt (which contains raw data from multiple articles)

#Read the whole raw article file into memory as one string
with open('listfilereduced.txt', 'r', encoding='utf8') as my_file:
    rawData = my_file.read()
#Printing after the with-block means the file is already closed
print(rawData)

#Separating body text from other data

#Everything before the first story-element div is treated as header data
#(title, date, ...); everything from that div onwards is the article body.
articleStart = rawData.find("<div class=\"story-element story-element-text\">")
if articleStart == -1:
    #str.find() returns -1 when the marker is absent. Used as a slice bound,
    #-1 would cut off the last character of the header and make that single
    #character the "body", so treat the whole file as header data instead.
    articleStart = len(rawData)
articleData = rawData[:articleStart]
articleBody = rawData[articleStart:]
print(articleData)
print("*******")
print(articleBody)
print("*******")

#First, I define a function to strip tags from the body text

def stripTags(pageContents: str) -> str:
    """Return pageContents with everything between '<' and '>' removed.

    The text is scanned one character at a time. A '<' opens a tag and the
    next '>' closes it; the brackets and all characters between them are
    dropped. Characters outside a tag are kept unchanged, including a '>'
    that appears while no tag is open. If a tag is opened and never closed,
    the rest of the input is dropped.
    """
    insideTag = False
    # Collect kept characters in a list and join once at the end, instead of
    # growing a string with += for every character.
    keptChars = []

    for char in pageContents:
        if char == '<':
            insideTag = True
        elif insideTag:
            # Still inside a tag: skip the character, and leave the tag
            # when its closing bracket is reached.
            if char == '>':
                insideTag = False
        else:
            keptChars.append(char)
    return ''.join(keptChars)

#Calling the function
articleBodyText = stripTags(articleBody)
print(articleBodyText)

##Isolating article title and publication date

#str.find() returns -1 when a marker is absent, and -1 used as a slice bound
#silently selects the wrong text, so every lookup below is checked.

#The title is taken to be everything in articleData before the closing </h1>
TitleEndLoc = articleData.find("</h1>")
if TitleEndLoc == -1:
    #No heading found: leave the title empty instead of using nearly the
    #whole header as the title
    TitleEndLoc = 0
titleString = articleData[:TitleEndLoc]

#The date is taken from the publish-time div up to the meta-data-icons div
dateStartLoc = articleData.find("<div class=\"storyPageMetaData-m__publish-time__19bdV\">")
if dateStartLoc == -1:
    #No publish-time div: produce an empty date string
    dateStartLoc = dateEndLoc = len(articleData)
else:
    #Search for the end marker only after the start marker, so an earlier
    #occurrence cannot produce an empty or reversed slice
    dateEndLoc = articleData.find("<div class=\"meta-data-icons storyPageMetaDataIcons-m__icons__3E4Xg\">", dateStartLoc)
    if dateEndLoc == -1:
        dateEndLoc = len(articleData)
dateString = articleData[dateStartLoc:dateEndLoc]


##Call stripTags to clean
articleTitle= stripTags(titleString)
articleDate = stripTags(dateString)

print(articleTitle)
print(articleDate)

#Cleaning the date a bit more: keep the text between the first ":" and the
#first "," that follows it
startLocDate = articleDate.find(":")
#If there is no ":", startLocDate is -1 and startLocDate+1 is 0, so both the
#search and the slice simply start at the beginning of the string
endLocDate = articleDate.find(",", startLocDate+1)
if endLocDate == -1:
    #No "," after the ":": keep everything up to the end
    endLocDate = len(articleDate)
#strip() removes the space after the ":" without assuming exactly one exists
articleDateClean = articleDate[startLocDate+1:endLocDate].strip()
print(articleDateClean)

#save all this data to a dictionary that saves the title, data and the body text
PAloTextDict = {"Title": articleTitle, "Date": articleDateClean, "Text": articleBodyText}
print(PAloTextDict)

#Normalize text by:

#1. Breaking the body text into a list of whitespace-separated words

articleBodyWordList = articleBodyText.split()
print(articleBodyWordList)

#2.Removing punctuation and stopwords

from bnlp.corpus import stopwords, punctuations

#A. Punctuation goes first: delete every punctuation mark from every word

def _dropPunctuation(token):
    #Return token with each entry of punctuations replaced by nothing
    for symbol in punctuations:
        token = token.replace(symbol, '')
    return token

#One cleaned entry per original word, in the same order; a word that ends up
#empty after cleaning is still kept in the list
listNoPunct = [_dropPunctuation(token) for token in articleBodyWordList]
print(listNoPunct)


#B. removing stopwords
banglastopwords = stopwords()
print(banglastopwords)

#Build the lookup set once so each membership test does not rescan the
#whole stopword collection
stopwordLookup = frozenset(banglastopwords)

#Keep the words that are not stopwords. Words that were pure punctuation are
#empty strings after step A; they are dropped here as well so that cleanList
#contains only real tokens.
cleanList = [word for word in listNoPunct if word and word not in stopwordLookup]
print(cleanList)