Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import copy
from collections import Counter

import IPython
import wikipediaapi
from chord import Chord
# Shared client used for every Wikipedia lookup in this script.
# NOTE(review): 'en' is passed as the first positional argument; whether that
# slot is the language code depends on the installed wikipediaapi release --
# confirm against the version in use.
wiki_wiki = wikipediaapi.Wikipedia('en')

# Category name -> number of collected pages carrying that category.
categoryDict = dict()

# Every `page` object gathered from the starting article's links.
pageList = list()

# A category is ignored when its name contains any of these substrings.
blacklist = [
    "mdy",
    "dmy",
    "article",
    "Article",
    "Page",
    "page",
    "Wiki",
    "CS1",
    "AC with",
]
class page:
    """A titled article together with the list of categories recorded for it."""

    def __init__(self, title):
        """Store *title* and start with an empty category list."""
        self.title = title
        self.cats = []

    def addCat(self, newCat):
        """Append *newCat* to the end of this page's category list."""
        self.cats.append(newCat)

    def setCats(self, newCats):
        """Replace the category list with a deep copy of *newCats*.

        Copying means later changes to the caller's object do not affect
        this page.
        """
        duplicate = copy.deepcopy(newCats)
        self.cats = duplicate

    def getCats(self):
        """Return the stored category list itself (not a copy)."""
        return self.cats

    def getTitle(self):
        """Return the title given at construction time."""
        return self.title
def getCategories(pages):
    """Look up and attach categories for every page object in *pages*.

    Each page's title is printed as it is processed. The title is looked up
    through the module-level ``wiki_wiki`` client, and every category name
    that contains none of the ``blacklist`` substrings is added to the page
    with ``addCat``. Nothing is returned; the pages are modified in place.
    """
    for entry in pages:
        title = entry.getTitle()
        print(title)
        for category in wiki_wiki.page(title).categories:
            # Skip categories whose name contains a blacklisted substring.
            if any(term in category for term in blacklist):
                continue
            entry.addCat(category)
# Seed the page list from the links of the starting article. A link is kept
# only when its title contains no colon and neither spelling of "article".
vitalLinks = wiki_wiki.page("Wikipedia:Vital Articles").links
pageList.extend(
    page(linkTitle)
    for linkTitle in vitalLinks
    if not any(marker in linkTitle for marker in (":", "article", "Article"))
)

# Fetch the categories of every collected page (one lookup per page).
getCategories(pageList)
print("Done!")
# A category must be attached to at least this many collected pages to be
# kept for the co-occurrence matrix.
MIN_PAGES_PER_CATEGORY = 4

# Count how many pages carry each category. Counter keeps first-seen order,
# so the row/column order derived from it below matches a manual tally.
categoryDict = Counter(cat for p in pageList for cat in p.getCats())

# Keep only the sufficiently common categories, as a plain dict.
categoryDict = {
    name: count
    for name, count in categoryDict.items()
    if count >= MIN_PAGES_PER_CATEGORY
}

numCats = len(categoryDict)
print(numCats)

# catNames fixes the index of each category in the matrix rows and columns.
catNames = list(categoryDict)

# numCats x numCats matrix of zeros; every row is its own list object.
categoryOccurences = [[0] * numCats for _ in range(numCats)]
def getCategoryCrosses(cat1, cat2):
    """Record how many pages carry both *cat1* and *cat2*.

    Scans the module-level ``pageList`` and adds the number of pages whose
    category list contains both names to ``categoryOccurences`` at
    [cat1][cat2] and, symmetrically, at [cat2][cat1]. When both arguments
    name the same category the single diagonal cell is updated once.

    Call this once per unordered pair; calling it for both (a, b) and
    (b, a) counts the pair twice.

    Raises:
        ValueError: if at least one page carries both categories but one of
            them is missing from ``catNames``.
    """
    shared = 0
    for p in pageList:
        cats = p.getCats()
        if cat1 in cats and cat2 in cats:
            shared += 1
    if not shared:
        return

    # Resolve the matrix indices once per call instead of once per page.
    row = catNames.index(cat1)
    col = catNames.index(cat2)
    categoryOccurences[row][col] += shared
    if row != col:
        # Mirror the count; skipped on the diagonal so it is not doubled.
        categoryOccurences[col][row] += shared


# Visit each unordered pair of categories exactly once (including a category
# paired with itself). Iterating over every ordered pair would double every
# cell, because getCategoryCrosses already fills both symmetric positions.
for i, x in enumerate(catNames):
    print(x, i, len(catNames))
    for y in catNames[i:]:
        getCategoryCrosses(x, y)
# Print the co-occurrence matrix, one row per output line.
for matrixRow in categoryOccurences:
    print(matrixRow)

# Build the chord diagram from the matrix and its category labels, then
# display it.
chordDiagram = Chord(categoryOccurences, catNames)
chordDiagram.show()
Add Comment
Please, Sign In to add comment