Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- from datetime import datetime
def crawlMetaDescription(url, index):
    """Fetch *url*, extract its ``<meta name="description">`` content and
    append one semicolon-separated line (``index;;url;length;description``)
    to the module-level ``file`` handle.

    NOTE(review): depends on the global ``file`` object opened by the main
    script's ``with open(...) as file`` block — confirm every call happens
    inside that block.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    metaDescription = ""
    for meta in soup.find_all("meta"):
        if meta.get("name") == "description":
            # BUGFIX: a description tag without a "content" attribute used to
            # set this to None, crashing len() and the concatenation below.
            metaDescription = meta.get("content") or ""

    # Column layout matches the header written by the main script:
    # ;Sitemap;Adresse;Laenge;MetaDescription
    csvLine = str(index) + ";;" + url + ";" + str(len(metaDescription)) + ";" + metaDescription
    file.write(csvLine)
    file.write("\n")
# Crawl every URL listed in the blog's sitemap index and record each page's
# meta description length into crawl_new.csv.
# BUGFIX: explicit encoding="utf-8" — the platform-default codepage (e.g.
# cp1252 on Windows) raised UnicodeEncodeError for non-ASCII descriptions.
with open('crawl_new.csv', "w", encoding="utf-8") as file:
    # Header matching the columns produced by crawlMetaDescription.
    file.write(";Sitemap;Adresse;Laenge;MetaDescription")
    file.write("\n")

    # The sitemap index lists the individual sitemap files.
    adresse = "https://draeger-it.blog/sitemap_index.xml"
    r = requests.get(adresse)
    soup = BeautifulSoup(r.content, "html.parser")

    index = 0
    for loc in soup.find_all("loc"):
        sitemapUrl = loc.text
        print(sitemapUrl)
        file.write(";" + sitemapUrl + ";;;")
        file.write("\n")

        # Fetch the individual sitemap. Use distinct names so the outer
        # ``soup``/``loc`` bindings are not shadowed (the original rebound
        # them mid-loop, which only worked because find_all() had already
        # materialized its result list).
        sitemapResponse = requests.get(sitemapUrl)
        sitemapSoup = BeautifulSoup(sitemapResponse.content, "html.parser")
        locs = sitemapSoup.find_all("loc")
        print("Gefundene URLs:" + str(len(locs)))

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print("Start - Crawling (" + current_time + ") " + sitemapUrl)

        for pageLoc in locs:
            index = index + 1
            url = pageLoc.text
            print(url)
            crawlMetaDescription(url, index)

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print("Ende - Crawling (" + current_time + ") " + sitemapUrl)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement