Advertisement
Guest User

Untitled

a guest
Apr 5th, 2020
253
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.69 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from datetime import datetime
  4.  
  5. def crawlMetaDescription(url, index):    
  6.     req = requests.get(url)
  7.     beautifulSoup = BeautifulSoup(req.content, "html.parser")
  8.        
  9.     metaDescription = ""
  10.     for meta in beautifulSoup.find_all("meta"):
  11.         if meta.get("name", None) == "description":
  12.             metaDescription = meta.get("content", None)
  13.        
  14.     metaDescriptionContentLength = len(metaDescription)
  15.    
  16.     csvLine = str(index)+";;"+url+";"+str(metaDescriptionContentLength)+";"+metaDescription
  17.     file.write(csvLine)
  18.     file.write("\n");
  19.    
  20. with open('crawl_new.csv', "w") as file:
  21.     file.write(";Sitemap;Adresse;Laenge;MetaDescription")
  22.     file.write("\n");
  23.    
  24.     adresse = "https://draeger-it.blog/sitemap_index.xml"
  25.     r = requests.get(adresse)
  26.     soup = BeautifulSoup(r.content, "html.parser")
  27.    
  28.     index = 0
  29.     for loc in soup.find_all("loc"):
  30.         sitemapUrl = loc.text
  31.         print(sitemapUrl)
  32.         file.write(";"+sitemapUrl+";;;")
  33.         file.write("\n");
  34.         r = requests.get(sitemapUrl)
  35.         soup = BeautifulSoup(r.content, "html.parser")
  36.         locs = soup.find_all("loc")
  37.         print("Gefundene URLs:"+str(len(locs)))
  38.         now = datetime.now()
  39.         current_time = now.strftime("%H:%M:%S")
  40.         print("Start - Crawling ("+current_time+") " + sitemapUrl)
  41.         for loc in locs:
  42.             index = index+1
  43.             url = loc.text
  44.             print(url)
  45.             crawlMetaDescription(url, index)
  46.             now = datetime.now()
  47.             current_time = now.strftime("%H:%M:%S")
  48.         print("Ende - Crawling ("+current_time+") " + sitemapUrl)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement