Advertisement
Guest User

Untitled

a guest
Feb 5th, 2024
21
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.49 KB | None | 0 0
  1. import requests
  2. from parsel import Selector
  3. import pandas as pd
  4. from tqdm import tqdm
  5. from concurrent.futures import ThreadPoolExecutor
  6. import os
  7. import http.cookiejar
  8. from itertools import repeat
  9. import csv
  10. import re
  11. from datetime import datetime
  12. import time
  13.  
  14. # Set user agent
  15. headers = {
  16. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
  17. }
  18.  
  19. # Function to extract information from URL
  20. def extract_info(url, cookies):
  21. try:
  22. # Use cookies for all requests
  23. response = requests.get(url, headers=headers, cookies=cookies)
  24.  
  25. if response.status_code == 200:
  26. selector = Selector(response.text)
  27.  
  28. # Extract upload date directly from JSON-LD script
  29. upload_date = selector.xpath('//script[@type="application/ld+json"]/text()').re_first(r'"uploadDate": "([^"]+)"')
  30. duration_str = selector.xpath('//script[@type="application/ld+json"]/text()').re_first(r'"duration": "([^"]+)"')
  31.  
  32. # Extract other information from the page
  33. title = selector.css('h1::text').get()
  34. description = selector.css('#descript-cont::text').get() # Extract description
  35. views_str = selector.css('.video-stat svg.views+span::text').get() # Extract views
  36. comments_str = selector.css('.comments-label .num::text').get()
  37. channel_name = selector.css('.channel-name span::text').get()
  38. channel_path = selector.css('.channel-name::attr(href)').get()
  39. channel_url = f"https://www.vbox7.com{channel_path}" if channel_path else None
  40.  
  41. # Preprocess thumbnail URL to calculate it
  42. video_id = url.split(':')[-1] # Extract video ID from URL
  43. thumbnail = f"https://i49.vbox7.com/o/{video_id[:3]}/{video_id}0.jpg" # Calculate thumbnail URL
  44.  
  45. # Strip white lines and whitespace characters
  46. description = ' '.join(description.strip().split()) if description else None
  47. channel_name = channel_name.strip() if channel_name else None
  48.  
  49. # Remove whitespace characters and convert views and comments to integers
  50. views = int(views_str.replace(' ', '')) if views_str else None
  51. comments = int(comments_str.replace(' ', '')) if comments_str else None
  52.  
  53. # Return data as tuple
  54. return (url, thumbnail, title, description, duration_str, upload_date, views, comments, channel_name, channel_url)
  55. else:
  56. return None
  57. except requests.exceptions.RequestException as e:
  58. print(f"Error occurred: {e}")
  59. return None
  60.  
  61. # Main function
  62. def main():
  63. # Prompt user for the path to the TXT file
  64. txt_file_path = input("Please enter the path to the TXT file: ")
  65.  
  66. # Prompt user for chunk size and calculate the number of ranges
  67. chunk_size = int(input("Enter the chunk size: "))
  68. total_lines = sum(1 for line in open(txt_file_path))
  69. num_ranges = total_lines // chunk_size
  70. print(f"Total lines: {total_lines}, Chunk size: {chunk_size}, Number of ranges: {num_ranges}")
  71.  
  72. # Prompt user to select a specific range
  73. range_selection = int(input(f"Enter the range number (1 - {num_ranges}): "))
  74. start_index = (range_selection - 1) * chunk_size
  75. end_index = min(range_selection * chunk_size, total_lines)
  76. print(f"Selected range: {range_selection}, Start index: {start_index}, End index: {end_index}")
  77.  
  78. # Output CSV file name based on input TXT file name and selected range
  79. output_file = os.path.splitext(txt_file_path)[0] + f'_range{range_selection}_videos.csv'
  80.  
  81. # Check if cookies.txt exists in the current folder
  82. if os.path.exists("cookies.txt"):
  83. # Load the cookies from the file using http.cookiejar
  84. cookie_jar = http.cookiejar.MozillaCookieJar("cookies.txt")
  85. cookie_jar.load(ignore_discard=True, ignore_expires=True)
  86. cookies = requests.utils.dict_from_cookiejar(cookie_jar)
  87. else:
  88. # No cookies found, use an empty cookie jar
  89. cookies = requests.cookies.RequestsCookieJar()
  90.  
  91. # Write header row to CSV
  92. header_row = ['url', 'thumbnail', 'title', 'description', 'duration', 'upload_date', 'views', 'comments', 'channel_name', 'channel_url']
  93. pd.DataFrame([header_row]).to_csv(output_file, index=False, header=False)
  94.  
  95. # Read IDs from TXT file in the selected range and prepend URL prefix
  96. with open(txt_file_path, 'r') as file:
  97. ids = [f"https://www.vbox7.com/play:{line.strip()}" for line in file.readlines()[start_index:end_index]]
  98.  
  99. # Initialize progress bar
  100. with tqdm(total=len(ids), desc="Processing IDs") as pbar:
  101. # Use ThreadPoolExecutor for multithreading
  102. with ThreadPoolExecutor(max_workers=5) as executor:
  103. # Extract information for each ID
  104. for data in executor.map(extract_info, ids, repeat(cookies)):
  105. try:
  106. if data is not None: # Check if data is not None
  107. # Write data to CSV using csv module to ensure quoting
  108. with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
  109. csvwriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
  110. csvwriter.writerow(data)
  111. else:
  112. print("Error: Data is None")
  113. # Log the last unprocessed ID
  114. last_unprocessed_id = ids[pbar.n]
  115. print(f"Last unprocessed ID: {last_unprocessed_id}")
  116.  
  117. # Set a 1-minute timer before continuing
  118. print("Waiting for 1 minute before continuing...")
  119. time.sleep(60)
  120.  
  121. # Continue to the next iteration
  122. continue
  123.  
  124. except requests.exceptions.SSLError as e:
  125. print(f"SSLError occurred: {e}")
  126. # Log the last unprocessed ID
  127. last_unprocessed_id = ids[pbar.n]
  128. print(f"Last unprocessed ID: {last_unprocessed_id}")
  129.  
  130. # Set a 1-minute timer before continuing
  131. print("Waiting for 1 minute before continuing...")
  132. time.sleep(60)
  133.  
  134. # Continue to the next iteration
  135. continue
  136.  
  137. # Update progress bar
  138. pbar.update(1)
  139.  
  140. if __name__ == "__main__":
  141. main()
  142.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement