furas

Scrapy filmcompanion.in

May 20th, 2020
#!/usr/bin/env python3

# date: 2020.05.20
# https://stackoverflow.com/questions/61896358/infinite-scrolling-webpage-with-same-url-by-scapy

import json

import scrapy
class InfiniteScrollingSpider(scrapy.Spider):
    name = 'infinite-scrolling-pythonhelp'

    # WordPress REST endpoint which the page's JavaScript POSTs to when you
    # scroll down - every "page" of results uses this same URL
    scrolling_url = 'https://www.filmcompanion.in/wp-json/csco/v1/more-posts'

    # payload copied from the request the page itself sends (visible in the
    # browser's DevTools, Network tab) - the same strings work for every page
    query_vars = "{\"category_name\":\"bollywood-review\",\"error\":\"\",\"m\":\"\",\"p\":0,\"post_parent\":\"\",\"subpost\":\"\",\"subpost_id\":\"\",\"attachment\":\"\",\"attachment_id\":0,\"name\":\"\",\"pagename\":\"\",\"page_id\":0,\"second\":\"\",\"minute\":\"\",\"hour\":\"\",\"day\":0,\"monthnum\":0,\"year\":0,\"w\":0,\"tag\":\"\",\"cat\":7286,\"tag_id\":\"\",\"author\":\"\",\"author_name\":\"\",\"feed\":\"\",\"tb\":\"\",\"paged\":0,\"meta_key\":\"\",\"meta_value\":\"\",\"preview\":\"\",\"s\":\"\",\"sentence\":\"\",\"title\":\"\",\"fields\":\"\",\"menu_order\":\"\",\"embed\":\"\",\"category__in\":[],\"category__not_in\":[],\"category__and\":[],\"post__in\":[],\"post__not_in\":[36896,40871,34019,37264,44457,44397],\"post_name__in\":[],\"tag__in\":[],\"tag__not_in\":[],\"tag__and\":[],\"tag_slug__in\":[],\"tag_slug__and\":[],\"post_parent__in\":[],\"post_parent__not_in\":[],\"author__in\":[],\"author__not_in\":[],\"ignore_sticky_posts\":false,\"suppress_filters\":false,\"cache_results\":true,\"update_post_term_cache\":true,\"lazy_load_term_meta\":true,\"update_post_meta_cache\":true,\"post_type\":\"\",\"posts_per_page\":12,\"nopaging\":false,\"comments_per_page\":\"50\",\"no_found_rows\":false,\"order\":\"DESC\"}"
    query_args = "{\"archive_type\":\"masonry\",\"show_first\":false,\"columns\":2,\"meta_cat\":false,\"meta\":true,\"summary\":true,\"standard_summary\":\"excerpt\",\"more_button\":false,\"reduce_margin\":false,\"orientation\":\"landscape\",\"list_width\":\"6\",\"widgets\":false,\"widgets_sidebar\":\"sidebar-archive\",\"widgets_after\":3,\"widgets_repeat\":false,\"highlight\":\"featured\",\"pagination_type\":\"ajax\",\"infinite_load\":true}"

    def start_requests(self):
        # the endpoint expects form data, so use FormRequest (POST), not Request (GET)
        yield scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                #'action': 'infinite_scroll',  # WRONG - not the action this theme uses
                'action': 'csco_ajax_load_more',
                'page': '1',
                'query_vars': self.query_vars,
                'query_args': self.query_args,
            },
            callback=self.parse_page,
            meta={'page': 1},  # remember which page this request fetched
        )

    def parse_page(self, response):
        next_page = response.meta.get('page') + 1
        print('next_page:', next_page)
        #print(response.text)

        json_data = json.loads(response.text)
        #print(json_data.keys())
        #print('success:', json_data.get('success'))
        #print('data:', json_data.get('data'))

        # stop when the endpoint reports no success or returns no more content
        #if json_data.get('type') != 'success':  # WRONG - there is no 'type' key
        if not json_data.get('success') or not json_data.get('data') or not json_data['data'].get('content'):
            return

        # the HTML with the new articles is in data['content'], not in 'html'
        #articles = scrapy.Selector(text=json_data.get('html')).css('article')  # WRONG
        articles = scrapy.Selector(text=json_data['data']['content']).css('article')
        for article in articles:
            yield {
                # default='' keeps .strip() from failing on a missing title
                'title': article.css('h2.entry-title a ::text').get(default='').strip(),
                # more fields can be yielded the same way, e.g. the link:
                #'url': article.css('h2.entry-title a::attr(href)').get(),
            }

        print('next page >>>')
        # same URL and same payload again - only the value of 'page' changes
        yield scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': 'csco_ajax_load_more',
                'page': str(next_page),  # formdata values have to be strings
                'query_vars': self.query_vars,
                'query_args': self.query_args,
            },
            callback=self.parse_page,
            meta={'page': next_page},
        )
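
# --- quick sanity check of the endpoint without Scrapy ---
# a sketch, assuming the `requests` library is installed - it POSTs the same
# payload as the spider and prints the keys of the JSON answer
#
# import requests
#
# r = requests.post(
#     InfiniteScrollingSpider.scrolling_url,
#     data={
#         'action': 'csco_ajax_load_more',
#         'page': '1',
#         'query_vars': InfiniteScrollingSpider.query_vars,
#         'query_args': InfiniteScrollingSpider.query_args,
#     },
# )
# print(r.json().keys())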

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',

    # save results to a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})

c.crawl(InfiniteScrollingSpider)
c.start()
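
# --- note ---
# Scrapy 2.1 deprecated FEED_FORMAT/FEED_URI in favour of the FEEDS setting.
# A minimal sketch of the same runner for newer versions (assuming Scrapy 2.1+):
#
# from scrapy.crawler import CrawlerProcess
#
# c = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/5.0',
#     'FEEDS': {'output.csv': {'format': 'csv'}},  # replaces FEED_FORMAT/FEED_URI
# })
#
# c.crawl(InfiniteScrollingSpider)
# c.start()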