Advertisement
rhat398

code

Jul 29th, 2022
281
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 27.40 KB | None | 0 0
  1. import scrapy
  2. import requests
  3. import json
  4.  
  5. from olx_egypt.items import OlxEgyptItem
  6.  
  7.  
  8. class OlxScraper(scrapy.Spider):
    # Identifier Scrapy uses to refer to this spider.
    name = "olx-eg-scraper"

    # Per-spider setting overrides: items are exported as CSV to olx.csv and
    # the log is written to olx_eg.log.  The item pipeline is left disabled.
    # NOTE(review): FEED_FORMAT / FEED_URI are presumably superseded by the
    # FEEDS setting in newer Scrapy releases - confirm against the version used.
    custom_settings = {
        "FEED_FORMAT": "csv",
        "FEED_URI": "olx.csv",
        "LOG_FILE": "olx_eg.log",
        # "ITEM_PIPELINES": {"olx_egypt.pipelines.OlxEgPipeline": 300},
    }

    # Multi-search endpoint every listing query is sent to.  The filter_path
    # query parameter presumably trims the reply down to the listed fields.
    listing_endpoint = "https://search.olx.com.eg/_msearch?filter_path=took%2C*.took%2C*.suggest.*.options.text%2C*.suggest.*.options._source.*%2C*.hits.total.*%2C*.hits.hits._source.*%2C*.hits.hits.highlight.*%2C*.error%2C*.aggregations.*.buckets.key%2C*.aggregations.*.buckets.doc_count%2C*.aggregations.*.buckets.complex_value.hits.hits._source%2C*.aggregations.*.filtered_agg.facet.buckets.key%2C*.aggregations.*.filtered_agg.facet.buckets.doc_count%2C*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source"

    # Headers sent with every search request and every ad-page request; they
    # appear to mirror what a desktop browser sends to the search endpoint.
    # NOTE(review): the Basic "authorization" credential is hard-coded here -
    # consider loading it from settings or the environment instead.
    headers = {
        "authority": "search.olx.com.eg",
        "accept": "*/*",
        "accept-language": "en,ru;q=0.9",
        "authorization": "Basic b2x4LWVnLXByb2R1Y3Rpb24tc2VhcmNoOn1nNDM2Q0R5QDJmWXs2alpHVGhGX0dEZjxJVSZKbnhL",
        "content-type": "application/x-ndjson",
        "origin": "https://www.olx.com.eg",
        "referer": "https://www.olx.com.eg/",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-site",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.143 YaBrowser/22.5.0.1879 (beta) Yowser/2.5 Safari/537.36",
    }
  36.  
  37. cities = [
  38. "1-5",
  39. # "1-68",
  40. # "1-6",
  41. # "1-71",
  42. # "1-65",
  43. # "1-57",
  44. # "1-60",
  45. # "1-66",
  46. # "1-59",
  47. # "1-70",
  48. # "1-67",
  49. # "1-50",
  50. # "1-73",
  51. # "1-69",
  52. # "1-62",
  53. # "1-58",
  54. # "1-55",
  55. # "1-61",
  56. # "1-64",
  57. # "1-72",
  58. # "1-63",
  59. # "1-77",
  60. # "1-56",
  61. # "1-76",
  62. # "1-54",
  63. # "1-74",
  64. ]
  65.  
  66. city_ids = [
  67. # "2-288",
  68. # "2-381",
  69. # "2-408",
  70. # "2-332",
  71. # "2-287",
  72. # "2-320",
  73. # "2-322",
  74. # "2-263",
  75. # "2-382",
  76. # "2-319",
  77. # "2-380",
  78. # "2-269",
  79. # "2-398",
  80. # "2-359",
  81. # "2-424",
  82. # "2-385",
  83. # "2-286",
  84. # "2-321",
  85. # "2-140",
  86. # "2-295",
  87. # "2-134",
  88. # "2-139",
  89. # "2-133",
  90. # "2-131",
  91. # "2-333",
  92. # "2-393",
  93. # "2-391",
  94. # "2-132",
  95. # "2-383",
  96. # "2-144",
  97. # "2-401",
  98. # "2-289",
  99. # "2-389",
  100. # "2-386",
  101. # "2-292",
  102. # "2-136",
  103. # "2-143",
  104. # "2-150",
  105. # "2-397",
  106. "2-142",
  107. # "2-217",
  108. # "2-205",
  109. # "2-212",
  110. # "2-213",
  111. # "2-417",
  112. # "2-230",
  113. # "2-194",
  114. # "2-210",
  115. # "2-211",
  116. # "2-413",
  117. # "2-384",
  118. # "2-222",
  119. # "2-197",
  120. # "2-198",
  121. # "2-208",
  122. # "2-195",
  123. # "2-414",
  124. # "2-221",
  125. # "2-233",
  126. # "2-207",
  127. # "2-204",
  128. # "2-199",
  129. # "2-219",
  130. # "2-201",
  131. # "2-224",
  132. # "2-229",
  133. # "2-200",
  134. # "2-202",
  135. # "2-227",
  136. # "2-196",
  137. # "2-411",
  138. # "2-206",
  139. # "2-422",
  140. # "2-203",
  141. # "2-228",
  142. # "2-317",
  143. # "2-309",
  144. # "2-301",
  145. # "2-303",
  146. # "2-113",
  147. # "2-318",
  148. # "2-302",
  149. # "2-312",
  150. # "2-119",
  151. # "2-304",
  152. # "2-299",
  153. # "2-100",
  154. # "2-390",
  155. # "2-308",
  156. # "2-412",
  157. # "2-307",
  158. # "2-109",
  159. # "2-297",
  160. # "2-416",
  161. # "2-124",
  162. # "2-106",
  163. # "2-99",
  164. # "2-300",
  165. # "2-306",
  166. # "2-305",
  167. # "2-115",
  168. # "2-116",
  169. # "2-121",
  170. # "2-311",
  171. # "2-112",
  172. # "2-362",
  173. # "2-104",
  174. # "2-125",
  175. # "2-105",
  176. # "2-107",
  177. # "2-111",
  178. # "2-96",
  179. # "2-114",
  180. # "2-117",
  181. # "2-95",
  182. # "2-387",
  183. # "2-172",
  184. # "2-174",
  185. # "2-171",
  186. # "2-173",
  187. # "2-178",
  188. # "2-175",
  189. # "2-177",
  190. # "2-79",
  191. # "2-165",
  192. # "2-77",
  193. # "2-94",
  194. # "2-166",
  195. # "2-78",
  196. # "2-365",
  197. # "2-159",
  198. # "2-377",
  199. # "2-373",
  200. # "2-420",
  201. # "2-376",
  202. # "2-370",
  203. # "2-368",
  204. # "2-371",
  205. # "2-363",
  206. # "2-372",
  207. # "2-375",
  208. # "2-367",
  209. # "2-366",
  210. # "2-369",
  211. # "2-157",
  212. # "2-158",
  213. # "2-364",
  214. # "2-374",
  215. # "2-419",
  216. # "2-40",
  217. # "2-34",
  218. # "2-35",
  219. # "2-37",
  220. # "2-42",
  221. # "2-32",
  222. # "2-27",
  223. # "2-26",
  224. # "2-39",
  225. # "2-36",
  226. # "2-38",
  227. # "2-28",
  228. # "2-33",
  229. # "2-30",
  230. # "2-41",
  231. # "2-29",
  232. # "2-31",
  233. # "2-80",
  234. # "2-87",
  235. # "2-86",
  236. # "2-84",
  237. # "2-83",
  238. # "2-82",
  239. # "2-85",
  240. # "2-81",
  241. # "2-23",
  242. # "2-19",
  243. # "2-16",
  244. # "2-15",
  245. # "2-164",
  246. # "2-415",
  247. # "2-22",
  248. # "2-17",
  249. # "2-24",
  250. # "2-25",
  251. # "2-20",
  252. # "2-21",
  253. # "2-179",
  254. # "2-409",
  255. # "2-183",
  256. # "2-399",
  257. # "2-182",
  258. # "2-180",
  259. # "2-181",
  260. # "2-184",
  261. # "2-162",
  262. # "2-89",
  263. # "2-92",
  264. # "2-88",
  265. # "2-90",
  266. # "2-91",
  267. # "2-93",
  268. # "2-326",
  269. # "2-328",
  270. # "2-327",
  271. # "2-331",
  272. # "2-329",
  273. # "2-330",
  274. # "2-325",
  275. # "2-245",
  276. # "2-242",
  277. # "2-243",
  278. # "2-247",
  279. # "2-246",
  280. # "2-249",
  281. # "2-251",
  282. # "2-250",
  283. # "2-244",
  284. # "2-248",
  285. # "2-252",
  286. # "2-396",
  287. # "2-189",
  288. # "2-187",
  289. # "2-191",
  290. # "2-193",
  291. # "2-185",
  292. # "2-192",
  293. # "2-61",
  294. # "2-57",
  295. # "2-62",
  296. # "2-56",
  297. # "2-54",
  298. # "2-55",
  299. # "2-53",
  300. # "2-60",
  301. # "2-59",
  302. # "2-58",
  303. # "2-6",
  304. # "2-10",
  305. # "2-14",
  306. # "2-127",
  307. # "2-5",
  308. # "2-9",
  309. # "2-378",
  310. # "2-11",
  311. # "2-2",
  312. # "2-12",
  313. # "2-3",
  314. # "2-1",
  315. # "2-8",
  316. # "2-7",
  317. # "2-13",
  318. # "2-4",
  319. # "2-126",
  320. # "2-404",
  321. # "2-339",
  322. # "2-341",
  323. # "2-344",
  324. # "2-337",
  325. # "2-343",
  326. # "2-342",
  327. # "2-346",
  328. # "2-340",
  329. # "2-345",
  330. # "2-338",
  331. # "2-50",
  332. # "2-51",
  333. # "2-43",
  334. # "2-49",
  335. # "2-45",
  336. # "2-52",
  337. # "2-48",
  338. # "2-47",
  339. # "2-46",
  340. # "2-44",
  341. # "2-74",
  342. # "2-76",
  343. # "2-160",
  344. # "2-161",
  345. # "2-73",
  346. # "2-75",
  347. # "2-72",
  348. # "2-235",
  349. # "2-241",
  350. # "2-234",
  351. # "2-239",
  352. # "2-240",
  353. # "2-238",
  354. # "2-236",
  355. # "2-237",
  356. # "2-163",
  357. # "2-392",
  358. # "2-63",
  359. # "2-67",
  360. # "2-64",
  361. # "2-66",
  362. # "2-69",
  363. # "2-71",
  364. # "2-65",
  365. # "2-68",
  366. # "2-70",
  367. # "2-274",
  368. # "2-277",
  369. # "2-418",
  370. # "2-316",
  371. # "2-282",
  372. # "2-281",
  373. # "2-276",
  374. # "2-283",
  375. # "2-284",
  376. # "2-278",
  377. # "2-279",
  378. # "2-280",
  379. # "2-285",
  380. # "2-350",
  381. # "2-356",
  382. # "2-347",
  383. # "2-351",
  384. # "2-357",
  385. # "2-349",
  386. # "2-353",
  387. # "2-348",
  388. # "2-354",
  389. # "2-355",
  390. # "2-265",
  391. # "2-268",
  392. # "2-271",
  393. # "2-272",
  394. # "2-270",
  395. # "2-315",
  396. # "2-266",
  397. # "2-167",
  398. # "2-334",
  399. # "2-170",
  400. # "2-336",
  401. # "2-169",
  402. # "2-335",
  403. # "2-168",
  404. # "2-258",
  405. # "2-253",
  406. # "2-254",
  407. # "2-256",
  408. ]
  409.  
  410. post_data = [
  411. '{{"index":"olx-eg-production-ads-ar"}}\n{{"from":0,"size":0,"track_total_hits":false,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"aggs":{{"category.lvl1.externalID":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.lvl0.externalID":"138"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"category.lvl1.externalID","size":20}}}}}}}}}}}},"location.lvl2":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl1.externalID":"{city}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl2.externalID","size":40}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl2"]}}}}}}}}}}}}}}}}}},"location.lvl3":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl2.externalID":"{_id}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl3.externalID","size":40}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl3"]}}}}}}}}}}}}}}}}}},"product":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["product"]}}}}}}}}}}}}}}}}}},"totalProductCount":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["totalProductCount"]}}}}}}}}}}}}}}}}}}}}}}\n{{"index":"olx-eg-production-ads-ar"}}\
n{{"from":0,"size":45,"track_total_hits":200000,"query":{{"function_score":{{"random_score":{{"seed":581}},"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}}}}}},"sort":["_score"]}}\n{{"index":"olx-eg-production-ads-ar"}}\n{{"from":{page},"size":{size},"track_total_hits":200000,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"sort":[{{"timestamp":{{"order":"desc"}}}},{{"id":{{"order":"desc"}}}}]}}\n',
  412. '{{"index":"olx-eg-production-ads-ar"}}\n{{"from":0,"size":0,"track_total_hits":false,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"aggs":{{"category.lvl1.externalID":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.lvl0.externalID":"138"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"category.lvl1.externalID","size":20}}}}}}}}}}}},"location.lvl2":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl1.externalID":"{city}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl2.externalID","size":40}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl2"]}}}}}}}}}}}}}}}}}},"location.lvl3":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl2.externalID":"{_id}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl3.externalID","size":40}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl3"]}}}}}}}}}}}}}}}}}},"product":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["product"]}}}}}}}}}}}}}}}}}},"totalProductCount":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["totalProductCount"]}}}}}}}}}}}}}}}}}}}}}}\n{{"index":"olx-eg-production-ads-ar"}}\
n{{"from":0,"size":45,"track_total_hits":200000,"query":{{"function_score":{{"random_score":{{"seed":234}},"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}}}}}},"sort":["_score"]}}\n{{"index":"olx-eg-production-ads-ar"}}\n{{"from":{page},"size":{size},"track_total_hits":200000,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"sort":[{{"_score":{{"order":"desc"}}}}]}}\n',
  413. '{{"index":"olx-eg-production-ads-ar"}}\n{{"from":0,"size":0,"track_total_hits":false,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"aggs":{{"category.lvl1.externalID":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.lvl0.externalID":"138"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"category.lvl1.externalID","size":20}}}}}}}}}}}},"location.lvl2":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl1.externalID":"{city}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl2.externalID","size":40}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl2"]}}}}}}}}}}}}}}}}}},"location.lvl3":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl2.externalID":"{_id}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl3.externalID","size":40}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl3"]}}}}}}}}}}}}}}}}}},"product":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["product"]}}}}}}}}}}}}}}}}}},"totalProductCount":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["totalProductCount"]}}}}}}}}}}}}}}}}}}}}}}\n{{"index":"olx-eg-production-ads-ar"}}\
n{{"from":0,"size":45,"track_total_hits":200000,"query":{{"function_score":{{"random_score":{{"seed":936}},"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}}}}}},"sort":["_score"]}}\n{{"index":"olx-eg-production-ads-ar"}}\n{{"from":{page},"size":{size},"track_total_hits":200000,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"sort":[{{"extraFields.price":{{"order":"asc"}}}},{{"extraFields.salary_to":{{"order":"asc"}}}},{{"id":{{"order":"desc"}}}}]}}\n',
  414. '{{"index":"olx-eg-production-ads-ar"}}\n{{"from":0,"size":0,"track_total_hits":false,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"aggs":{{"category.lvl1.externalID":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.lvl0.externalID":"138"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"category.lvl1.externalID","size":20}}}}}}}}}}}},"location.lvl2":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl1.externalID":"{city}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl2.externalID","size":40}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl2"]}}}}}}}}}}}}}}}}}},"location.lvl3":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl2.externalID":"{_id}"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl3.externalID","size":40}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl3"]}}}}}}}}}}}}}}}}}},"product":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["product"]}}}}}}}}}}}}}}}}}},"totalProductCount":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["totalProductCount"]}}}}}}}}}}}}}}}}}}}}}}\n{{"index":"olx-eg-production-ads-ar"}}\
n{{"from":0,"size":45,"track_total_hits":200000,"query":{{"function_score":{{"random_score":{{"seed":99}},"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}}}}}},"sort":["_score"]}}\n{{"index":"olx-eg-production-ads-ar"}}\n{{"from":{page},"size":{size},"track_total_hits":200000,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.externalID":"{_id}"}}}}]}}}},"sort":[{{"extraFields.price":{{"order":"desc"}}}},{{"extraFields.salary_to":{{"order":"desc"}}}},{{"id":{{"order":"desc"}}}}]}}\n',
  415. ]
  416.  
  417. def start_requests(self):
  418. page = 0
  419. size = 45
  420. for data in self.post_data:
  421. for city in self.cities:
  422. for _id in self.city_ids:
  423. meta = dict(data=data, main_city=city, sub_city=_id)
  424.  
  425. yield scrapy.Request(
  426. url=self.listing_endpoint,
  427. headers=self.headers,
  428. body=data.format(city=city, _id=_id, page=page, size=size),
  429. callback=self.parse_links,
  430. meta=meta,
  431. )
  432.  
  433. def parse_links(self, response):
  434. size = 45
  435. meta = {
  436. "data": response.meta["data"],
  437. "main_city": response.meta["main_city"],
  438. "sub_city": response.meta["sub_city"],
  439. }
  440. listing_data = {}
  441. total = response.json()["responses"][2]["hits"]["total"]["value"]
  442. if total > 9955:
  443. for i in range(0, 9955):
  444. page = i + 1
  445. yield response.follow(
  446. url=self.listing_endpoint,
  447. headers=self.headers,
  448. body=meta["data"].format(
  449. city=meta["main_city"],
  450. _id=meta["sub_city"],
  451. page=page,
  452. size=size,
  453. ),
  454. callback=self.parse_links,
  455. meta=meta,
  456. )
  457. else:
  458.  
  459. for i in range(0, total):
  460. page = i + 1
  461. yield response.follow(
  462. url=self.listing_endpoint,
  463. headers=self.headers,
  464. body=meta["data"].format(
  465. city=meta["main_city"],
  466. _id=meta["sub_city"],
  467. page=page,
  468. size=size,
  469. ),
  470. callback=self.parse_links,
  471. meta=meta,
  472. )
  473. try:
  474. listing_data["data"] = response.json()["responses"][1]["hits"]["hits"]
  475. except:
  476. pass
  477. try:
  478. listing_data["data"] = response.json()["responses"][2]["hits"]["hits"]
  479. except:
  480. pass
  481. try:
  482. for listing in listing_data["data"]:
  483. listing_id = listing["_source"]["externalID"]
  484. listing_url = "https://www.olx.com.eg/en/ad/" + listing_id
  485.  
  486. yield scrapy.Request(
  487. url=listing_url,
  488. headers=self.headers,
  489. callback=self.parse_details,
  490. meta={
  491. "listing_url": listing_url,
  492. },
  493. )
  494. except:
  495. pass
  496.  
  497. def parse_details(self, response):
  498. item = {}
  499. data = "".join(
  500. response.css("script::text")[6]
  501. .get()
  502. .replace("<script>", "")
  503. .replace("window.state = ", "")
  504. .split(";")[:-7]
  505. )
  506. json_data = json.loads(data)
  507. try:
  508. reference_id = (
  509. response.css("div._171225da::text").get().replace("Ad id ", "")
  510. )
  511. except:
  512. reference_id = ""
  513. sub_detail_list = response.css("div._676a547f ::text").extract()
  514.  
  515. item["URL"] = response.meta.get("listing_url")
  516. try:
  517. item["Breadcrumb"] = (
  518. response.css("li._8c543153 ::text")[4].get()
  519. + "/"
  520. + response.css("li._8c543153 ::text")[3].get()
  521. + "/"
  522. + response.css("li._8c543153 ::text")[2].get()
  523. + "/"
  524. + response.css("li._8c543153 ::text")[1].get()
  525. + "/"
  526. + response.css("li._8c543153 ::text").get()
  527. )
  528. except:
  529. item["Breadcrumb"] = (
  530. +response.css("li._8c543153 ::text")[3].get()
  531. + "/"
  532. + response.css("li._8c543153 ::text")[2].get()
  533. + "/"
  534. + response.css("li._8c543153 ::text")[1].get()
  535. + "/"
  536. + response.css("li._8c543153 ::text").get()
  537. )
  538.  
  539. item["Price"] = response.css("span._56dab877 ::text").get()
  540. item["Title"] = response.css("h1.a38b8112::text").get()
  541. item["Type"] = ""
  542. try:
  543. item["Bedrooms"] = json_data["ad"]["data"]["extraFields"]["rooms"]
  544. except:
  545. item["Bedrooms"] = ""
  546. try:
  547. item["Bathrooms"] = json_data["ad"]["data"]["extraFields"]["bathrooms"]
  548. except:
  549. item["Bathrooms"] = ""
  550. try:
  551. item["Area"] = json_data["ad"]["data"]["extraFields"]["ft"]
  552. except:
  553. item["Area"] = ""
  554. # try:
  555. # item["Area"] = response.css("span.c47715cd::text")[2].get()
  556. # except:
  557. # for sub in sub_detail_list:
  558. # if "Area (m²)" in sub_detail_list:
  559. # item["Area"] = sub_detail_list[
  560. # sub_detail_list.index("Area (m²)") + 1
  561. # ]
  562. # else:
  563. # item["Area"] = ""
  564. item["Location"] = response.css("span._8918c0a8::text").get()
  565. try:
  566. if response.css("div.b44ca0b3 ::text")[18].get() == "Compound":
  567. item["Compound"] = response.css("div.b44ca0b3 ::text")[19].get()
  568. elif response.css("div.b44ca0b3 ::text")[16].get() == "Compound":
  569. item["Compound"] = response.css("div.b44ca0b3 ::text")[17].get()
  570. except:
  571. item["Compound"] = ""
  572. item["seller"] = response.css("span._261203a9._2e82a662::text").getall()[1]
  573. member_since = response.css("span._34a7409b ::text")[1].get()
  574. if member_since == "Cars for Sale":
  575. item["Seller_member_since"] = response.css("span._34a7409b ::text").get()
  576. if "Commercial ID: " in member_since:
  577. item["Seller_member_since"] = response.css("span._34a7409b ::text")[2].get()
  578. else:
  579. item["Seller_member_since"] = member_since
  580. res = requests.get(
  581. f"https://www.olx.com.eg/api/listing/{reference_id}/contactInfo/"
  582. )
  583. item["Seller_phone_number"] = res.json()["mobile"]
  584. # item["Seller_phone_number"] = json_data["sellerProfile"]["data"]["phoneNumber"]
  585. item["Description"] = (
  586. response.css("div._0f86855a ::text").get().replace("\n", "")
  587. )
  588. item["Amenities"] = ",".join(response.css("div._27f9c8ac ::text").extract())
  589. item["Reference"] = reference_id
  590. item["Listed_date"] = response.css("span._8918c0a8 ::text")[1].get()
  591. item["Level"] = ""
  592. item["Payment_option"] = ""
  593. item["Delivery_term"] = ""
  594. item["Furnished"] = ""
  595. item["Delivery_date"] = ""
  596. item["Down_payment"] = ""
  597.  
  598. for sub_detail in sub_detail_list:
  599. if "Type" in sub_detail_list:
  600. item["Type"] = sub_detail_list[sub_detail_list.index("Type") + 1]
  601. if "Level" in sub_detail_list:
  602. item["Level"] = sub_detail_list[sub_detail_list.index("Level") + 1]
  603. if "Payment Option" in sub_detail_list:
  604. item["Payment_option"] = sub_detail_list[
  605. sub_detail_list.index("Payment Option") + 1
  606. ]
  607. if "Delivery Term" in sub_detail_list:
  608. item["Delivery_term"] = sub_detail_list[
  609. sub_detail_list.index("Delivery Term") + 1
  610. ]
  611. if "Furnished" in sub_detail_list:
  612. item["Furnished"] = sub_detail_list[
  613. sub_detail_list.index("Furnished") + 1
  614. ]
  615. if "Delivery Date" in sub_detail_list:
  616. item["Delivery_date"] = sub_detail_list[
  617. sub_detail_list.index("Delivery Date") + 1
  618. ]
  619. if "Down Payment" in sub_detail_list:
  620. item["Down_payment"] = sub_detail_list[
  621. sub_detail_list.index("Down Payment") + 1
  622. ]
  623.  
  624. item["Image_url"] = response.css("picture._219b7e0a ::attr(srcset)")[1].get()
  625. item["Lat"] = json_data["ad"]["data"]["geography"]["lat"]
  626. item["Long"] = json_data["ad"]["data"]["geography"]["lng"]
  627.  
  628. yield item
  629.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement