Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// One HttpClient shared by all requests: creating (and never disposing) a new
// client per call leaves sockets lingering and can exhaust them under load.
private static readonly HttpClient HardwareOnlineHttpClient = new HttpClient();

// Prefix used to turn the relative hrefs found on the page into absolute URLs.
private const string HardwareOnlineBaseUrl = "https://www.hardwareonline.dk";

/// <summary>
/// Downloads either the hardwareonline.dk sales overview or the hol.dk front page
/// and returns the threads whose title contains at least one of the requested keywords.
/// </summary>
/// <param name="model">
/// Request options: <c>Keywords</c> to look for, <c>SalesSection</c> to choose which page
/// is scraped, <c>LimitedToLastTwoDays</c> to keep only sales threads stamped
/// "I dag" / "I går", and <c>MaxThreadsToScan</c> (only validated, see note in the body).
/// </param>
/// <returns>
/// 400 Bad Request when the body is missing, <c>MaxThreadsToScan</c> exceeds 500 or no usable
/// keyword is supplied; otherwise 200 OK with a list of <c>HardwareOnlineScraper</c> items
/// (<c>HardwareOnlineSalesModel</c> items for the sales section). Titles are returned
/// entity-decoded and lower-cased.
/// </returns>
/// <remarks>
/// Failures of the outbound HTTP request are not caught here and propagate to the caller.
/// </remarks>
[CacheOutput(ClientTimeSpan = 120, ServerTimeSpan = 120)]
[HttpPost]
[Route("api/Webscraper/SearchHardwareOnlineForKeywords")]
public async Task<IHttpActionResult> SearchHardwareOnlineForKeywords(HardwareOnlineRequestModel model)
{
    // A missing or unparsable body binds to null; answer with 400 instead of crashing.
    if (model == null)
    {
        return BadRequest("A request body is required");
    }

    // NOTE(review): MaxThreadsToScan is validated here but never used afterwards; the sales
    // scan is capped by a fixed 500 and the front-page scan is not capped at all. Confirm
    // whether the value was meant to drive that cap before wiring it in.
    if (model.MaxThreadsToScan > 500)
    {
        return BadRequest("Max number of threads to scan is limited to 500 to keep up a good performance");
    }

    // Drop null/empty entries: a null keyword would throw during matching and an empty
    // one would match every title.
    List<string> keywords = model.Keywords == null
        ? new List<string>()
        : model.Keywords.Where(k => !string.IsNullOrEmpty(k)).ToList();
    if (keywords.Count == 0)
    {
        return BadRequest("You have to provide at least one keyword/sentence to scan for");
    }

    // NOTE(review): the front page is fetched over plain HTTP while every generated link
    // uses HTTPS; left unchanged, confirm whether the HTTPS address can be used here too.
    string url = model.SalesSection ? "https://www.hardwareonline.dk/koebsalgoversigt.aspx" : "http://www.hol.dk";
    string html = await HardwareOnlineHttpClient.GetStringAsync(url);

    var doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);

    List<HardwareOnlineScraper> foundNodes = model.SalesSection
        ? FindSalesThreads(doc, keywords, model.LimitedToLastTwoDays)
        : FindFrontPageThreads(doc, keywords);
    return Ok(foundNodes);
}

// Scans the sales overview rows (at most 500) and returns the ones whose title matches a keyword.
private static List<HardwareOnlineScraper> FindSalesThreads(HtmlAgilityPack.HtmlDocument doc, List<string> keywords, bool limitedToLastTwoDays)
{
    var found = new List<HardwareOnlineScraper>();

    var container = doc.GetElementbyId("ContentPlaceHolder_ContentPlaceHolder_data");
    if (container == null)
    {
        // The expected container is not in the page: nothing to scan.
        return found;
    }

    // "ks-oversigt-odd" contains "ks-oversigt", so one substring test covers both row classes.
    // Materialised once so the DOM is not re-walked on every iteration.
    var rows = container.Descendants("div")
        .Where(d => d.GetAttributeValue("class", string.Empty).Contains("ks-oversigt"))
        .Take(500)
        .ToList();

    foreach (var row in rows)
    {
        // Cells are read by position: 0 = title, 1 = user, 2 = timestamp, 3 = zip.
        if (row.ChildNodes.Count < 4)
        {
            continue;
        }

        var titleLink = row.ChildNodes[0].FirstChild;
        var userLink = row.ChildNodes[1].FirstChild;
        var timestampNode = row.ChildNodes[2].FirstChild;
        var zipNode = row.ChildNodes[3].FirstChild;
        if (titleLink == null || userLink == null || timestampNode == null || zipNode == null)
        {
            // Row does not have the expected shape; skip it rather than fail the whole request.
            continue;
        }

        string timestamp = timestampNode.InnerText;
        if (limitedToLastTwoDays && !timestamp.Contains("I dag") && !timestamp.Contains("I går"))
        {
            continue;
        }

        string parsed = NormalizeTitle(titleLink.InnerText);
        if (!ContainsAnyKeyword(parsed, keywords))
        {
            continue;
        }

        found.Add(new HardwareOnlineSalesModel
        {
            Title = parsed,
            ThreadUrl = HardwareOnlineBaseUrl + titleLink.GetAttributeValue("href", string.Empty),
            Timestamp = timestamp,
            Zip = zipNode.InnerText,
            User = userLink.InnerText,
            UserUrl = HardwareOnlineBaseUrl + userLink.GetAttributeValue("href", string.Empty)
        });
    }

    return found;
}

// Scans the "pure-g" rows of the front page and returns the ones whose title matches a keyword.
private static List<HardwareOnlineScraper> FindFrontPageThreads(HtmlAgilityPack.HtmlDocument doc, List<string> keywords)
{
    var found = new List<HardwareOnlineScraper>();

    // The first matching div is skipped, exactly as the original index loop (starting at 1) did;
    // presumably it is a header or wrapper row - confirm against the live page.
    var rows = doc.DocumentNode.Descendants("div")
        .Where(d => d.GetAttributeValue("class", string.Empty).Contains("pure-g"))
        .Skip(1)
        .ToList();

    foreach (var row in rows)
    {
        // The thread link is read as the first child of the row's second child node.
        if (row.ChildNodes.Count < 2)
        {
            continue;
        }

        var link = row.ChildNodes[1].FirstChild;
        if (link == null)
        {
            continue;
        }

        string parsed = NormalizeTitle(link.InnerText);
        if (ContainsAnyKeyword(parsed, keywords))
        {
            found.Add(new HardwareOnlineScraper
            {
                Title = parsed,
                ThreadUrl = HardwareOnlineBaseUrl + link.GetAttributeValue("href", string.Empty)
            });
        }
    }

    return found;
}

// Decodes HTML entities in a scraped title and lower-cases it for matching and output.
private static string NormalizeTitle(string rawText)
{
    // InnerText is not entity-decoded, so Danish letters can arrive as entities such as
    // "&aelig;". The previous Replace("æ", "æ")-style calls replaced each letter with
    // itself and therefore decoded nothing.
    string decoded = System.Net.WebUtility.HtmlDecode(rawText ?? string.Empty);

    // Invariant casing keeps the result independent of the server's culture.
    return decoded.ToLowerInvariant();
}

// True when the text contains at least one keyword, ignoring case.
private static bool ContainsAnyKeyword(string text, List<string> keywords)
{
    return keywords.Any(k => text.IndexOf(k, System.StringComparison.OrdinalIgnoreCase) >= 0);
}
Add Comment
Please, Sign In to add comment