Advertisement
datatheoz

allareacodes.com Scraper

Sep 24th, 2018
156
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
VB.NET 7.03 KB | None | 0 0
  1. Imports System.IO
  2. Imports System.Threading
  3. Imports Strings
  4. Imports HttpSync
  5.  
  ''' <summary>
  ''' Console scraper for allareacodes.com. Reads area codes from areacodes.txt and
  ''' provider "trigger" strings from triggers.txt, downloads the page for each area
  ''' code on a pool of worker threads, and appends every number whose provider text
  ''' contains a trigger to a file named after that trigger.
  ''' </summary>
  Module Module1
      ' Area codes to scrape, one entry per non-blank line of areacodes.txt.
      Private codes As New List(Of String)
      ' Value entered by the user; it is 0 based, so threadCount + 1 workers are started.
      Private threadCount As Integer
      ' Number of workers that have finished; written under threadLocker.
      Private closedThreads As Integer
      ' Guards codeIndex, closedThreads and every file append.
      Private ReadOnly threadLocker As New Object
      ' Loop flag checked by the workers; set to True before they start.
      Private keepRunning As Boolean
      ' Count of numbers written to trigger files; updated with Interlocked.
      Private totalScraped As Integer
      ' Index of the next unclaimed entry in codes; advanced under threadLocker.
      Private codeIndex As Integer
      ' Provider substrings to look for, one entry per non-blank line of triggers.txt.
      Private triggers As New List(Of String)

      ''' <summary>
      ''' Entry point: loads both input files, asks for the thread count, starts the
      ''' workers and prints a progress line every ten seconds until all have closed.
      ''' </summary>
      Sub Main()
          'intro
          Console.WriteLine("Welcome to allareacodes.com scraper.")
          Console.WriteLine("Make sure areacodes.txt exists with desired area codes, one per line.")
          Console.WriteLine("Example:")
          Console.WriteLine("123")
          Console.WriteLine("456")
          Console.WriteLine("789")
          Console.WriteLine()

          'area codes
          Console.WriteLine("Press any key to load the file.")
          Console.WriteLine()

          Console.ReadKey()

          codes.AddRange(LoadEntries("areacodes.txt"))
          Console.WriteLine($"Loaded {codes.Count} area code/s.")
          Console.WriteLine()

          'triggers
          Console.WriteLine("Make sure triggers.txt exists with desired trigger text, one per line.")
          Console.WriteLine("Example:")
          Console.WriteLine("Verizon")
          Console.WriteLine("Sprint")
          Console.WriteLine("etc..")
          Console.WriteLine()

          Console.WriteLine("Press any key to load the file.")
          Console.WriteLine()

          Console.ReadKey()

          triggers.AddRange(LoadEntries("triggers.txt"))
          Console.WriteLine($"Loaded {triggers.Count} trigger/s.")
          Console.WriteLine()

          'Threads
          Console.WriteLine("Enter the amount of threads to use.")
          Console.WriteLine("Thread count is 0 based. (Entering 0 will use one thread.)")
          Console.WriteLine()

          ' A negative count would start no workers at all, so it is rejected
          ' together with non-numeric input.
          While Not Integer.TryParse(Console.ReadLine(), threadCount) OrElse threadCount < 0
              Console.WriteLine("That is not a valid number.")
              Console.WriteLine("Enter the amount of threads to use.")
              Console.WriteLine()
          End While

          Console.WriteLine($"Threads set to {threadCount}.")
          Console.WriteLine()

          'start
          Console.WriteLine("Press any key to start.")
          Console.ReadKey()
          Console.WriteLine("Key press accepted. Started..")
          Console.WriteLine()

          keepRunning = True
          For i As Integer = 0 To threadCount
              Dim t As New Thread(AddressOf StartThread) With {.IsBackground = True}
              t.Start()
          Next

          ' closedThreads is written by the workers, so read it with a volatile read
          ' to make sure this loop always observes the latest value.
          While Volatile.Read(closedThreads) <= threadCount
              Thread.Sleep(10000)
              Console.WriteLine($"{Now.ToLongTimeString} - Scraped {totalScraped.ToThousands}")
              Console.WriteLine()
          End While

          Console.WriteLine("All threads have been closed.")
          Console.WriteLine($"Scraped: {totalScraped.ToThousands}")
          Console.WriteLine()
          Console.WriteLine("Press any key to exit.")
          Console.ReadKey()
      End Sub

      ''' <summary>
      ''' Blocks until <paramref name="fileName"/> exists and contains at least one
      ''' non-blank line, prompting the user on the console after each failed attempt.
      ''' </summary>
      ''' <param name="fileName">Path of the text file to read.</param>
      ''' <returns>The trimmed, non-blank lines of the file, in file order.</returns>
      Private Function LoadEntries(fileName As String) As List(Of String)
          Dim entries As New List(Of String)

          Do
              If File.Exists(fileName) Then
                  ' Blank lines are skipped: an empty trigger would match every provider
                  ' (String.Contains("") is always True) and an empty area code would
                  ' request the site root.
                  For Each line As String In File.ReadLines(fileName)
                      If Not String.IsNullOrWhiteSpace(line) Then
                          entries.Add(line.Trim())
                      End If
                  Next

                  If entries.Count > 0 Then
                      Exit Do
                  End If

                  Console.WriteLine($"File {fileName} exists but it is empty, add something to it.")
                  Console.WriteLine("Press any key to load it.")
              Else
                  Console.WriteLine($"Could not find file {fileName}")
                  Console.WriteLine("Create it and press any key to load it.")
              End If

              Console.WriteLine()
              Console.ReadKey()
          Loop

          Return entries
      End Function

      ''' <summary>
      ''' Worker thread body: repeatedly claims the next unprocessed area code and
      ''' scrapes it until the list is exhausted or keepRunning becomes False, then
      ''' records that this worker has closed.
      ''' </summary>
      Private Sub StartThread()
          Dim h As New HttpProperties
          With h
              .Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
              .AutoDecompress = True
              .AcceptLanguage = "en-US,en;q=0.9"
              .UpgradeInsecureRequestsToDefault()
              .UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
              .CookiesEnabled = False
          End With

          Try
              While keepRunning
                  Dim strCode As String

                  ' Claim the next code under the lock so no two workers take the same one.
                  SyncLock threadLocker
                      If codeIndex >= codes.Count Then
                          Exit While
                      End If

                      strCode = codes(codeIndex)
                      codeIndex += 1
                  End SyncLock

                  Try
                      ScrapeCode(h, strCode)
                  Catch ex As Exception
                      ' An unhandled exception on a worker thread would terminate the
                      ' whole process, so report it and queue the code for a later retry.
                      ' Numbers already written for this page before the failure stay in
                      ' their trigger files, so a redo may write them again.
                      Console.WriteLine($"Failed to scrape {strCode}: {ex.Message}")
                      AppendLine("Redo.txt", strCode)
                  End Try
              End While
          Finally
              ' Always count the worker as closed, even if it ended abnormally, so the
              ' bookkeeping Main waits on stays accurate.
              SyncLock threadLocker
                  closedThreads += 1
                  Console.WriteLine($"Closed thread #{closedThreads}")

                  If closedThreads > threadCount Then
                      Console.WriteLine("Program stopped.")
                      Console.WriteLine()
                  End If
              End SyncLock
          End Try
      End Sub

      ''' <summary>
      ''' Downloads the page for one area code and appends each listed number whose
      ''' provider text contains a trigger to that trigger's output file. When the
      ''' request fails or returns an empty body the code is appended to Redo.txt.
      ''' </summary>
      ''' <param name="h">Request settings owned by the calling worker; its Url is overwritten.</param>
      ''' <param name="strCode">Area code appended to the site URL.</param>
      Private Sub ScrapeCode(h As HttpProperties, strCode As String)
          h.Url = New Uri($"https://www.allareacodes.com/{strCode}")

          Using r As HttpResponse = HttpRequest.Get(h)
              If Not r.Success OrElse r.IsEmptyBody Then
                  AppendLine("Redo.txt", strCode)
                  Return
              End If

              Dim b As String = r.ReadBody

              For Each s As String In b.ParseToArray("<div class=""list-group-item"">", "</span>")
                  Dim strNumber As String = s.Parse("<div class=""col-xs-12 prefix-col1"">", "</div>").Trim
                  Dim strProvider As String = s.Parse("<div class=""col-xs-12 prefix-col4"">", "</div>").Trim

                  If strNumber.IsEmpty OrElse strProvider.IsEmpty Then
                      Continue For
                  End If

                  ' Only the first matching trigger receives the number.
                  For Each strTrigger As String In triggers
                      If strProvider.Contains(strTrigger) Then
                          AppendLine(GetTriggerFileName(strTrigger), strNumber)
                          Interlocked.Increment(totalScraped)
                          Exit For
                      End If
                  Next
              Next
          End Using
      End Sub

      ''' <summary>
      ''' Builds the output file name for a trigger, replacing characters that are not
      ''' allowed in file names with an underscore so the file can always be created.
      ''' </summary>
      ''' <param name="trigger">Trigger text as loaded from triggers.txt.</param>
      ''' <returns>The sanitised trigger text followed by ".txt".</returns>
      Private Function GetTriggerFileName(trigger As String) As String
          Dim safeName As String = trigger

          For Each invalidChar As Char In Path.GetInvalidFileNameChars()
              safeName = safeName.Replace(invalidChar, "_"c)
          Next

          Return $"{safeName}.txt"
      End Function

      ''' <summary>
      ''' Appends one line to a text file while holding threadLocker, so that workers
      ''' never write to the output files concurrently.
      ''' </summary>
      ''' <param name="fileName">File to append to; created if it does not exist.</param>
      ''' <param name="line">Text to write, followed by a line terminator.</param>
      Private Sub AppendLine(fileName As String, line As String)
          SyncLock threadLocker
              Using sw As New StreamWriter(fileName, True)
                  sw.WriteLine(line)
              End Using
          End SyncLock
      End Sub
  End Module
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement