Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
; Script entry point: crawl the rlslog.net e-book listing pages, open every entry
; matched by the first XPath, and collect the links containing "userscloud.com".
; One result file per listing page is written under e:\tmp\.
rlslog_webCrawler("http://www.rlslog.net/category/ebooks/ebook/page/", "html/body/div[2]/div[2]/div[*]/h3/a", "html/body/div[2]/div[2]/div[1]", "userscloud.com", "e:\tmp\")
; #FUNCTION# ====================================================================================================================
; Name ..........: rlslog_webCrawler
; Description ...: Walks a range of numbered listing pages in Firefox (FF.au3 / MozRepl), opens every entry found on each
;                  listing page and collects all links on the entry page whose URL contains $search_term.
;                  The page range is asked from the user with two InputBox dialogs.
; Parameters ....: $url                       - Listing URL prefix; the page number is appended to it.
;                  $header_xpath              - XPath evaluated on each listing page (_FFXPath, result type 7). The result
;                                               is used as an array of entry URLs, element 0 being skipped.
;                  $header_xpath_nextPageText - XPath evaluated on each entry page (_FFXPath, attribute "value", result
;                                               type 2); its text is written as a section heading before the links.
;                  $search_term               - Substring a link URL must contain to be kept (StringInStr, default
;                                               case-insensitive comparison).
;                  $save_urls_dir             - Output directory. A trailing backslash is appended when missing.
; Return values .: Always 0 (the original function had no explicit return value).
;                  @error = 1 : one of the InputBox dialogs was cancelled or failed.
;                  @error = 2 : _FFConnect() failed.
; Output ........: Per listing page: "<dir><timestamp>_str_<page>.txt" with lines "name<TAB>link<TAB>entry-url".
;                  Load time-outs are appended to "<dir>000errors.txt".
;                  After the run the unique links are copied to the clipboard and shown with _ArrayDisplay.
; Remarks .......: Requires FF.au3 plus the standard Array.au3, Date.au3 and File.au3 UDFs to be included by the script.
; ===============================================================================================================================
Func rlslog_webCrawler($url, $header_xpath, $header_xpath_nextPageText, $search_term, $save_urls_dir)
    Local $pageLoadingTime = "", $headers, $headerCount, $start, $finish, $countingPages, $countingHeaders
    Local $aLinks, $aDownloadURLs[1], $hFileOpen, $sOutFile, $tmp_array[1], $header_name_array, $header_name, $header_url, $section_text, $errors_array[1], $errors_flag = 0

    ; InputBox returns strings and sets @error when the dialog is cancelled: stop instead of crawling a bogus range.
    $start = InputBox("Start page", "Enter start page number", "1")
    If @error Then Return SetError(1, 0, 0)
    $finish = InputBox("End page", "Enter end page number", "2654")
    If @error Then Return SetError(1, 0, 0)

    ; Make the loop bounds and the remaining-pages counter explicitly numeric.
    $start = Int($start)
    $finish = Int($finish)
    $countingPages = $finish - $start + 1

    ; The file names below are built by plain concatenation, so the directory must end with a backslash.
    If $save_urls_dir <> "" And StringRight($save_urls_dir, 1) <> "\" Then $save_urls_dir &= "\"

    If Not _FFConnect() Then Return SetError(2, 0, 0)

    For $k = $start To $finish
        $countingPages -= 1

        ; --- open listing page $k ---
        _FFOpenURL($url & $k, False)
        $pageLoadingTime = _FFLoadWait(15000, 30000)
        If $pageLoadingTime = 0 Then
            $errors_flag = 1
            MsgBox(0, "Error:", "Page load time limit broken" & @CRLF & $url & $k, 3)
            ; $ARRAYFILL_FORCE_SINGLEITEM keeps the record as one element even if it contains the "|" delimiter.
            _ArrayAdd($errors_array, __rlslog_timeStamp() & "_str_" & $k & "_pageLoadTimeError" & @TAB & $url & $k, Default, Default, Default, $ARRAYFILL_FORCE_SINGLEITEM)
        EndIf

        ; NOTE(review): result type 7 is presumably an ordered node snapshot with the item count in element 0 - confirm in FF.au3.
        $headers = _FFXPath($header_xpath, "", 7)
        If IsArray($headers) Then
            $headerCount = UBound($headers) - 1
            $countingHeaders = $headerCount

            For $i = 1 To $headerCount
                $header_url = $headers[$i]

                ; The entry name is the 4th "/"-separated part of the URL (e.g. http://host/<name>/).
                ; StringSplit stores the part count in element 0; check it so a short URL cannot
                ; trigger a fatal "subscript out of range" error.
                $header_name = ""
                $header_name_array = StringSplit($header_url, "/")
                If IsArray($header_name_array) And $header_name_array[0] >= 4 Then
                    $header_name = $header_name_array[4]
                EndIf

                $countingHeaders -= 1
                ToolTip("Current page " & $k & " / " & $finish & @CRLF & "Remaining: " & $countingPages & " pages" & @CRLF & @CRLF & _
                        "Loading headers " & $i & " / " & $headerCount & @CRLF & "Remaining: " & $countingHeaders & " pages", 0, 0)

                ; --- open the entry page ---
                _FFOpenURL($header_url, False)
                $pageLoadingTime = _FFLoadWait(15000, 30000)
                If $pageLoadingTime = 0 Then
                    $errors_flag = 1
                    MsgBox(0, "Error:", "Header page load time limit broken" & @CRLF & $header_url, 3)
                    _ArrayAdd($errors_array, __rlslog_timeStamp() & "_str_" & $k & "_headerPageLoadTimeError" & @TAB & $header_url, Default, Default, Default, $ARRAYFILL_FORCE_SINGLEITEM)
                EndIf

                ; Section heading for this entry, surrounded by blank lines in the output file.
                $section_text = @CRLF & _FFXPath($header_xpath_nextPageText, "value", 2) & @CRLF & @CRLF
                _ArrayAdd($tmp_array, $section_text, Default, Default, Default, $ARRAYFILL_FORCE_SINGLEITEM)

                $aLinks = _FFLinksGetAll()
                If IsArray($aLinks) Then
                    For $j = 0 To UBound($aLinks) - 1
                        If StringInStr($aLinks[$j][0], $search_term) Then
                            ; _ArraySearch sets @error when the link is not known yet, so each link is stored only once.
                            _ArraySearch($aDownloadURLs, $aLinks[$j][0])
                            If @error Then
                                _ArrayAdd($tmp_array, $header_name & @TAB & $aLinks[$j][0] & @TAB & $header_url, Default, Default, Default, $ARRAYFILL_FORCE_SINGLEITEM)
                                _ArrayAdd($aDownloadURLs, $aLinks[$j][0], Default, Default, Default, $ARRAYFILL_FORCE_SINGLEITEM)
                            EndIf
                        EndIf
                    Next
                EndIf

                ToolTip("")
            Next
        EndIf

        ; --- write the results of listing page $k ---
        ; Mode 10 = overwrite (2) + create directory path (8).
        $sOutFile = $save_urls_dir & __rlslog_timeStamp() & "_str_" & $k & ".txt"
        $hFileOpen = FileOpen($sOutFile, 10)
        If $hFileOpen = -1 Then
            MsgBox(0, "Error:", "Cannot open output file" & @CRLF & $sOutFile, 3)
        Else
            ; Element 0 is the unused placeholder from the declaration, so writing starts at index 1.
            _FileWriteFromArray($hFileOpen, $tmp_array, 1)
            FileClose($hFileOpen)
        EndIf

        If $errors_flag = 1 Then
            ; Mode 9 = append (1) + create directory path (8).
            $hFileOpen = FileOpen($save_urls_dir & "000errors.txt", 9)
            If $hFileOpen = -1 Then
                MsgBox(0, "Error:", "Cannot open error log" & @CRLF & $save_urls_dir & "000errors.txt", 3)
            Else
                _FileWriteFromArray($hFileOpen, $errors_array, 1)
                FileClose($hFileOpen)
            EndIf
            $errors_flag = 0
            ReDim $errors_array[1] ; drop the logged records, keep the empty placeholder element
        EndIf

        ReDim $tmp_array[1] ; start the next page with an empty result buffer
    Next

    ; Hand the de-duplicated link list to the user and return the browser to the last listing page.
    _ArrayToClip($aDownloadURLs, @CRLF, 1)
    _ArrayDisplay($aDownloadURLs, "$aDonwloadURLs")
    _FFOpenURL($url & $finish, False)
EndFunc   ;==>rlslog_webCrawler

; Internal helper: returns the current local time as "YYYYMMDD_<short weekday>_HH-MM-SS-mmm",
; used as prefix for result file names and error records.
Func __rlslog_timeStamp()
    Return @YEAR & @MON & @MDAY & "_" & _DateDayOfWeek(@WDAY, $DMW_SHORTNAME) & "_" & @HOUR & "-" & @MIN & "-" & @SEC & "-" & @MSEC
EndFunc   ;==>__rlslog_timeStamp
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement