Guest User

Untitled

a guest
Jan 15th, 2021
18
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. package main
  2.  
  3. import (
  4. "bufio"
  5. "bytes"
  6. "crypto/tls"
  7. "flag"
  8. "fmt"
  9. "github.com/360EntSecGroup-Skylar/excelize"
  10. "github.com/PuerkitoBio/goquery"
  11. "golang.org/x/net/html/charset"
  12. "golang.org/x/text/encoding/charmap"
  13. "golang.org/x/text/transform"
  14. "io/ioutil"
  15. "mvdan.cc/xurls"
  16. "net/http"
  17. "net/url"
  18. "os"
  19. "reflect"
  20. "regexp"
  21. "strconv"
  22. "strings"
  23. "sync"
  24. "time"
  25. )
  26.  
  // check panics with e when e is a non-nil error and does nothing otherwise.
  // It is the program's fail-fast helper for errors it cannot recover from.
  func check(e error) {
  	if e == nil {
  		return
  	}
  	panic(e)
  }
  32.  
  // duration formats the time elapsed since t as
  // "<days>d <hours>h <minutes>m <seconds>.<tenths>s", rounded to the nearest
  // tenth of a second. When t lies in the future the magnitude is formatted
  // and only the day component carries the minus sign.
  func duration(t time.Time) string {
  	const (
  		tenth = 100 * time.Millisecond
  		day   = 24 * time.Hour
  	)

  	elapsed := time.Since(t)
  	negative := elapsed < 0
  	if negative {
  		elapsed = -elapsed
  	}
  	// Adding half a tenth turns the truncating divisions below into rounding.
  	elapsed += tenth / 2

  	days := elapsed / day
  	if negative {
  		days = -days
  	}
  	elapsed %= day

  	hours := elapsed / time.Hour
  	elapsed %= time.Hour

  	minutes := elapsed / time.Minute
  	elapsed %= time.Minute

  	seconds := elapsed / time.Second
  	elapsed %= time.Second

  	tenths := elapsed / tenth

  	return fmt.Sprintf("%dd %dh %dm %d.%ds", days, hours, minutes, seconds, tenths)
  }
  56.  
  // loginNicknameRe captures the double-quoted nickname that follows the
  // phrase `использован никнейм` in a report line. It is compiled once at
  // package initialisation instead of on every getLogin call.
  var loginNicknameRe = regexp.MustCompile(`использован никнейм "(.*?)"`)

  // getLogin extracts the nickname from a report line.
  //
  // It returns the text between the double quotes that follow
  // `использован никнейм` (which may be empty), or the literal string "null"
  // when the line does not contain that phrase.
  func getLogin(line string) string {
  	if matches := loginNicknameRe.FindStringSubmatch(line); matches != nil {
  		return matches[1]
  	}
  	return "null"
  }
  65.  
  // getDomainLevel returns the number of dot-separated labels in host, not
  // counting a leading "www" label. host is rendered with fmt's default
  // formatting before being split, so any value type is accepted.
  func getDomainLevel(host interface{}) int {
  	labels := strings.Split(fmt.Sprint(host), ".")
  	level := len(labels)
  	if labels[0] == "www" {
  		level--
  	}
  	return level
  }
  74.  
  // toCharStr converts a 1-based column number into its spreadsheet column
  // name: 1 -> "A", 26 -> "Z", 27 -> "AA", 28 -> "AB", 703 -> "AAA".
  //
  // Numbers above 26 roll over into multi-letter names, so callers can
  // address columns past "Z". Values below 1 have no column name and yield
  // an empty string.
  func toCharStr(i int) string {
  	if i < 1 {
  		return ""
  	}

  	// Column names are a bijective base-26 numeral: there is no zero digit,
  	// hence the decrement before each division step.
  	var letters []byte
  	for i > 0 {
  		i--
  		letters = append(letters, byte('A'+i%26))
  		i /= 26
  	}

  	// Digits were produced least-significant first; reverse them in place.
  	for l, r := 0, len(letters)-1; l < r; l, r = l+1, r-1 {
  		letters[l], letters[r] = letters[r], letters[l]
  	}
  	return string(letters)
  }
  78.  
  // Stripos is a port of PHP's stripos(): it returns the byte index of the
  // first case-insensitive occurrence of needle in haystack, searching from
  // offset, or -1 when there is no such occurrence.
  //
  // A negative offset counts back from the end of haystack. The function
  // returns -1 when haystack is empty or when offset lies outside it in
  // either direction. The returned index is relative to the start of the
  // whole haystack, not to offset.
  func Stripos(haystack, needle string, offset int) int {
  	length := len(haystack)
  	if length == 0 || offset > length || -offset > length {
  		return -1
  	}
  	// Normalise a negative offset BEFORE slicing; slicing with a negative
  	// index would panic.
  	if offset < 0 {
  		offset += length
  	}
  	pos := strings.Index(strings.ToLower(haystack[offset:]), strings.ToLower(needle))
  	if pos == -1 {
  		return -1
  	}
  	return pos + offset
  }
  95.  
  96. func sortingHrefArrFollow(hrefsRel map[int]map[string]string, fileLinksList []string) []string {
  97. result := make([]string, 0)
  98. for _, href := range fileLinksList {
  99. link := ""
  100. rel := ""
  101. for _, v1 := range hrefsRel {
  102. if Stripos(v1["href"], href, 0) != -1 {
  103. link = v1["href"]
  104. rel = v1["rel"]
  105. break
  106. }
  107. }
  108. if link != "" {
  109. if Stripos(rel, "nofollow", 0) != -1 {
  110. result = append(result, "nofollow")
  111. } else {
  112. result = append(result, "dofollow")
  113. }
  114. } else {
  115. result = append(result, "no_link")
  116. }
  117. }
  118. return result
  119. }
  120.  
  121. func extractLinks(doc *goquery.Document, sourceUrl string) map[int]map[string]string {
  122. data := map[int]map[string]string{}
  123. doc.Find("a[href]").Each(func(index int, item *goquery.Selection) {
  124. href, ok := item.Attr("href")
  125. if ok {
  126. urlParsed, _ := url.Parse(strings.TrimSpace(href))
  127. sourceUrlParsed, _ := url.Parse(strings.TrimSpace(sourceUrl))
  128. if urlParsed != nil && sourceUrlParsed != nil {
  129. if urlParsed.Hostname() != "" && sourceUrlParsed.Hostname() != "" {
  130. if len(urlParsed.Hostname()) > 4 && sourceUrlParsed.Hostname() != urlParsed.Hostname() {
  131. data[index] = map[string]string{}
  132. data[index]["href"] = urlParsed.Hostname() + urlParsed.Path
  133. data[index]["rel"] = item.AttrOr("rel", "none")
  134. }
  135. }
  136. }
  137. }
  138. })
  139. return data
  140. }
  141.  
  142. func fetchUrl(lines chan string, wg *sync.WaitGroup, chSuccessUrls chan map[string]interface{}, chFailedUrls chan map[string]interface{}, chIsFinished chan bool, fileLinksList []string) {
  143.  
  144. for line := range lines {
  145.  
  146. urlString := xurls.Relaxed().FindString(line)
  147. login := getLogin(line)
  148. password := Password
  149. recaptha := strings.Contains(line, "ReCaptcha2")
  150.  
  151. data := make(map[string]interface{})
  152.  
  153. // http request
  154. client := &http.Client{
  155. Timeout: time.Duration(SiteTimeout) * time.Second,
  156. CheckRedirect: func(req *http.Request, via []*http.Request) error {
  157. return http.ErrUseLastResponse
  158. },
  159. }
  160. http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{
  161. InsecureSkipVerify: true,
  162. DynamicRecordSizingDisabled: true,
  163. }
  164. request, _ := http.NewRequest("GET", strings.TrimSpace(urlString), nil)
  165. request.Header.Set("User-Agent", userAgent)
  166. request.Header.Add("Accept-Charset", "utf-8")
  167. response, getError := client.Do(request)
  168. request.Close = true
  169. // http request
  170.  
  171. if getError != nil {
  172. data["url"] = urlString
  173. chFailedUrls <- data
  174. chIsFinished <- true
  175. continue
  176. }
  177.  
  178. if response != nil || response.StatusCode == http.StatusOK {
  179.  
  180. contentType := response.Header.Get("Content-Type")
  181. utf8reader, utf8readerError := charset.NewReader(response.Body, contentType)
  182.  
  183. if utf8readerError != nil {
  184. data["url"] = urlString
  185. chFailedUrls <- data
  186. chIsFinished <- true
  187. continue
  188. }
  189.  
  190. body, bodyReadError := ioutil.ReadAll(utf8reader)
  191.  
  192. if bodyReadError != nil {
  193. data["url"] = urlString
  194. chFailedUrls <- data
  195. chIsFinished <- true
  196. continue
  197. }
  198.  
  199. bodyString := bytes.NewReader(body)
  200.  
  201. document, documentError := goquery.NewDocumentFromReader(bodyString)
  202.  
  203. if documentError != nil {
  204. data["url"] = urlString
  205. chFailedUrls <- data
  206. chIsFinished <- true
  207. continue
  208. }
  209.  
  210. data["url"] = strings.TrimSpace(urlString)
  211.  
  212. urlParsed, urlParsedError := url.Parse(strings.TrimSpace(urlString))
  213.  
  214. if urlParsedError != nil {
  215. data["url"] = urlString
  216. chFailedUrls <- data
  217. chIsFinished <- true
  218. continue
  219. }
  220.  
  221. data["host"] = strings.TrimSpace(urlParsed.Hostname())
  222.  
  223. links := extractLinks(document, urlString)
  224.  
  225. relType := sortingHrefArrFollow(links, fileLinksList)
  226.  
  227. data["href_rel"] = relType
  228.  
  229. data["domain_level"] = strconv.Itoa(getDomainLevel(data["host"]))
  230.  
  231. data["status_code"] = strconv.Itoa(response.StatusCode)
  232.  
  233. if lang, _ := document.Find("html").First().Attr("lang"); lang != "" {
  234. data["lang"] = lang
  235. } else {
  236. data["lang"] = "null"
  237. }
  238.  
  239. if metaRobots, _ := document.Find("meta[name=robots]").First().Attr("content"); metaRobots != "" {
  240. data["meta_robots"] = metaRobots
  241. } else {
  242. data["meta_robots"] = "null"
  243. }
  244.  
  245. if title := strings.TrimSpace(document.Find("title").Text()); title != "" {
  246. data["title"] = title
  247. } else {
  248. data["title"] = "null"
  249. }
  250.  
  251. if h1 := strings.TrimSpace(document.Find("h1").First().Text()); h1 != "" {
  252. data["h1"] = h1
  253. } else {
  254. data["h1"] = "null"
  255. }
  256.  
  257. data["login"] = login
  258. data["password"] = password
  259. data["recaptcha"] = strconv.FormatBool(recaptha)
  260. data["yandex_iks"] = GetYandexIks(data["host"])
  261.  
  262. chSuccessUrls <- data
  263. chIsFinished <- true
  264. continue
  265. //fmt.Println(urlString, login, password, recaptha, response.StatusCode)
  266.  
  267. }
  268.  
  269. }
  270.  
  271. wg.Done()
  272.  
  273. }
  274.  
  275. func main() {
  276.  
  277. flag.Usage = func() {
  278. fmt.Printf("Usage: %s [OPTIONS] argument ...\n", os.Args[0])
  279. flag.PrintDefaults()
  280. }
  281.  
  282. flag.IntVar(&Workers, "w", Workers, "количество потоков")
  283. flag.IntVar(&ReportPeriod, "r", ReportPeriod, "частота отчетов (сек)")
  284. flag.StringVar(&StartFile, "f", StartFile, "файл отчета")
  285. flag.StringVar(&Password, "p", Password, "пароль использованный при регистрации")
  286. flag.IntVar(&SiteTimeout, "t", SiteTimeout, "максимальное время ответа сайта (сек)")
  287. flag.Parse()
  288.  
  289. if Password == "" {
  290. flag.Usage()
  291. os.Exit(1)
  292. }
  293.  
  294. start := time.Now()
  295. lines := make(chan string, 0)
  296. fileLinesList := make(map[int]string, 0)
  297. fileLinksList := make([]string, 0)
  298. chFailedUrls := make(chan map[string]interface{})
  299. chSuccessUrls := make(chan map[string]interface{})
  300. chIsFinished := make(chan bool)
  301. failedUrls := make([]interface{}, 0)
  302. successUrls := make([]interface{}, 0)
  303. endData := make([]map[string]interface{}, 0)
  304. linesCount := 0
  305. totalUrlsCount := 0
  306. successUrlsCount := 0
  307. failedUrlsCount := 0
  308. var wg sync.WaitGroup
  309.  
  310. ticker := time.NewTicker(time.Duration(ReportPeriod) * time.Second)
  311. defer ticker.Stop()
  312.  
  313. startFile, startFileError := os.Open(FilesPath + StartFile)
  314.  
  315. check(startFileError)
  316.  
  317. defer startFile.Close()
  318.  
  319. startFileDecoder := transform.NewReader(startFile, charmap.Windows1251.NewDecoder())
  320. startFileScanner := bufio.NewScanner(startFileDecoder)
  321.  
  322. for startFileScanner.Scan() {
  323. line := startFileScanner.Text()
  324. fileLinesList[linesCount] = line
  325. linesCount++
  326. }
  327.  
  328. linksFile, linksFileError := os.Open(FilesPath + LinksFile)
  329.  
  330. check(linksFileError)
  331.  
  332. defer linksFile.Close()
  333.  
  334. linksFileDecoder := transform.NewReader(linksFile, charmap.Windows1251.NewDecoder())
  335. linksFileScanner := bufio.NewScanner(linksFileDecoder)
  336.  
  337. for linksFileScanner.Scan() {
  338. link := linksFileScanner.Text()
  339. fileLinksList = append(fileLinksList, link)
  340. }
  341.  
  342. fmt.Println()
  343. fmt.Println("Ссылок в отчете: ", linesCount)
  344. fmt.Println()
  345.  
  346. for i := 0; i < Workers; i++ {
  347. wg.Add(1)
  348. go fetchUrl(lines, &wg, chSuccessUrls, chFailedUrls, chIsFinished, fileLinksList)
  349. }
  350.  
  351. go func() {
  352. for i := 0; i < linesCount; {
  353. select {
  354. case data := <-chFailedUrls:
  355. failedUrls = append(failedUrls, data["url"])
  356. failedUrlsCount++
  357. case data := <-chSuccessUrls:
  358. successUrls = append(successUrls, data["url"])
  359. endData = append(endData, data)
  360. successUrlsCount++
  361. case <-chIsFinished:
  362. i++
  363. totalUrlsCount++
  364. case <-ticker.C:
  365. fmt.Printf("Processed %d of %d (%d records / sec) | Success urls: %d | Failed urls: %d | Wasted time: %v\n", i, linesCount, totalUrlsCount / ReportPeriod, successUrlsCount, failedUrlsCount, duration(start))
  366. totalUrlsCount = 0
  367. }
  368. }
  369. }()
  370.  
  371. for _, value := range fileLinesList {
  372. lines <- value
  373. }
  374.  
  375. go func() {
  376. defer close(lines)
  377. wg.Wait()
  378. }()
  379.  
  380. f := excelize.NewFile()
  381. // Create a new sheet.
  382. index := f.NewSheet("Sheet1")
  383. // Set value of a cell.
  384. style, _ := f.NewStyle(`{"fill":{"type":"pattern","color":["#cccccc"],"pattern":1},"font":{"bold":true},"alignment":{"horizontal":"center"}}`)
  385. _ = f.SetCellStyle("Sheet1", "A1", "L1", style)
  386. _ = f.SetColWidth("Sheet1", "A", "L", 30)
  387.  
  388. _ = f.SetCellValue("Sheet1", "A1", "URL")
  389. _ = f.SetCellValue("Sheet1", "B1", "HOST")
  390. _ = f.SetCellValue("Sheet1", "C1", "DOMAIN LEVEL")
  391. _ = f.SetCellValue("Sheet1", "D1", "LANG")
  392. _ = f.SetCellValue("Sheet1", "E1", "ROBOTS")
  393. _ = f.SetCellValue("Sheet1", "F1", "TITLE")
  394. _ = f.SetCellValue("Sheet1", "G1", "H1")
  395. _ = f.SetCellValue("Sheet1", "H1", "YANDEX X")
  396. _ = f.SetCellValue("Sheet1", "I1", "RECAPTCHA")
  397. _ = f.SetCellValue("Sheet1", "J1", "USER LOGIN")
  398. _ = f.SetCellValue("Sheet1", "K1", "USER PASSWORD")
  399.  
  400. for i2 := 0; i2 < len(fileLinksList); i2++ {
  401. _ = f.SetCellValue("Sheet1", toCharStr(i2+12)+"1", "HREF_REL "+strconv.Itoa(i2+1))
  402. _ = f.SetCellStyle("Sheet1", toCharStr(i2+12)+"1", toCharStr(i2+12)+"1", style)
  403. _ = f.SetColWidth("Sheet1", toCharStr(i2+12), toCharStr(i2+12), 30)
  404. }
  405.  
  406. for i := 0; i < len(endData); i++ {
  407. _ = f.SetCellValue("Sheet1", "A"+strconv.Itoa(i+2), endData[i]["url"])
  408. _ = f.SetCellValue("Sheet1", "B"+strconv.Itoa(i+2), endData[i]["host"])
  409. _ = f.SetCellValue("Sheet1", "C"+strconv.Itoa(i+2), endData[i]["domain_level"])
  410. _ = f.SetCellValue("Sheet1", "D"+strconv.Itoa(i+2), endData[i]["lang"])
  411. _ = f.SetCellValue("Sheet1", "E"+strconv.Itoa(i+2), endData[i]["meta_robots"])
  412. _ = f.SetCellValue("Sheet1", "F"+strconv.Itoa(i+2), endData[i]["title"])
  413. _ = f.SetCellValue("Sheet1", "G"+strconv.Itoa(i+2), endData[i]["h1"])
  414. _ = f.SetCellValue("Sheet1", "H"+strconv.Itoa(i+2), endData[i]["yandex_iks"])
  415. _ = f.SetCellValue("Sheet1", "I"+strconv.Itoa(i+2), endData[i]["recaptcha"])
  416. _ = f.SetCellValue("Sheet1", "J"+strconv.Itoa(i+2), endData[i]["login"])
  417. _ = f.SetCellValue("Sheet1", "K"+strconv.Itoa(i+2), endData[i]["password"])
  418. hrefRel := reflect.ValueOf(endData[i]["href_rel"])
  419. for h := 0; h < hrefRel.Len(); h++ {
  420. _ = f.SetCellValue("Sheet1", toCharStr(h+12)+strconv.Itoa(i+2), hrefRel.Index(h))
  421. }
  422.  
  423. }
  424.  
  425. // Set active sheet of the workbook.
  426. f.SetActiveSheet(index)
  427. // Save spreadsheet by the given path.
  428. if resultSaveFileError := f.SaveAs(FilesPath + "Result.xlsx"); resultSaveFileError != nil {
  429. panic(resultSaveFileError)
  430. }
  431.  
  432. fmt.Println()
  433. fmt.Println("Good urls: ", len(successUrls))
  434. fmt.Println("Bad urls: ", len(failedUrls))
  435. fmt.Println()
  436. fmt.Println("END!!! Work took", duration(start))
  437. fmt.Println()
  438.  
  439. }
  440.  
RAW Paste Data