Advertisement
Guest User

Untitled

a guest
Oct 28th, 2016
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Go 9.08 KB | None | 0 0
  1. package main
  2.  
  3. import (
  4.     "os"
  5.     "fmt"
  6.     "time"
  7.     "regexp"
  8.     "net/http"
  9.     "database/sql"
  10.     //"encoding/json"
  11.     "golang.org/x/net/html"
  12.     "github.com/nu7hatch/gouuid"
  13.     _ "github.com/go-sql-driver/mysql"
  14. )
  15.  
  16. // Global start time used to time the paplications runtime
  17. var startTimer time.Time
  18. var domain string
  19.  
  20. func main() {
  21.  
  22.     domain = os.Args[1]
  23.  
  24.     response := getWebsiteContent(&domain);
  25.     z := html.NewTokenizer(response.Body)
  26.     inElement := false
  27.  
  28.     title := ""
  29.     link := ""
  30.     content := ""
  31.  
  32.     for {
  33.         tt := z.Next()
  34.  
  35.         switch {
  36.             // StartTagToken == The start of a DOM object
  37.             case tt == html.StartTagToken:
  38.  
  39.                 t := z.Token()
  40.  
  41.                 // Check if the token is a hyperlink
  42.                 elementMatch := t.Data == "a"
  43.                 if elementMatch {
  44.  
  45.                     // If the Token is a StartTagToken of type a, we clear the title and link
  46.                     title = ""
  47.                     link = ""
  48.  
  49.                     inElement = true
  50.  
  51.                     // Scan the tag for title and href                
  52.                     for _, attr := range t.Attr {
  53.  
  54.                         if attr.Key == "title" {
  55.                              title = attr.Val
  56.                         }
  57.  
  58.                         if attr.Key == "href" {
  59.                              link = attr.Val
  60.                         }
  61.                     }
  62.                 }
  63.  
  64.             // TextToken == The content within the DOM element
  65.             case tt == html.TextToken:
  66.  
  67.                 if inElement {
  68.  
  69.                     // if we enter a text token we clear the content field
  70.                     content = ""
  71.  
  72.                     rawBytes := z.Text()
  73.                     byteLength := len(rawBytes)
  74.  
  75.                     // if there is only one letter in this text block and it's a "new line"
  76.                     if byteLength == 1 && rawBytes[0] == 10{
  77.                         continue;
  78.                     }
  79.  
  80.                     // Only do stuff if there is actually some bytes in the text block
  81.                     if byteLength > 0 {
  82.  
  83.                         // strip new line characters from text
  84.                         for i, bytes := range rawBytes {
  85.                             if bytes == 10{
  86.                                 rawBytes[i] = 0
  87.                             }
  88.                         }
  89.  
  90.                        content = string(rawBytes)
  91.                     }
  92.                 }
  93.  
  94.             // EndTagToken == The end of a DOM object
  95.             case tt == html.EndTagToken:
  96.  
  97.                 t := z.Token()
  98.  
  99.                 // check if the token is a hyperlink
  100.                 elementMatch := t.Data == "a"
  101.                 if elementMatch {
  102.                     inElement = false
  103.                     // once we are sure we are exiting a hyperlink, we process the element
  104.                     //go processElement(title, link, content)
  105.                     // using non-async version for output testing
  106.                     processElement(title, link, content, domain)
  107.                 }
  108.  
  109.             case tt == html.ErrorToken:
  110.                 // the ErrorToken signals the the end of the document
  111.                 stopProcessing()
  112.                 return
  113.         }
  114.  
  115.     }
  116.  
  117. }
  118.  
  119. // We do not pass data by reference because this is ran a-sync from everything else
  120. func processElement(title string, link string, content string, domain string){
  121.  
  122.     shouldSave := matchLinkToTags(&title, &link, &content)
  123.     shouldNotSave := matchLinkToBadTags(&title, &link, &content)
  124.  
  125.     if shouldSave == true && shouldNotSave == false {
  126.  
  127.         // we need to do this because the content can be a long string, not sutable to be stored in a varchar
  128.         // this way we can't set unique
  129.         lookupTable := "content"
  130.         _,failed,_ := getLink(&lookupTable, &content)
  131.  
  132.         if failed {
  133.  
  134.             msg, err := saveLink(&link, &title, &content, &domain)
  135.             if err == false {
  136.                 // turn into logging ?
  137.                 fmt.Println("Failed saving link: ", link, " /// error: ", msg)
  138.             }
  139.  
  140.             if err != false {
  141.                 // turn into logging ?
  142.                 fmt.Println("Link: " , link, " /// Title: ", title, " /// Content: ", content)
  143.             }
  144.         }
  145.     }
  146. }
  147.  
  148. func matchLinkToTags(title *string, link *string, content *string) bool{
  149.     words := []string{"Poke","Poké","Pokemon","Pokémon","pokemon-go","Pokemon GO","Pokémon GO","pokémon-go"}
  150.  
  151.     for _, word := range words {
  152.         matchTitle := regx(title, &word)
  153.         matchContent := regx(content, &word)
  154.         matchLink := regx(link, &word)
  155.  
  156.         if matchTitle == true || matchContent == true || matchLink == true {
  157.             return true
  158.         }
  159.     }
  160.  
  161.     return false
  162. }
  163.  
  164. func matchLinkToBadTags(title *string, link *string, content *string) bool{
  165.     words := []string{"facebook.com","twitter.com","linkedin.com","plusone.google.com", "Share on LinkedIn"}
  166.  
  167.     for _, word := range words {
  168.         matchTitle := regx(title, &word)
  169.         matchContent := regx(content, &word)
  170.         matchLink := regx(link, &word)
  171.  
  172.         if matchTitle == true || matchContent == true || matchLink == true {
  173.             return true
  174.         }
  175.     }
  176.  
  177.     return false
  178. }
  179.  
  180. // regexp check functions
  181. func regx(source *string, pattern *string) bool {
  182.  
  183.     matched, err := regexp.MatchString(*pattern, *source)
  184.     if err != nil {
  185.         // if we crap out, we return false
  186.         return false
  187.     }
  188.  
  189.     // matched is either true or false depending on if we matched our pattern
  190.     return matched
  191.  
  192. }
  193.  
  194. func getWebsiteContent(link *string) *http.Response{
  195.     startTimer = time.Now()
  196.     fmt.Println("Started processing: ", *link)
  197.  
  198.     response, err := http.Get(*link)
  199.  
  200.     if err != nil {
  201.         fmt.Println(err.Error())
  202.         panic(err)
  203.     }
  204.  
  205.     return response
  206. }
  207.  
  208. func stopProcessing(){
  209.     elapsed := time.Since(startTimer)
  210.     fmt.Println("Finished!.. processing time ( ", elapsed, " )")
  211. }
  212.  
  213.  
  214. //==================================
  215. // Link
  216. //==================================
  217. type Link struct {
  218.     Id          string `db:"id",        json:"id",          default:""`
  219.     Link        string `db:"link",      json:"link",        default:""`
  220.     Title       string `db:"title",     json:"title",       default:""`
  221.     Content     string `db:"content",   json:"content",     default:""`
  222.     Domain      string `db:"domain",    json:"domain",      default:""`
  223. }
  224.  
  225. //==================================
  226. // CONNECT TO DB AND RETURN THE CONNECTION
  227. // https://github.com/go-sql-driver/mysql/wiki/Examples
  228. //==================================
  229. func connectToSQL() *sql.DB {
  230.  
  231.     databaseName := "godb"
  232.     host := "localhost"
  233.     port := "3306"
  234.     username := "homestead"
  235.     password := "secret"
  236.     connectionType := "mysql"
  237.     networkType := "tcp"
  238.     charset := "utf8"
  239.  
  240.     db, _ := sql.Open(connectionType, username + ":" + password + "@"+networkType+"("+host+":"+port+")/"+databaseName+"?charset="+charset)
  241.     return db
  242. }
  243.  
  244. //==================================
  245. // GET A LINK BY COLUMN
  246. //==================================
  247. func getLink(column *string, value *string) (*Link, bool, string) {
  248.  
  249.     errorMessage := ""
  250.     failed := false
  251.     // Escape any harmfull characters
  252.     cleanValue := html.EscapeString(*value)
  253.     cleanColumn := html.EscapeString(*column)
  254.  
  255.     // Set a pointer to our user struct
  256.     link := &Link{}
  257.  
  258.     db := connectToSQL()
  259.     defer db.Close()
  260.  
  261.     statement, err := db.Prepare("SELECT * FROM link WHERE " + cleanColumn + "=?")
  262.     if err != nil {
  263.         return link, true, "Error on line 203: " + err.Error()
  264.     }
  265.     defer statement.Close()
  266.  
  267.     // this statement does not return anything other then an error, it will set data by pointers.
  268.     err = statement.QueryRow(cleanValue).Scan(&link.Id, &link.Link, &link.Title, &link.Content)
  269.  
  270.     // Detect errors
  271.     switch err {
  272.         case nil:
  273.         case sql.ErrNoRows:
  274.             // I tried no content as an error too, this way it's easier to check for on the other end.
  275.             failed = true
  276.             errorMessage = "No Content"
  277.         default:
  278.             failed = true
  279.             errorMessage = "Failed query: SELECT * FROM link WHERE " + cleanColumn + "=" + cleanValue + " // with error" + err.Error()
  280.     }
  281.  
  282.     return link, failed, errorMessage
  283. }
  284.  
  285. // is saving but sending
  286. func saveLink(link *string, title *string, content *string, domain *string) (string, bool){
  287.  
  288.     db := connectToSQL()
  289.     defer db.Close()
  290.  
  291.      // Prepare statement for inserting data
  292.     stmtIns, err1 := db.Prepare("INSERT INTO link VALUES( ?, ?, ?, ?, ? )")
  293.     if err1 != nil {
  294.         return err1.Error(), false
  295.     }
  296.     defer stmtIns.Close()
  297.  
  298.     // make uuid
  299.     u, err2 := uuid.NewV4()
  300.     if err2 != nil {
  301.         return err2.Error(), false
  302.     }
  303.  
  304.     _, err3 := stmtIns.Exec(u.String(), *link, *title, *content, *domain)
  305.     if err3 != nil {
  306.         return err3.Error(), false
  307.     }
  308.  
  309.     return "Success",true
  310. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement