Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package main
- import (
- "os"
- "fmt"
- "time"
- "regexp"
- "net/http"
- "database/sql"
- //"encoding/json"
- "golang.org/x/net/html"
- "github.com/nu7hatch/gouuid"
- _ "github.com/go-sql-driver/mysql"
- )
- // Global start time used to time the application's runtime
// Package-level state shared between the functions in this file.
var (
	// startTimer is set by getWebsiteContent when a fetch begins and read by
	// stopProcessing to report the elapsed time.
	startTimer time.Time

	// domain holds the first command-line argument, i.e. the URL to process.
	domain string
)
- func main() {
- domain = os.Args[1]
- response := getWebsiteContent(&domain);
- z := html.NewTokenizer(response.Body)
- inElement := false
- title := ""
- link := ""
- content := ""
- for {
- tt := z.Next()
- switch {
- // StartTagToken == The start of a DOM object
- case tt == html.StartTagToken:
- t := z.Token()
- // Check if the token is a hyperlink
- elementMatch := t.Data == "a"
- if elementMatch {
- // If the Token is a StartTagToken of type a, we clear the title and link
- title = ""
- link = ""
- inElement = true
- // Scan the tag for title and href
- for _, attr := range t.Attr {
- if attr.Key == "title" {
- title = attr.Val
- }
- if attr.Key == "href" {
- link = attr.Val
- }
- }
- }
- // TextToken == The content within the DOM element
- case tt == html.TextToken:
- if inElement {
- // if we enter a text token we clear the content field
- content = ""
- rawBytes := z.Text()
- byteLength := len(rawBytes)
- // if there is only one letter in this text block and it's a "new line"
- if byteLength == 1 && rawBytes[0] == 10{
- continue;
- }
- // Only do stuff if there is actually some bytes in the text block
- if byteLength > 0 {
- // strip new line characters from text
- for i, bytes := range rawBytes {
- if bytes == 10{
- rawBytes[i] = 0
- }
- }
- content = string(rawBytes)
- }
- }
- // EndTagToken == The end of a DOM object
- case tt == html.EndTagToken:
- t := z.Token()
- // check if the token is a hyperlink
- elementMatch := t.Data == "a"
- if elementMatch {
- inElement = false
- // once we are sure we are exiting a hyperlink, we process the element
- //go processElement(title, link, content)
- // using non-async version for output testing
- processElement(title, link, content, domain)
- }
- case tt == html.ErrorToken:
- // the ErrorToken signals the the end of the document
- stopProcessing()
- return
- }
- }
- }
- // We do not pass data by reference because this is ran a-sync from everything else
- func processElement(title string, link string, content string, domain string){
- shouldSave := matchLinkToTags(&title, &link, &content)
- shouldNotSave := matchLinkToBadTags(&title, &link, &content)
- if shouldSave == true && shouldNotSave == false {
- // we need to do this because the content can be a long string, not sutable to be stored in a varchar
- // this way we can't set unique
- lookupTable := "content"
- _,failed,_ := getLink(&lookupTable, &content)
- if failed {
- msg, err := saveLink(&link, &title, &content, &domain)
- if err == false {
- // turn into logging ?
- fmt.Println("Failed saving link: ", link, " /// error: ", msg)
- }
- if err != false {
- // turn into logging ?
- fmt.Println("Link: " , link, " /// Title: ", title, " /// Content: ", content)
- }
- }
- }
- }
// wantedTagPattern matches any of the keywords that make a link interesting.
// It is compiled once at start-up instead of on every call.
var wantedTagPattern = regexp.MustCompile(
	`Poke|Poké|Pokemon|Pokémon|pokemon-go|Pokemon GO|Pokémon GO|pokémon-go`)

// matchLinkToTags reports whether the title, content or link contains at
// least one of the wanted keywords. Matching is case-sensitive.
func matchLinkToTags(title *string, link *string, content *string) bool {
	return wantedTagPattern.MatchString(*title) ||
		wantedTagPattern.MatchString(*content) ||
		wantedTagPattern.MatchString(*link)
}
// unwantedTagPattern matches the host names and phrases that mark a link as a
// social-media share button. The dots are escaped so they match a literal "."
// only, and the pattern is compiled once at start-up instead of on every call.
var unwantedTagPattern = regexp.MustCompile(
	`facebook\.com|twitter\.com|linkedin\.com|plusone\.google\.com|Share on LinkedIn`)

// matchLinkToBadTags reports whether the title, content or link contains at
// least one of the unwanted host names or phrases. Matching is case-sensitive.
func matchLinkToBadTags(title *string, link *string, content *string) bool {
	return unwantedTagPattern.MatchString(*title) ||
		unwantedTagPattern.MatchString(*content) ||
		unwantedTagPattern.MatchString(*link)
}
// regx reports whether the regular expression *pattern matches anywhere in
// *source. A pattern that fails to compile counts as "no match".
func regx(source *string, pattern *string) bool {
	if found, err := regexp.MatchString(*pattern, *source); err == nil {
		return found
	}
	// The pattern could not be compiled: treat it as not matching.
	return false
}
- func getWebsiteContent(link *string) *http.Response{
- startTimer = time.Now()
- fmt.Println("Started processing: ", *link)
- response, err := http.Get(*link)
- if err != nil {
- fmt.Println(err.Error())
- panic(err)
- }
- return response
- }
- func stopProcessing(){
- elapsed := time.Since(startTimer)
- fmt.Println("Finished!.. processing time ( ", elapsed, " )")
- }
//==================================
// Link
//==================================

// Link holds one hyperlink together with the domain it was collected for.
//
// The struct tags are space-separated key:"value" pairs. With commas between
// the pairs only the first key (db) is visible to reflect.StructTag, and the
// json and default tags are silently ignored.
type Link struct {
	Id      string `db:"id" json:"id" default:""`
	Link    string `db:"link" json:"link" default:""`
	Title   string `db:"title" json:"title" default:""`
	Content string `db:"content" json:"content" default:""`
	Domain  string `db:"domain" json:"domain" default:""`
}
//==================================
// CONNECT TO DB AND RETURN THE CONNECTION
// https://github.com/go-sql-driver/mysql/wiki/Examples
//==================================

// connectToSQL opens a database handle using the hard-coded connection
// settings below and returns it. The caller is responsible for closing it.
//
// It panics when the handle cannot be opened, so the result is never nil.
// Previously the error was discarded and a nil handle returned, which made
// callers crash later with an unrelated nil-pointer dereference.
func connectToSQL() *sql.DB {
	const (
		databaseName   = "godb"
		host           = "localhost"
		port           = "3306"
		username       = "homestead"
		password       = "secret"
		connectionType = "mysql"
		networkType    = "tcp"
		charset        = "utf8"
	)

	// Data source name: user:password@network(host:port)/database?charset=...
	dsn := fmt.Sprintf("%s:%s@%s(%s:%s)/%s?charset=%s",
		username, password, networkType, host, port, databaseName, charset)

	db, err := sql.Open(connectionType, dsn)
	if err != nil {
		// The password is deliberately left out of the message.
		panic(fmt.Errorf("opening %s connection to %s:%s/%s: %w",
			connectionType, host, port, databaseName, err))
	}
	return db
}
- //==================================
- // GET A LINK BY COLUMN
- //==================================
- func getLink(column *string, value *string) (*Link, bool, string) {
- errorMessage := ""
- failed := false
- // Escape any harmfull characters
- cleanValue := html.EscapeString(*value)
- cleanColumn := html.EscapeString(*column)
- // Set a pointer to our user struct
- link := &Link{}
- db := connectToSQL()
- defer db.Close()
- statement, err := db.Prepare("SELECT * FROM link WHERE " + cleanColumn + "=?")
- if err != nil {
- return link, true, "Error on line 203: " + err.Error()
- }
- defer statement.Close()
- // this statement does not return anything other then an error, it will set data by pointers.
- err = statement.QueryRow(cleanValue).Scan(&link.Id, &link.Link, &link.Title, &link.Content)
- // Detect errors
- switch err {
- case nil:
- case sql.ErrNoRows:
- // I tried no content as an error too, this way it's easier to check for on the other end.
- failed = true
- errorMessage = "No Content"
- default:
- failed = true
- errorMessage = "Failed query: SELECT * FROM link WHERE " + cleanColumn + "=" + cleanValue + " // with error" + err.Error()
- }
- return link, failed, errorMessage
- }
- // is saving but sending
- func saveLink(link *string, title *string, content *string, domain *string) (string, bool){
- db := connectToSQL()
- defer db.Close()
- // Prepare statement for inserting data
- stmtIns, err1 := db.Prepare("INSERT INTO link VALUES( ?, ?, ?, ?, ? )")
- if err1 != nil {
- return err1.Error(), false
- }
- defer stmtIns.Close()
- // make uuid
- u, err2 := uuid.NewV4()
- if err2 != nil {
- return err2.Error(), false
- }
- _, err3 := stmtIns.Exec(u.String(), *link, *title, *content, *domain)
- if err3 != nil {
- return err3.Error(), false
- }
- return "Success",true
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement