Advertisement
RuneB

tweetParser

Feb 22nd, 2017
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 3.24 KB | None | 0 0
  1. import de.bezier.data.sql.*;
  2. import http.requests.*;
  3. import java.util.regex.*;
  4.  
  5. SQLite db; // database handle; setup() opens ../medtek2017kode/data/tweets1.db through it
  6. int imageIndex, userNameIndex; // indexOf() results for the photo-container div and the data-screen-name attribute in the fetched page (-1 when absent)
  7. long id; // "id" column of the tweet row currently being processed
  8. String user; // "user" column of the tweet row currently being processed
  9. String notExisting = ""; // accumulated "(user, id), " entries for tweets whose page had no data-screen-name marker
  10. Pattern urlPattern; // URL-matching regex, compiled once in setup()
  11. Matcher m; // matcher of urlPattern over the HTML snippet following the photo-container marker
  12. PrintWriter output; // writer for tweetData.txt, which receives the generated INSERT statements
  13. int count = 0; // number of tweet rows processed so far
  14. int tweetImgCount = 0; // number of tweets whose page contained the photo-container marker
  15. int notExistingCount = 0; // number of tweets whose page had no data-screen-name marker
  16. int successCount = 0; // number of INSERT statements written to output
  17.  
  18. void setup() {
  19.   output = createWriter("tweetData.txt");
  20.   urlPattern = Pattern.compile(
  21.   "(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)"
  22.     + "(([\\w\\-]+\\.){1,}?([\\w\\-.~]+\\/?)*"
  23.     + "[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)",
  24.   Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
  25.  
  26.   db = new SQLite( this, "../medtek2017kode/data/tweets1.db" );
  27.   if ( db.connect() ) {
  28.  
  29.     String Q = "select distinct id, user from tweet limit 10";
  30.     db.query(Q);
  31.     String urlMatcher;
  32.  
  33.     while (db.next ()) {
  34.       count++;
  35.       user = db.getString("user");
  36.       id = db.getLong("id");
  37.  
  38.       String imgUrl = null;
  39.       String imgContent = null;
  40.       String request = "https://twitter.com/"+user+"/status/"+id;
  41.       Boolean retweet = false;
  42.  
  43.       //Send request til twitter.com/{<brugernavn>}/status/{<id>}
  44.       GetRequest get = new GetRequest(request);
  45.       get.send();
  46.  
  47.       imageIndex = get.getContent().indexOf("<div class=\"AdaptiveMedia-photoContainer js-adaptive-photo");
  48.       userNameIndex = get.getContent().indexOf("data-screen-name=\"");  
  49.  
  50.       if (imageIndex != -1) {
  51.         tweetImgCount++;
  52.         imgContent = get.getContent().substring(imageIndex, imageIndex+200);
  53.         //println(imgContent);
  54.         m = urlPattern.matcher(imgContent);
  55.         while (m.find ()) {
  56.           int matchStart = m.start(1);
  57.           int matchEnd = m.end();
  58.           imgUrl = imgContent.substring(matchStart, matchEnd);
  59.         }
  60.       }
  61.       print("---------------\nAll Count:", count, "With img:", tweetImgCount, " | ");
  62.       if (userNameIndex != -1) {
  63.         imgContent = get.getContent().substring(userNameIndex, userNameIndex+200);
  64.         if (!imgContent.contains(user))
  65.           retweet = true;
  66.  
  67.         println(user, id);
  68.         println("Request:", request);
  69.         if (imgUrl != null)
  70.           println("ImgUrl: ", imgUrl, "\n"+"Retweet/response:", retweet);
  71.       } else {
  72.         notExisting += "("+user+", "+id+ "), ";
  73.         println("\n"+"Not existing:", ++notExistingCount);
  74.       }
  75.       if (imageIndex != -1 && userNameIndex != -1 && !retweet) {
  76.         println("Added:", ++successCount);
  77.         output.println("INSERT INTO image VALUES ("+ id +", '"+imgUrl+"');");
  78.       } else println();
  79.     }
  80.  
  81.     //println("User "+count+": "+user+"\n    Profile img: "+ ppUrl + "\n    Tweets: " +tweets + "\n    Following: " +following + "\n    Followers: " +followers  + "\n    Likes: " +likes  + "\n    Description: " +description);
  82.     //println("Not existing: "+ notExisting +"\n");
  83.   }
  84.   output.flush(); // Writes the remaining data to the file
  85.   output.close();
  86.   println("\n"+"Not existing: ", notExisting);
  87.  
  88.   int x = millis()/1000;
  89.   int seconds = x % 60;
  90.   x /= 60;
  91.   int minutes = x % 60;
  92.   x /= 60;
  93.   int hours = x % 24;
  94.   x /= 24;
  95.   int days = x;
  96.   println("Done in", x, "days", hours, "hours", minutes, "minutes and", seconds, "seconds"  );
  97.   exit();
  98. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement