Advertisement
RuneB

twitterHtmlUserScraper

Feb 21st, 2017
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 6.02 KB | None | 0 0
  1. import de.bezier.data.sql.*;
  2. import http.requests.*;
  3. import java.util.regex.*;
  4.  
  5. SQLite db; // database handle; setup() opens tweets1.db with it and reads the distinct user names
  6. int profilePicIndex, tweetIndex, followingIndex, followersIndex, likesIndex, descriptionIndex; // indexOf() results for each section's HTML marker in the fetched profile page (-1 when the marker is absent)
  7. String imgContent, ppUrl, user, tweets, following, followers, likes, description, notExisting; // per-user scrape results; notExisting accumulates, comma-separated, the users that were skipped
  8. Pattern urlPattern; // URL-matching regex, compiled once in setup()
  9. Matcher m; // matcher run over imgContent to pull out the profile picture URL
  10. PrintWriter output; // writer for userData.txt, which receives one INSERT statement per scraped user
  11. int count = 1; // user counter; starts at 1 and is incremented before each user is handled, so the first user is reported as 2
  12.  
  13. void setup() {
  14.   output = createWriter("userData.txt");
  15.   notExisting = "";
  16.   //En regex til at matche url'er
  17.   urlPattern = Pattern.compile(
  18.   "(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)"
  19.     + "(([\\w\\-]+\\.){1,}?([\\w\\-.~]+\\/?)*"
  20.     + "[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)",
  21.   Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
  22.  
  23.   db = new SQLite( this, "../medtek2017kode/data/tweets1.db" );
  24.   if ( db.connect() ) {
  25.  
  26.     String Q = "select distinct user from user";
  27.     db.query(Q);
  28.     String urlMatcher;
  29.  
  30.     while (db.next ()) {
  31.       count++;
  32.       user = db.getString("user");
  33.       print(user, " | ");
  34.  
  35.       //request til twitter.com/<brugernavn>
  36.       GetRequest get = new GetRequest("https://twitter.com/"+user);
  37.       get.send();
  38.  
  39.       //PROFILE PICTURE
  40.       profilePicIndex = get.getContent().indexOf("<a class=\"ProfileAvatar-container u-block js-tooltip profile-picture\"");  
  41.       if (profilePicIndex < 0) {
  42.         notExisting += user+", ";
  43.         println();
  44.         continue;
  45.       }
  46.  
  47.       imgContent = get.getContent().substring(profilePicIndex, profilePicIndex+1000);
  48.  
  49.       //Java's pattern og matcher klasser bruges til at klippe billedets url ud af html indholdet.
  50.       m = urlPattern.matcher(imgContent);
  51.       while (m.find ()) {
  52.         int matchStart = m.start(1);
  53.         int matchEnd = m.end();
  54.         ppUrl = imgContent.substring(matchStart, matchEnd);
  55.       }
  56.  
  57.       //TWEETS
  58.       tweetIndex = get.getContent().indexOf("<li class=\"ProfileNav-item ProfileNav-item--tweets is-active\">");
  59.       tweets = (tweetIndex > -1) ? get.getContent().substring(tweetIndex+178, tweetIndex+211).replaceAll("[^0-9.]", "") : "0";
  60.  
  61.       //FOLLOWING      
  62.       followingIndex = get.getContent().indexOf("<li class=\"ProfileNav-item ProfileNav-item--following\">");
  63.       following = (followingIndex > -1) ? get.getContent().substring(followingIndex+214, followingIndex+224).replaceAll("[^0-9.]", "") : "0";
  64.  
  65.       //FOLLOWERS    
  66.       followersIndex = get.getContent().indexOf("<li class=\"ProfileNav-item ProfileNav-item--followers\">");
  67.       followers = (followersIndex > -1) ? get.getContent().substring(followersIndex+214, followersIndex+224).replaceAll("[^0-9.]", "") : "0";
  68.  
  69.       //LIKES    
  70.       likesIndex = get.getContent().indexOf("<li class=\"ProfileNav-item ProfileNav-item--favorites\" data-more-item=\".ProfileNav-dropdownItem--favorites\">");
  71.       likes = (likesIndex > -1) ? get.getContent().substring(likesIndex+267, likesIndex+275).replaceAll("[^0-9.]", "") : "0";
  72.  
  73.       //DESCRIPTION    
  74.       descriptionIndex = get.getContent().indexOf("<p class=\"ProfileHeaderCard-bio");
  75.       description = (descriptionIndex > -1) ? get.getContent().substring(descriptionIndex+62, descriptionIndex+2000) : null;
  76.       description = description.substring(0, description.indexOf("</p>"));
  77.  
  78.       //<a href="/hashtag/dkpol?src=hash" data-query-source="hashtag_click" class="twitter-hashtag pretty-link js-nav" dir="ltr" ><s>#</s><b>dkpol</b></a>
  79.       //FJERN STARTEN Pƅ HASHTAG HVIS != NULL
  80.       description = (description.length() < 2) ? null : "'" + description.replaceAll("<a href=\"/hashtag/", "#").replaceAll("<a href=\"/", "@").replaceAll("<a href=\"", "") + "'";
  81.       String temp = "";
  82.       if (description != null) {
  83.         StringBuffer s = new StringBuffer(description);
  84.  
  85.  
  86.         //while (s.indexOf ("?src=hash") > 0) {
  87.         //  s.replace(s.indexOf("?src=hash"), s.indexOf("?src=hash")+125, " ");
  88.         //}
  89.         //OMKSKRIV LINKS
  90.         while (s.indexOf ("rel=\"nofollow noopener\" dir=\"ltr") > 0) {
  91.           s.replace(s.indexOf("rel=\"nofollow noopener\" dir=\"ltr")-2, s.indexOf("rel=\"nofollow noopener\" dir=\"ltr")+331, " ");
  92.         }
  93.  
  94.         //OMSKRIV LINKS TIL BRUGERE
  95.         if (s.indexOf ("\" class=\"tweet-url twitter-atreply pretty-link\"") > 0) {
  96.           String[][] matches = matchAll(s.toString(), "<s>@</s><b>(.*?)</b></a>");
  97.           println();
  98.           for (int i = 0; i < matches.length; i++) {
  99.             println("Match: "+matches[i][0]);
  100.             description = (s.substring(0, s.indexOf("\" class=\"tweet-url twitter-atreply pretty-link\""))+s.substring(s.indexOf(matches[i][0])+matches[i][0].length()));
  101.             s = new StringBuffer(description);
  102.           }
  103.         }
  104.  
  105.         //FJERN EMOJI
  106.         while (s.indexOf ("<img class=\"Emoji Emoji--forText\"") > 0) {
  107.           description = (s.substring(0, s.indexOf("<img class=\"Emoji Emoji--forText\""))+s.substring(s.indexOf("Emoji:")+7));
  108.           s = new StringBuffer(description.replace("\">", ""));
  109.         }
  110.  
  111.         //OMKSKRIV RESTEN AF HASHTAGS
  112.         while (s.indexOf ("?src=hash") > 0) {
  113.           String[][] matches = matchAll(s.toString(), "<s>#</s><b>(.*?)</b></a>");
  114.           println();
  115.           for (int i = 0; i < matches.length; i++) {
  116.             println("Match: "+matches[i][0]);
  117.             description = (s.substring(0, s.indexOf("?src=hash"))+s.substring(s.indexOf(matches[i][0])+matches[i][0].length()));
  118.             s = new StringBuffer(description);
  119.           }
  120.         }
  121.         description = s.toString().replace("  ", " ");
  122.       }
  123.  
  124.  
  125.       println("User "+count+": "+user+"\n    Profile img: "+ ppUrl + "\n    Tweets: " +tweets + "\n    Following: " +following + "\n    Followers: " +followers  + "\n    Likes: " +likes  + "\n    Description: " +description);
  126.       println("Not existing: "+ notExisting +"\n");
  127.       output.println("INSERT INTO user1 VALUES ('"+ user +"', "+tweets+", "+following+", "+followers+", "+ likes+", "+description+", '"+ppUrl+"');");
  128.     }
  129.     output.flush(); // Writes the remaining data to the file
  130.     output.close();
  131.     exit();
  132.   }
  133. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement