friendster-scrape-profile

#!/bin/bash
#
# Download a Friendster profile.
# ./friendster-scrape-profile PROFILE_ID
#
# Currently downloads:
#  - the main profile page (profiles.friendster.com/$PROFILE_ID)
#  - the user's profile image from that page
#  - the list of public albums (www.friendster.com/viewalbums.php?uid=$PROFILE_ID)
#  - each of the album pages (www.friendster.com/viewphotos.php?a=$id&uid=$PROFILE_ID)
#  - the original photos from each album
#  - the list of friends (www.friendster.com/friends.php?uid=$PROFILE_ID)
#  - the list of fans (www.friendster.com/fans.php?uid=$PROFILE_ID)
#  - the inverse list of fans (www.friendster.com/fans.php?action=spusers&uid=$PROFILE_ID)
#  - the profile comments (www.friendster.com/comments.php?uid=$PROFILE_ID)
#  - the shoutout stream and shoutout comments
#  - the Friendster blog, if any
#
# Does not currently download anything else (such as the widgets on the profile page).
#
#
# BEFORE USE: enter your Friendster account data in username.txt and password.txt
#
#
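# Example invocation (the profile id below is just a made-up placeholder):
#
#   echo 'you@example.com' > username.txt
#   echo 'your-password' > password.txt
#   ./friendster-scrape-profile 12345678
#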

PROFILE_ID=$1

USERNAME=`cat username.txt`
PASSWORD=`cat password.txt`
# remove spaces
USERNAME=${USERNAME// /}
PASSWORD=${PASSWORD// /}


if [[ ! $USERNAME =~ @ ]]
then
  echo "Enter your username (your Friendster email) in username.txt and your password in password.txt."
  exit 3
fi

# check the id
if [[ ! $PROFILE_ID =~ ^[0-9]+$ ]]
then
  echo "No profile id given."
  exit 1
fi


START=$(date +%s)

# build directory name
PROFILE_ID_WITH_PREFIX=$PROFILE_ID
while [[ ${#PROFILE_ID_WITH_PREFIX} -lt 3 ]]
do
  # id too short, prefix with 0
  PROFILE_ID_WITH_PREFIX=0$PROFILE_ID_WITH_PREFIX
done
PROFILE_DIR=data/${PROFILE_ID_WITH_PREFIX:0:1}/${PROFILE_ID_WITH_PREFIX:1:1}/${PROFILE_ID_WITH_PREFIX:2:1}/$PROFILE_ID
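# The first three digits of the zero-padded id form the bucket path:
# profile 42 is stored in data/0/4/2/42, profile 1234567 in data/1/2/3/1234567.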


USER_AGENT="Googlebot/2.1 (+http://www.googlebot.com/bot.html)"
WGET="wget --no-clobber -nv -a $PROFILE_DIR/wget.log"
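# Note: $WGET is used unquoted below so these options split into separate
# arguments. --no-clobber skips files that already exist, -nv keeps the output
# terse, and -a appends all wget output to the per-profile log file.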


# user should not exist
if [ -d $PROFILE_DIR ]
then
  echo "Profile directory $PROFILE_DIR already exists. Not downloading."
  exit 2
fi


echo "Downloading $PROFILE_ID:"

# make directories
mkdir -p $PROFILE_DIR
mkdir -p $PROFILE_DIR/photos


# make sure the cookies file exists (may be empty)
touch cookies.txt


# download profile page
echo " - profile page"
# reuse the session cookies, if there are any
$WGET -U "$USER_AGENT" --keep-session-cookies --save-cookies cookies.txt --load-cookies cookies.txt -O $PROFILE_DIR/profile.html "http://profiles.friendster.com/$PROFILE_ID"


# check if we are logged in, if not: do so
if ! grep -q "View, edit or update your profile" $PROFILE_DIR/profile.html
then
  echo "Logging in..."
  rm -f cookies.txt
  $WGET -U "$USER_AGENT" http://www.friendster.com/login.php --max-redirect=0 --keep-session-cookies --save-cookies cookies.txt --load-cookies cookies.txt --post-data="_submitted=1&next=/&tzoffset=-120&email=$USERNAME&password=$PASSWORD"
fi


# is this profile available?
if grep -q "This user's profile is not available." $PROFILE_DIR/profile.html
then
  echo "  Not available."
  exit 5
fi


# extract profile url (with username)
profile_url=`cat $PROFILE_DIR/profile.html | grep -o -E "URL: </span><p><a href=\"http://profiles.friendster.com/.+\">http" | grep -o -E "http://profiles.friendster.com/[^\"]+"`
if [[ "$profile_url" =~ http:// ]]
then
  echo $profile_url > $PROFILE_DIR/profile_url.txt
fi

# extract blog url
blog_url=`cat $PROFILE_DIR/profile.html | grep -o -E "http://[^\"]+\.blogs?\.friendster\.com/" | uniq`
if [[ "$blog_url" =~ http:// ]]
then
  echo $blog_url > $PROFILE_DIR/blog_url.txt
fi

# download profile image
echo " - profile photo"
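# The profile page embeds a thumbnail whose URL ends in m.jpg; dropping the
# "m" suffix appears to give the original-size image, which is what gets saved.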
profile_photo_url=`grep -E "imgblock200.+img src=\".+m\.jpg\"" $PROFILE_DIR/profile.html | grep -o -E "src=\"http.+\.jpg" | grep -o -E "http.+"`
if [[ "$profile_photo_url" =~ "http://" ]]
then
  # url for original size
  photo_url_orig=${profile_photo_url/m.jpg/.jpg}
  # extract photo id
  photo_id=`expr "$profile_photo_url" : '.\+/photos/\(.\+\)m.jpg'`
  mkdir -p $PROFILE_DIR/photos/`dirname $photo_id`

  $WGET -U "$USER_AGENT" -O $PROFILE_DIR/photos/$photo_id.jpg "$photo_url_orig"

  cp $PROFILE_DIR/photos/$photo_id.jpg $PROFILE_DIR/profile_photo.jpg
fi

# download albums page
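# Pagination works the same way for albums, photos, friends, fans and comments
# below: fetch page 0, scrape the page links out of the HTML to learn the
# highest page number, and keep fetching until that page has been reached.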
page=0
max_page=0
while [[ $page -le $max_page ]]
do
  echo " - albums index, page $page"
  $WGET -U "$USER_AGENT" -O $PROFILE_DIR/albums_${page}.html "http://www.friendster.com/viewalbums.php?uid=$PROFILE_ID&page=${page}"

  # get page links
  page_numbers=`grep -o -E "/viewalbums.php\?page=[0-9]+" $PROFILE_DIR/albums_${page}.html | grep -o -E "[0-9]+"`
  # update max page number
  for new_page_num in $page_numbers
  do
    if [[ $max_page -lt $new_page_num ]]
    then
      max_page=$new_page_num
    fi
  done

  # next page
  let "page = $page + 1"
done

# find album ids
ALBUM_IDS=`grep -h -o -E "/viewphotos\.php\?a=[0-9]+&amp;uid=" $PROFILE_DIR/albums_*.html | grep -o -E "[0-9]+" | sort | uniq`
for id in $ALBUM_IDS
do
  page=0
  max_page=0

  while [[ $page -le $max_page ]]
  do
    echo " - album $id, page $page"
    # download album page
    $WGET -U "$USER_AGENT" -O $PROFILE_DIR/photos_${id}_${page}.html "http://www.friendster.com/viewphotos.php?a=$id&uid=$PROFILE_ID&page=${page}"

    # get page links
    page_numbers=`grep -o -E "/viewphotos.php\?page=[0-9]+" $PROFILE_DIR/photos_${id}_${page}.html | grep -o -E "[0-9]+"`
    # update max page number
    for new_page_num in $page_numbers
    do
      if [[ $max_page -lt $new_page_num ]]
      then
        max_page=$new_page_num
      fi
    done

    # get photo urls
    PHOTO_URLS=`grep -o -E "http://photos.+friendster\.com/photos/.+m\.jpg" $PROFILE_DIR/photos_${id}_${page}.html | sort | uniq`

    # download photos
    for photo_url in $PHOTO_URLS
    do
      # url for original size
      photo_url_orig=${photo_url/m.jpg/.jpg}
      # extract photo id
      photo_id=`expr "$photo_url" : '.\+/photos/\(.\+\)m.jpg'`
      mkdir -p $PROFILE_DIR/photos/`dirname $photo_id`

      $WGET -U "$USER_AGENT" -O $PROFILE_DIR/photos/$photo_id.jpg "$photo_url_orig"
    done

    # next page
    let "page = $page + 1"
  done
done

# download 'friends' page(s)
page=0
max_page=0
while [[ $page -le $max_page ]]
do
  echo " - friends page $page"
  # download page
  $WGET -U "$USER_AGENT" --max-redirect=0 -O $PROFILE_DIR/friends_${page}.html "http://www.friendster.com/friends.php?uid=$PROFILE_ID&page=${page}"

  # get page links
  page_numbers=`grep -o -E "/friends/$PROFILE_ID/[0-9]+\"" $PROFILE_DIR/friends_${page}.html | grep -o -E "[0-9]+\"" | grep -o -E "[0-9]+"`
  # update max page number
  for new_page_num in $page_numbers
  do
    if [[ $max_page -lt $new_page_num ]]
    then
      max_page=$new_page_num
    fi
  done

  let "page = $page + 1"
done

# download 'fans' page(s)
page=0
max_page=0
while [[ $page -le $max_page ]]
do
  echo " - fans page $page"
  # download page
  $WGET -U "$USER_AGENT" --max-redirect=0 -O $PROFILE_DIR/fans_${page}.html "http://www.friendster.com/fans.php?uid=$PROFILE_ID&page=${page}"

  # get page links
  page_numbers=`grep -o -E "/fans/$PROFILE_ID/[0-9]+\"" $PROFILE_DIR/fans_${page}.html | grep -o -E "[0-9]+\"" | grep -o -E "[0-9]+"`
  # update max page number
  for new_page_num in $page_numbers
  do
    if [[ $max_page -lt $new_page_num ]]
    then
      max_page=$new_page_num
    fi
  done

  let "page = $page + 1"
done

# download inverse 'fans' page(s)
page=0
max_page=0
while [[ $page -le $max_page ]]
do
  echo " - inverse fans page $page"
  # download page
  $WGET -U "$USER_AGENT" --max-redirect=0 -O $PROFILE_DIR/inverse_fans_${page}.html "http://www.friendster.com/fans.php?uid=$PROFILE_ID&page=${page}&action=spusers"

  # get page links
  page_numbers=`grep -o -E "/fans\.php\?page=[0-9]+" $PROFILE_DIR/inverse_fans_${page}.html | grep -o -E "[0-9]+"`
  # update max page number
  for new_page_num in $page_numbers
  do
    if [[ $max_page -lt $new_page_num ]]
    then
      max_page=$new_page_num
    fi
  done

  let "page = $page + 1"
done

# download 'comments' page(s)
page=0
max_page=0
while [[ $page -le $max_page ]]
do
  echo " - comments page $page"
  # download page
  $WGET -U "$USER_AGENT" -O $PROFILE_DIR/comments_${page}.html "http://www.friendster.com/comments.php?uid=$PROFILE_ID&page=${page}" --keep-session-cookies --save-cookies cookies.txt --load-cookies cookies.txt

  # get page links
  page_numbers=`grep -o -E "/comments\.php\?page=[0-9]+" $PROFILE_DIR/comments_${page}.html | grep -o -E "[0-9]+"`
  # update max page number
  for new_page_num in $page_numbers
  do
    if [[ $max_page -lt $new_page_num ]]
    then
      max_page=$new_page_num
    fi
  done

  let "page = $page + 1"
done

# download shoutout stream
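# The page source reports the total number of shoutouts (totalShoutouts);
# pages are fetched until that total has been covered, assuming 20 shoutouts
# per page.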
page=1
shouts=0
number_of_shouts=1
while [[ $shouts -lt $number_of_shouts ]]
do
  echo " - shoutout stream $page"
  # download page
  $WGET -U "$USER_AGENT" -O $PROFILE_DIR/shoutout_${page}.html "http://www.friendster.com/shoutoutstream.php?uid=$PROFILE_ID&page=${page}"

  number=`grep -o -E "totalShoutouts = [0-9]+" $PROFILE_DIR/shoutout_${page}.html | grep -o -E "[0-9]+"`
  if [[ $number_of_shouts -lt $number ]]
  then
    number_of_shouts=$number
  fi

  let "shouts = $shouts + 20"
  let "page = $page + 1"
done

# download shout comments, if any
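# Each shoutout has its own page (sid); the comments on a shoutout are fetched
# through rpc.php, using the authcode embedded in that page.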
SIDS=`grep -h -o -E "shoutoutstream\.php\?sid=[0-9]+&" $PROFILE_DIR/shoutout_*.html | grep -o -E "[0-9]+" | sort | uniq`
for sid in $SIDS
do
  echo " - shoutout comments for $sid"
  # download
  $WGET -U "$USER_AGENT" -O $PROFILE_DIR/shoutout_sid_$sid.html "http://www.friendster.com/shoutoutstream.php?sid=$sid&uid=$PROFILE_ID"

  # find even more comments
  authcode=`grep -o -E "var _ac = '[0-9a-z]+'" $PROFILE_DIR/shoutout_sid_$sid.html | grep -o -E "[0-9a-z]{10,}"`
  eid=$PROFILE_ID
  uid=$PROFILE_ID
  eeid=$sid
  last_page=`grep -o -E "currentCommentPage = [0-9]+" $PROFILE_DIR/shoutout_sid_$sid.html | grep -o -E "[0-9]+"`

  page=0
  while [[ $page -le $last_page ]]
  do
    $WGET -U "$USER_AGENT" -O $PROFILE_DIR/shoutout_sid_${sid}_comment_$page.json "http://www.friendster.com/rpc.php" --post-data="rpctype=fetchcomments&authcode=$authcode&page=$page&ct=5&eid=$eid&uid=$uid&eeid=$eeid"

    let "page = $page + 1"
  done
done

# check for a blog, if we haven't seen a link so far
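# Module 13 of the profile page appears to be the blog widget, so fetching it
# directly is a second chance to find a blog URL.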
if [[ ! "$blog_url" =~ http:// ]]
then
  $WGET -U "$USER_AGENT" -O $PROFILE_DIR/module_13.html "http://profiles.friendster.com/modules/module.php?uid=$PROFILE_ID&_pmr=&_pmmo=13"

  blog_url=`cat $PROFILE_DIR/module_13.html | grep -o -E "http://[^\"]+\.blogs?\.friendster\.com/" | uniq`
  if [[ "$blog_url" =~ http:// ]]
  then
    echo $blog_url > $PROFILE_DIR/blog_url.txt
  fi
fi

# download the blog, if it exists
if [[ "$blog_url" =~ http:// ]]
then
  # strip http:// and trailing slash
  blog_domain=${blog_url#http://}
  blog_domain=${blog_domain%/}

  mkdir -p $PROFILE_DIR/blog

  echo " - blog: $blog_url"
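  # --mirror turns on recursion and timestamping, -np keeps wget from
  # ascending to parent directories, -E adds .html extensions where needed,
  # -H/-D limit host spanning to the blog's own domain, -k/-K rewrite links
  # for local viewing (keeping the originals), and -p fetches page requisites
  # such as images and stylesheets.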
  wget --directory-prefix="$PROFILE_DIR/blog/" \
       -a "$PROFILE_DIR/wget.log" \
       -nv --mirror -np -E -H -k -K -p \
       -D "$blog_domain" http://$blog_domain/ \
       -U "$USER_AGENT"
fi


END=$(date +%s)
DIFF=$(( $END - $START ))

echo " Profile $PROFILE_ID done. ($DIFF seconds)"