Advertisement
Guest User

Untitled

a guest
Apr 17th, 2015
296
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 6.79 KB | None | 0 0
  1. ###
  2. #File utils framework for appending, deleting and popping lines from/to a file.
  3. #Call fileUtils() first to load all functions.
  4. #
  5. #Argument order is as follows: line / string first, file second
  6. #
  7. #Petrroll 2015
  8. ###
  9. fileUtils(){
  10.  
  11. #Appends line ($1) to a file ($2) and return 0 if it's not in it yet; if it, is just return 1
  12. fuAppend(){
  13. if grep -Fxq "$1" < "$2"; then
  14.         return 1
  15. else
  16.         echo "$1" >> "$2"
  17.         return 0
  18. fi
  19. }
  20.  
  21.  
  22. #Pops first element from a file ($1) and prints it to stdout; if file emepty print out empty return 1
  23. fuPop(){
  24. if [ -s "$1" ]; then
  25.         sed -nr '1p' "$1"
  26.         sed -ir '1d' "$1"
  27.         return 0
  28. else
  29.         return 1
  30. fi
  31. }
  32.  
  33.  
  34. #Removes line ($1) from a file ($2) and returns 0, returns 1 if line does not exists yet return 1
  35. fuRemove(){
  36. lineNum=$(grep -Fxn "$1" < "$2" | cut -d : -f 1)
  37. if [ -n "$lineNum" ]; then
  38.         sed -ir "${lineNum}d" "$2"
  39.         return 0
  40. else
  41.         return 1
  42. fi
  43. }
  44.  
  45. #Increments variable inside a file ($1)
  46. fuInc(){
  47. var="$(cat "$1")"
  48. echo "$(($var + 1))" | cat > "$1"
  49. }
  50.  
  51. #Decrements variable inside a file; returns 1 if variable already 0
  52. fuDec(){
  53. var="$(cat "$1")"
  54. if [ $var -eq 0 ]; then
  55.         return 1
  56. else
  57.         var="$(cat "$1")"
  58.         echo "$(($var - 1))" | cat > "$1"
  59.         return 0
  60. fi
  61. }
  62.  
  63. #Test function
  64. fuTest(){
  65.         echo "Testing 16 cases..."
  66.         tstFile=$(mktemp)
  67.         if fuPop "$tstFile"; then :; else echo "Passed 1"; fi
  68.         if fuAppend "String zero" "$tstFile"; then echo "Passed 2"; fi
  69.         if fuAppend "String one" "$tstFile"; then echo "Passed 3"; fi
  70.         if fuAppend "String zero" "$tstFile"; then :; else echo "Passed 4"; fi
  71.         if [ "$(fuPop "$tstFile")" = "String zero" ]; then echo "Passed 5"; fi
  72.         if fuRemove "String zero" "$tstFile"; then :; else echo "Passed 6"; fi
  73.         if fuRemove "String one" "$tstFile"; then echo "Passed 7"; fi
  74.         if fuPop "$tstFile"; then :; else echo "Passed 8"; fi
  75.  
  76.         echo "2" | cat > "$tstFile"
  77.  
  78.         fuInc "$tstFile"
  79.         if [ $(cat "$tstFile") -eq 3 ]; then echo "Passed 9"; fi
  80.  
  81.         fuInc "$tstFile"
  82.         if [ $(cat "$tstFile") -eq 4 ]; then echo "Passed 10"; fi
  83.  
  84.         fuDec "$tstFile"
  85.         if [ $(cat "$tstFile") -eq 3 ]; then echo "Passed 11"; fi
  86.  
  87.         fuDec "$tstFile"
  88.         fuDec "$tstFile"
  89.         if [ $(cat "$tstFile") -eq 1 ]; then echo "Passed 12"; fi
  90.  
  91.         if fuDec "$tstFile"; then echo "Passed 13"; fi
  92.         if [ $(cat "$tstFile") -eq 0 ]; then echo "Passed 14"; fi
  93.  
  94.         if fuDec "$tstFile"; then echo :; else echo "Passed 15"; fi
  95.         if [ $(cat "$tstFile") -eq 0 ]; then echo "Passed 16"; fi
  96.  
  97.         echo "16 test cases finished."
  98.         rm "$tstFile"
  99. }
  100.  
  101. if [ "$1" = "-t" ]; then
  102.         fuTest
  103. fi
  104.  
  105. }
  106. ###
  107. #End of file utilities (fu) framework.
  108. ###
  109.  
  110. ###
  111. #Lock utils
  112. ###
  113. lckUtils(){
  114.  
  115. #Returns random lock dir path
  116. luRndLockPath(){
  117. rndLockPath="$(mktemp -d)"
  118. echo "$rndLockPath"
  119. rmdir "$rndLockPath"
  120. }
  121.  
  122. #Acquires a lock (or waits until lock is available) specified by dir path in $1 (sleep duration in $2 is optional; implicitly 50ms)
  123. luAckLock(){
  124. while ! mkdir "$1" 2> /dev/null; do
  125.         sleep ${2:-0.050}
  126. done
  127. }
  128.  
  129. #Lifts a lock specified by dir path in $1
  130. luUnlock(){
  131. rmdir "$1" 2> /dev/null
  132. }
  133.  
  134. }
  135. ###
  136. #End of lock utils framework
  137. ###
  138.  
  139.  
  140. ###
  141. #INIT & CLEANUP
  142. ###
  143. crawlersInit(){
  144. fileUtils
  145. lckUtils
  146.  
  147. isDone=0
  148.  
  149. doneEmails="$(mktemp)"
  150. doneLinks="$(mktemp)"
  151. todoLinks="$(mktemp)"
  152.  
  153. linksLock="$(luRndLockPath)"
  154. mailsLock="$(luRndLockPath)"
  155.  
  156. fuAppend "$1" "$doneLinks"
  157. fuAppend "$1" "$todoLinks"
  158.  
  159. crwlrsToCreate="${2:-1}"
  160. crwlrsActive=$(mktemp)
  161. }
  162.  
  163. crawlersClean(){
  164. rm "$doneEmails"
  165. rm "$doneLinks"
  166. rm "$todoLinks"
  167.  
  168. rm "$crwlrsActive"
  169. }
  170. ###
  171. #END
  172. ###
  173.  
  174.  
  175. ###
  176. #Parse email & links
  177. ###
  178.  
  179. ##
  180. #Return all links on stdout; expects current address in $1
  181. ##
  182. parseLinks(){
  183. dirAddress="$(echo $1 | sed -r "/^[^/]*\/\/[^/]*$/ { s|$|/|; }" | grep -o ".*/")"
  184. domainAddress="$(echo $1 | egrep -o "http[s]?://[^/]+")"
  185.  
  186. sed -r 's/<a ?href="mailto:"//' |\
  187. egrep -o '<a ?href="[^"]*' | sed -rn 's/^<a ?href="//p' |\
  188. sed -nr "
  189.        /\#/ { s/#.*$// }
  190.        /^\// { s|^|${domainAddress}|; p; d; }
  191.        /(^http[s]?:)/ ! { s|^|${dirAddress}|; p; d; }
  192.        /^www\./ { s|^|http://|; p; d; }
  193.        /.+/ { p; }
  194. "
  195. }
  196.  
  197. ##
  198. #Returns all email addresses on stdout
  199. ##
  200. parseEmails(){
  201. egrep -o '[a-zA-Z0-9\-_~!$&()*+,;=:]+([.][a-zA-Z0-9\-_~!$&()*+,;=:]+)*@[a-zA-Z0-9-]+([.][a-zA-Z0-9-]+)*\.[a-z]+' |\
  202. sed -r 's/^.*mailto://'
  203. }
  204.  
  205.  
  206. ###
  207. #END
  208. ###
  209. ###
  210. #Process new emails & links
  211. ###
  212.  
  213. prcsNewEmails(){
  214. echo "LOCK EMAILS $mailsLock"
  215. luAckLock "$mailsLock"
  216. for newEmail in $1; do
  217.         if fuAppend "$newEmail" "$doneEmails"; then
  218.                 echo "$newEmail"
  219.         fi
  220. done
  221. luUnlock "$mailsLock"
  222. echo "UNLOCKED EMAILS $mailsLock"
  223. }
  224.  
  225. prcsNewLinks(){
  226. echo "LOCK LINKS $linksLock"
  227. luAckLock "$linksLock"
  228. for newLink in $1; do
  229.         if fuAppend "$newLink" "$doneLinks"; then
  230.                 fuAppend "$newLink" "$todoLinks"
  231.         fi
  232. done
  233. luUnlock "$linksLock"
  234. echo "UNLOCKED LINKS $linksLock"
  235. }
  236. ##
  237. #END
  238. ##
  239.  
  240.  
  241. ###
  242. #Process page & pages
  243. ###
  244.  
  245. ##
  246. #Processes page with given url ($1)
  247. ##
  248. prcsPage(){
  249.  echo "__${BASHPID}__CURR PAGE:$1" #DEBUG
  250. pageSrc=$(curl "$1" 2> /dev/null)
  251.  
  252. emails="$(echo "$pageSrc" | parseEmails)"
  253. links="$(echo "$pageSrc" | parseLinks "$1")"
  254.  
  255. prcsNewEmails "$emails"
  256. prcsNewLinks "$links"
  257. }
  258.  
  259. prcsPages(){
  260. while [ -s "$todoLinks" ]; do
  261.         echo "PRCS LINK LOCK $linksLock"
  262.         luAckLock "$linksLock"
  263.         linkToProcess="$(fuPop "$todoLinks")"
  264.         luUnlock "$linksLock"
  265.         echo "PRCS UNLOCK LINK $linksLock"
  266.  
  267.         prcsPage "$linkToProcess"
  268.         echo "Page done $parentPID"
  269.         kill -15 "$parentPID"
  270. done
  271. crwlrDied
  272. }
  273.  
  274. ##
  275. #END
  276. ##
  277.  
  278.  
  279. ###
  280. #Crawler init, number, and spawn
  281. ###
  282.  
  283. crwlrInit(){
  284. parentPID=$1
  285. prcsPages
  286. }
  287.  
  288. incCrwlrsNum(){
  289. crwlrsToCreate=$(($crwlrsToCreate + 1))
  290. }
  291.  
  292. decCrwlrsNum(){
  293. if [ $crwlrsToCreate -eq 0 ]; then
  294.         return 1
  295. else
  296.         crwlrsToCreate=$(($crwlrsToCreate - 1))
  297.         return 0
  298. fi
  299. }
  300.  
  301. spawnCrwlr(){
  302. crwlrInit $$ &
  303. fuAppend "$!" "$crwlrsActive"
  304. echo "?????????????????CRAWLER SPAWNED??????? $!"
  305. }
  306.  
  307. tryToSpawnCrwlr(){
  308. echo "CRAWLERS TO CREATE: $crwlrsToCreate"
  309. if decCrwlrsNum; then
  310.         spawnCrwlr
  311. fi
  312. }
  313. killCrawlers(){
  314. echo ":::::"
  315. cat "$crwlrsActive"
  316. echo ":::::"
  317. for crwlrPID in $(cat "$crwlrsActive"); do
  318.         kill -9 $crwlrPID
  319. done
  320. crwlrsToCreate=0
  321. isDone=1
  322. }
  323.  
  324. crwlrDied(){
  325. fuRemove "$$" "$crwlrsActive"
  326. kill -14 "$parentPID"
  327. }
  328.  
  329. ##
  330. #END
  331. ##
  332.  
  333. trap 'killCrawlers' 2
  334. trap 'tryToSpawnCrwlr' 15
  335. trap 'echo "Crawler died"; incCrwlrsNum' 14
  336.  
  337. crawlersInit "$1" "$2"
  338. tryToSpawnCrwlr
  339.  
  340. while [ "$isDone" -eq 0 ]; do
  341.         wait
  342. done;
  343.  
  344.  
  345. crawlersClean
  346. exit
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement