Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###
- #File utils framework for appending, deleting and popping lines to/from a file.
- #Call fileUtils first to load all functions.
- #
- #Argument order is as follows: line / string first, then the file
- #
- #Petrroll 2015
- ###
#######################################
# Loads the file-utility (fu*) helpers into the current shell. Each helper
# treats a plain text file as a small line-based store or a numeric counter.
# Arguments:
#   $1 - optional; "-t" runs the built-in self test (fuTest) after loading.
#######################################
fileUtils() {
  # Appends line ($1) to file ($2) unless an identical whole line is already
  # present. Returns 0 if the line was appended, 1 if it was already there.
  fuAppend() {
    # -e keeps a line starting with "-" from being parsed as a grep option.
    if grep -Fxq -e "$1" < "$2"; then
      return 1
    fi
    printf '%s\n' "$1" >> "$2"
    return 0
  }

  # Pops the first line of file ($1): prints it to stdout and deletes it from
  # the file. Returns 1 (printing nothing) if the file is empty or missing.
  fuPop() {
    if [ -s "$1" ]; then
      sed -n '1p' "$1"
      # NOTE: "-i" must stand alone. "sed -ir" means "in-place, keep a backup
      # with suffix r" and would leave a stray "<file>r" copy on every call.
      sed -i '1d' "$1"
      return 0
    fi
    return 1
  }

  # Removes line ($1) from file ($2). Returns 0 if a line was removed,
  # 1 if the file does not contain it.
  fuRemove() {
    local lineNum
    # -m 1 limits the result to the first match so sed receives exactly one
    # address even if the line occurs several times.
    lineNum="$(grep -Fxn -m 1 -e "$1" < "$2" | cut -d : -f 1)"
    if [ -n "$lineNum" ]; then
      sed -i "${lineNum}d" "$2"
      return 0
    fi
    return 1
  }

  # Increments the integer stored in file ($1). An empty file counts as 0.
  fuInc() {
    local var
    var="$(cat "$1")"
    printf '%s\n' "$((${var:-0} + 1))" > "$1"
  }

  # Decrements the integer stored in file ($1).
  # Returns 1 (leaving the file untouched) if the value is already 0 or the
  # file is empty; returns 0 after a successful decrement.
  fuDec() {
    local var
    var="$(cat "$1")"
    if [ "${var:-0}" -eq 0 ]; then
      return 1
    fi
    printf '%s\n' "$((var - 1))" > "$1"
    return 0
  }

  # Self test: prints "Passed N" for every check that behaves as expected.
  fuTest() {
    local tstFile
    echo "Testing 16 cases..."
    tstFile="$(mktemp)"
    if fuPop "$tstFile"; then :; else echo "Passed 1"; fi
    if fuAppend "String zero" "$tstFile"; then echo "Passed 2"; fi
    if fuAppend "String one" "$tstFile"; then echo "Passed 3"; fi
    if fuAppend "String zero" "$tstFile"; then :; else echo "Passed 4"; fi
    if [ "$(fuPop "$tstFile")" = "String zero" ]; then echo "Passed 5"; fi
    if fuRemove "String zero" "$tstFile"; then :; else echo "Passed 6"; fi
    if fuRemove "String one" "$tstFile"; then echo "Passed 7"; fi
    if fuPop "$tstFile"; then :; else echo "Passed 8"; fi
    echo "2" > "$tstFile"
    fuInc "$tstFile"
    if [ "$(cat "$tstFile")" -eq 3 ]; then echo "Passed 9"; fi
    fuInc "$tstFile"
    if [ "$(cat "$tstFile")" -eq 4 ]; then echo "Passed 10"; fi
    fuDec "$tstFile"
    if [ "$(cat "$tstFile")" -eq 3 ]; then echo "Passed 11"; fi
    fuDec "$tstFile"
    fuDec "$tstFile"
    if [ "$(cat "$tstFile")" -eq 1 ]; then echo "Passed 12"; fi
    if fuDec "$tstFile"; then echo "Passed 13"; fi
    if [ "$(cat "$tstFile")" -eq 0 ]; then echo "Passed 14"; fi
    if fuDec "$tstFile"; then :; else echo "Passed 15"; fi
    if [ "$(cat "$tstFile")" -eq 0 ]; then echo "Passed 16"; fi
    echo "16 test cases finished."
    rm "$tstFile"
  }

  if [ "$1" = "-t" ]; then
    fuTest
  fi
}
- ###
- #End of file utilities (fu) framework.
- ###
- ###
- #Lock utils
- ###
#######################################
# Loads the lock-utility (lu*) helpers into the current shell.
# A lock is a directory: creating it acquires the lock and removing it
# releases the lock.
#######################################
lckUtils() {
  # Prints a fresh, currently unused directory path to be used as a lock.
  # The directory is created by mktemp and removed again right away, so only
  # the name is handed out. Returns 1 if mktemp fails.
  luRndLockPath() {
    local rndLockPath
    rndLockPath="$(mktemp -d)" || return 1
    printf '%s\n' "$rndLockPath"
    rmdir "$rndLockPath"
  }

  # Acquires the lock at dir path $1, retrying until mkdir succeeds.
  # $2 is the optional sleep between attempts (default 0.050 seconds).
  # Returns 1 immediately if $1 is empty.
  luAckLock() {
    # mkdir "" can never succeed; fail fast instead of spinning forever.
    [ -n "$1" ] || return 1
    while ! mkdir "$1" 2> /dev/null; do
      sleep "${2:-0.050}"
    done
  }

  # Releases the lock at dir path $1; errors from rmdir are discarded.
  luUnlock() {
    rmdir "$1" 2> /dev/null
  }
}
- ###
- #End of lock utils framework
- ###
- ###
- #INIT & CLEANUP
- ###
#######################################
# Prepares the shared crawl state.
# Loads the fu*/lu* helpers, creates the temp files used as shared lists,
# picks the lock paths and seeds the link lists with the start URL.
# Globals (written): isDone, crwlrsToCreate, doneEmails, doneLinks, todoLinks,
#                    linksLock, mailsLock, crwlrsActive
# Arguments:
#   $1 - start URL; recorded both as already seen and as still to process
#   $2 - number of crawlers that may be created (default 1)
#######################################
crawlersInit() {
  # Helper libraries first: everything below relies on them.
  fileUtils
  lckUtils

  # Plain flags and counters.
  isDone=0
  crwlrsToCreate="${2:-1}"

  # Lock paths guarding the link lists and the e-mail list.
  linksLock="$(luRndLockPath)"
  mailsLock="$(luRndLockPath)"

  # Shared lists, one temp file each.
  doneEmails="$(mktemp)"
  doneLinks="$(mktemp)"
  todoLinks="$(mktemp)"

  # Seed: the start URL is both "seen" and "to do".
  fuAppend "$1" "$doneLinks"
  fuAppend "$1" "$todoLinks"

  # Registry of crawler PIDs.
  crwlrsActive="$(mktemp)"
}
#######################################
# Removes everything the crawl left on disk: the shared list files and the
# two lock directories.
# Globals (read): doneEmails, doneLinks, todoLinks, crwlrsActive,
#                 linksLock, mailsLock
# Returns: always 0; files that are already gone are not an error.
#######################################
crawlersClean() {
  local tmpFile lockDir

  for tmpFile in "$doneEmails" "$doneLinks" "$todoLinks" "$crwlrsActive"; do
    # Skip unset names so rm is never handed an empty operand.
    if [ -n "$tmpFile" ]; then
      rm -f -- "$tmpFile"
    fi
  done

  # A crawler killed while holding a lock never removes its lock directory,
  # so clear both lock directories here as well.
  for lockDir in "$linksLock" "$mailsLock"; do
    if [ -n "$lockDir" ]; then
      rmdir -- "$lockDir" 2> /dev/null
    fi
  done

  return 0
}
- ###
- #END
- ###
- ###
- #Parse email & links
- ###
- ##
- #Return all links on stdout; expects current address in $1
- ##
#######################################
# Extracts the targets of <a href="..."> links from HTML on stdin and prints
# them as absolute URLs, one per line.
# Arguments:
#   $1 - URL of the page the HTML came from; used to resolve relative links
# Outputs:
#   - fragments ("#...") are stripped
#   - "/path" links are prefixed with the page's scheme://host
#   - "www...." links are prefixed with "http://"
#   - other non-http(s) links are prefixed with the page's directory URL
#   - mailto: links are dropped
#######################################
parseLinks() {
  local pageUrl="$1"
  local dirAddress domainAddress dirEsc domainEsc

  # Directory of the current page: everything up to and including the last
  # "/". A bare "scheme://host" first gets a trailing "/" so the host survives.
  dirAddress="$(printf '%s\n' "$pageUrl" \
    | sed -r '/^[^/]*\/\/[^/]*$/ { s|$|/|; }' \
    | grep -o '.*/')"

  # scheme://host of the current page; only the first match is used so a URL
  # embedded in the query string cannot produce a second line.
  domainAddress="$(printf '%s\n' "$pageUrl" \
    | grep -Eo 'https?://[^/]+' \
    | head -n 1)"

  # Both values are pasted into sed replacement text below, where "&", "\"
  # and the "|" delimiter are special. Escape them so they stay literal.
  dirEsc="$(printf '%s\n' "$dirAddress" | sed 's/[&|\]/\\&/g')"
  domainEsc="$(printf '%s\n' "$domainAddress" | sed 's/[&|\]/\\&/g')"

  # Rule order matters: the "www." rule must come before the generic
  # "not http(s)" rule, otherwise it can never fire.
  grep -Eo '<a ?href="[^"]*' \
    | sed -r 's/^<a ?href="//' \
    | sed -nr "
        /^mailto:/d
        s/#.*\$//
        /^\// { s|^|${domainEsc}|; p; d; }
        /^www\./ { s|^|http://|; p; d; }
        /^https?:/! { s|^|${dirEsc}|; p; d; }
        /./p
      "
}
- ##
- #Returns all email addresses on stdout
- ##
#######################################
# Extracts e-mail addresses from the text on stdin and prints them to stdout,
# one per line. Because ":" is accepted in the local part, an address taken
# from a "mailto:" link is matched together with that prefix; the trailing
# sed strips it again.
#######################################
parseEmails() {
  # "-" is placed last in each bracket expression so it is a literal hyphen.
  # Written as "\-_" it forms a range from "\" to "_" and addresses such as
  # first-last@example.com lose everything up to the hyphen.
  grep -Eo '[a-zA-Z0-9_~!$&()*+,;=:-]+([.][a-zA-Z0-9_~!$&()*+,;=:-]+)*@[a-zA-Z0-9-]+([.][a-zA-Z0-9-]+)*\.[a-z]+' \
    | sed -r 's/^.*mailto://'
}
- ###
- #END
- ###
- ###
- #Process new emails & links
- ###
#######################################
# Records newly found e-mail addresses and prints each address that was not
# known before. The list file is only touched while the e-mail lock is held.
# Globals (read): mailsLock, doneEmails
# Arguments:
#   $1 - newline-separated list of addresses (may be empty)
# Outputs: debug lock messages plus every newly recorded address on stdout
#######################################
prcsNewEmails() {
  local newEmail

  echo "LOCK EMAILS $mailsLock"
  luAckLock "$mailsLock"
  # Read line by line instead of word-splitting $1: an unquoted expansion
  # would also glob-expand addresses containing "*" against the current dir.
  while IFS= read -r newEmail; do
    # An empty $1 still yields one empty line from the here-string.
    [ -n "$newEmail" ] || continue
    if fuAppend "$newEmail" "$doneEmails"; then
      printf '%s\n' "$newEmail"
    fi
  done <<< "$1"
  luUnlock "$mailsLock"
  echo "UNLOCKED EMAILS $mailsLock"
}
#######################################
# Records newly found links: every link not seen before is added to the
# "done" list and queued on the "todo" list. Both lists are only touched
# while the links lock is held.
# Globals (read): linksLock, doneLinks, todoLinks
# Arguments:
#   $1 - newline-separated list of URLs (may be empty)
# Outputs: debug lock messages on stdout
#######################################
prcsNewLinks() {
  local newLink

  echo "LOCK LINKS $linksLock"
  luAckLock "$linksLock"
  # Read line by line instead of word-splitting $1: URLs routinely contain
  # "?" and "*", which an unquoted expansion would treat as glob patterns.
  while IFS= read -r newLink; do
    # An empty $1 still yields one empty line from the here-string.
    [ -n "$newLink" ] || continue
    if fuAppend "$newLink" "$doneLinks"; then
      fuAppend "$newLink" "$todoLinks"
    fi
  done <<< "$1"
  luUnlock "$linksLock"
  echo "UNLOCKED LINKS $linksLock"
}
- ##
- #END
- ##
- ###
- #Process page & pages
- ###
- ##
- #Processes page with given url ($1)
- ##
#######################################
# Downloads one page and feeds the e-mail addresses and links found in it
# into the shared lists.
# Arguments:
#   $1 - URL of the page to fetch
# Outputs: a debug line with the current process id and the URL, followed by
#          whatever prcsNewEmails / prcsNewLinks print
#######################################
prcsPage() {
  # Keep the page data local so it does not leak into the calling shell.
  local pageSrc emails links

  echo "__${BASHPID}__CURR PAGE:$1" #DEBUG
  pageSrc="$(curl "$1" 2> /dev/null)"
  # printf instead of echo: page content is arbitrary data and must not be
  # interpreted as an echo option (e.g. a body consisting of "-n").
  emails="$(printf '%s\n' "$pageSrc" | parseEmails)"
  links="$(printf '%s\n' "$pageSrc" | parseLinks "$1")"
  prcsNewEmails "$emails"
  prcsNewLinks "$links"
}
#######################################
# Crawler main loop: pops links off the todo list and processes them until
# the list is empty, then reports its own termination via crwlrDied.
# After every processed page signal 15 is sent to the parent process.
# Globals (read): todoLinks, linksLock, parentPID
#######################################
prcsPages() {
  local linkToProcess

  while [ -s "$todoLinks" ]; do
    echo "PRCS LINK LOCK $linksLock"
    luAckLock "$linksLock"
    linkToProcess="$(fuPop "$todoLinks")"
    luUnlock "$linksLock"
    echo "PRCS UNLOCK LINK $linksLock"

    # The emptiness test above runs without the lock, so another crawler may
    # have taken the last link in the meantime. Re-check the list instead of
    # crawling an empty URL.
    [ -n "$linkToProcess" ] || continue

    prcsPage "$linkToProcess"
    echo "Page done $parentPID"
    kill -15 "$parentPID"
  done
  crwlrDied
}
- ##
- #END
- ##
- ###
- #Crawler init, number, and spawn
- ###
#######################################
# Entry point of a single crawler: stores the PID to notify and then runs
# the page-processing loop.
# Globals (written): parentPID - read later by prcsPages and crwlrDied
# Arguments:
#   $1 - PID of the process that receives this crawler's signals
#######################################
crwlrInit() {
  parentPID="${1}"

  prcsPages
}
#######################################
# Gives one crawler slot back: raises the number of crawlers that may still
# be created by one.
# Globals (read/written): crwlrsToCreate
#######################################
incCrwlrsNum() {
  crwlrsToCreate="$(( $crwlrsToCreate + 1 ))"
}
#######################################
# Takes one crawler slot: lowers the number of crawlers that may still be
# created by one.
# Globals (read/written): crwlrsToCreate
# Returns: 0 if a slot was taken, 1 if no slot is available. The counter is
#          left unchanged in that case.
#######################################
decCrwlrsNum() {
  # Only a strictly positive integer counts as an available slot. Testing
  # for "equal to 0" lets negative, empty or non-numeric values through,
  # after which every call succeeds and crawlers are spawned without bound.
  if [ "${crwlrsToCreate:-0}" -gt 0 ] 2> /dev/null; then
    crwlrsToCreate=$((crwlrsToCreate - 1))
    return 0
  fi
  return 1
}
#######################################
# Starts one crawler in the background and registers its PID.
# The crawler is handed this shell's PID ($$) as the process to signal.
# Globals (read): crwlrsActive - file listing the PIDs of started crawlers
# Outputs: a debug line containing the new crawler's PID
#######################################
spawnCrwlr() {
  local spawnedPid

  crwlrInit "$$" &
  # $! is the PID of the background job started on the previous line.
  spawnedPid="$!"

  fuAppend "$spawnedPid" "$crwlrsActive"
  echo "?????????????????CRAWLER SPAWNED??????? $spawnedPid"
}
#######################################
# Starts a new crawler if a crawler slot is still available; otherwise does
# nothing beyond printing the debug line.
# Globals (read): crwlrsToCreate
# Outputs: a debug line with the number of crawlers still to create
#######################################
tryToSpawnCrwlr() {
  echo "CRAWLERS TO CREATE: $crwlrsToCreate"

  # No free slot: nothing to do, and that is not an error.
  decCrwlrsNum || return 0

  spawnCrwlr
}
#######################################
# Stops the crawl: prints the crawler registry, kills every crawler listed in
# it and marks the run as finished.
# Globals (read):    crwlrsActive - file with one crawler PID per line
# Globals (written): crwlrsToCreate (set to 0), isDone (set to 1)
# Outputs: the registry content framed by ":::::" lines
#######################################
killCrawlers() {
  local crwlrPID

  echo ":::::"
  cat "$crwlrsActive"
  echo ":::::"

  while IFS= read -r crwlrPID; do
    # Only plain positive numbers are passed to kill. Anything else in the
    # file (an empty line, or something like "-1", which kill would treat as
    # "every process") is skipped.
    case "$crwlrPID" in
      '' | *[!0-9]*) continue ;;
    esac
    # A crawler that already exited is expected here; ignore that error.
    kill -9 "$crwlrPID" 2> /dev/null
  done < "$crwlrsActive"

  crwlrsToCreate=0
  isDone=1
}
#######################################
# Called by a crawler when it has nothing left to do: removes the crawler's
# own PID from the registry and sends signal 14 to the parent process.
# Globals (read): crwlrsActive, parentPID
#######################################
crwlrDied() {
  # Crawlers run as background subshells and are registered under the PID
  # reported by $!. Inside such a subshell $$ still expands to the PID of the
  # main script, so $BASHPID is needed to get the crawler's own PID.
  fuRemove "$BASHPID" "$crwlrsActive"
  kill -14 "$parentPID"
}
- ##
- #END
- ##
# Script entry point.
#   $1 - start URL (required)
#   $2 - number of crawlers that may be created (optional, default 1)
#
# Signals handled by this (parent) shell:
#   INT  (2)  - kill all crawlers and finish
#   TERM (15) - sent by a crawler after each processed page: try to start
#               another crawler
#   ALRM (14) - sent by a crawler when it terminates: give its slot back
if [ -z "$1" ]; then
  printf 'Usage: %s START_URL [CRAWLER_COUNT]\n' "${0##*/}" >&2
  exit 2
fi

trap 'killCrawlers' INT
trap 'tryToSpawnCrwlr' TERM
trap 'echo "Crawler died"; incCrwlrsNum' ALRM

crawlersInit "$1" "$2"
tryToSpawnCrwlr

while [ "$isDone" -eq 0 ]; do
  wait
  waitStatus=$?
  # A status above 128 means wait was interrupted by one of the trapped
  # signals; the handler has run, so simply wait again. Any other status
  # means no crawler is running any more. Without this check the loop would
  # spin at full speed forever once the last crawler has exited.
  if [ "$waitStatus" -le 128 ] && [ "$isDone" -eq 0 ]; then
    if [ -s "$todoLinks" ] && decCrwlrsNum; then
      # Work is left but nobody is processing it: start a crawler.
      spawnCrwlr
    else
      isDone=1
    fi
  fi
done

crawlersClean
exit
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement