Advertisement
Guest User

Untitled

a guest
Dec 20th, 2014
198
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.39 KB | None | 0 0
  #!/usr/bin/env bash
  #
  # Run a fixed number of Nutch crawl rounds, then build the link database,
  # index the crawl, and clean the crawl database.
  #
  # Each round runs: generate -> fetch -> parse -> updatedb.
  # After the last round: invertlinks -> index -> clean.
  #
  # Usage:
  #   <script> <nutch directory> <crawl path> <link-db path> <number of rounds>
  #
  # Arguments:
  #   $1  Nutch installation directory (must contain bin/nutch)
  #   $2  crawl path; "<crawl path>/db" and "<crawl path>/segments" are used
  #   $3  link database path
  #   $4  number of crawl rounds (non-negative integer)
  #
  # Environment:
  #   JAVA_HOME  defaults to /usr/lib/jvm/default-java when unset or empty
  #
  # Requires `hdfs` on PATH: segments are listed with `hdfs dfs -ls`.
  #
  # Exit status: 0 on success, 255 on a usage error, 1 when a step fails.

  set -u

  # Print a message to stderr and abort the script.
  die() {
    printf '%s\n' "$*" >&2
    exit 1
  }

  # Echo a command line to stdout, then execute it.
  # Returns the exit status of the executed command.
  run() {
    printf '%s\n' "$*"
    "$@"
  }

  main() {
    if (( $# != 4 )); then
      echo "Usage: $0 <nutch directory> <crawl path> <link-db path> <number of rounds>" >&2
      # 255 is the status the previous `exit -1` produced under bash.
      exit 255
    fi

    local crawl_dir=$2
    local link_db=$3
    local rounds=$4

    # Reject anything but plain digits: the value is used in an arithmetic
    # context, where arbitrary strings would be evaluated as expressions.
    [[ "${rounds}" =~ ^[0-9]+$ ]] \
      || die "number of rounds must be a non-negative integer: ${rounds}"

    # Enter the Nutch directory first and derive the binary path from the
    # resulting working directory, so a relative <nutch directory> still
    # resolves after the cd.
    cd -- "$1" || die "cannot cd to nutch directory: $1"
    local nutch_bin="${PWD}/bin/nutch"

    local crawl_db="${crawl_dir}/db"
    local crawl_segments="${crawl_dir}/segments"

    if [[ -z "${JAVA_HOME:-}" ]]; then
      export JAVA_HOME=/usr/lib/jvm/default-java
    fi

    local round last_segment
    # 10# forces base 10 so a value such as "08" is not parsed as octal.
    for (( round = 1; round <= 10#${rounds}; round++ )); do
      # If generate fails, no new segment exists; continuing would pick up
      # the previous round's segment and fetch/parse/update it a second time.
      if ! run "${nutch_bin}" generate "${crawl_db}" "${crawl_segments}" \
          -topN 250000 -numFetchers 5; then
        printf 'generate returned non-zero in round %d; stopping crawl rounds\n' \
          "${round}" >&2
        break
      fi

      # The newest segment is taken to be the last entry of the listing;
      # field 8 of each `hdfs dfs -ls` line is the path.
      last_segment=$(hdfs dfs -ls "${crawl_segments}" | tail -n 1 | awk '{print $8}')
      [[ -n "${last_segment}" ]] \
        || die "could not determine latest segment under ${crawl_segments}"

      run "${nutch_bin}" fetch "${last_segment}" -threads 20 \
        || die "fetch failed for ${last_segment}"
      run "${nutch_bin}" parse "${last_segment}" \
        || die "parse failed for ${last_segment}"
      run "${nutch_bin}" updatedb "${crawl_db}" "${last_segment}" \
        || die "updatedb failed for ${last_segment}"
    done

    run "${nutch_bin}" invertlinks "${link_db}" -dir "${crawl_segments}" \
      || die "invertlinks failed"

    # NUTCH_OPTS is passed through the environment of this one command only,
    # so it is logged and invoked by hand instead of through run().
    echo "NUTCH_OPTS=-Delastic.cluster=elasticsearch ${nutch_bin} index ${crawl_db} -linkdb ${link_db} -dir ${crawl_segments}"
    NUTCH_OPTS=-Delastic.cluster=elasticsearch "${nutch_bin}" index "${crawl_db}" \
      -linkdb "${link_db}" -dir "${crawl_segments}" \
      || die "index failed"

    run "${nutch_bin}" clean "${crawl_db}" || die "clean failed"
  }

  main "$@"
  exit 0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement