Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env bash
#
# Runs a fixed number of Nutch crawl rounds (generate -> fetch -> parse ->
# updatedb), then inverts links, indexes the crawl and cleans the crawl db.
#
# Usage:
#   <script> <nutch directory> <crawl path> <link-db path> <number of rounds>
#
# Arguments:
#   $1  Nutch installation directory (must contain bin/nutch)
#   $2  crawl path; "<crawl path>/db" and "<crawl path>/segments" are used
#   $3  link-db path
#   $4  number of generate/fetch/parse/updatedb rounds (non-negative integer)
#
# Environment:
#   JAVA_HOME  used as-is when set and non-empty; otherwise defaults to
#              /usr/lib/jvm/default-java.
#
# Exit status:
#   0 on success, 255 on wrong argument count, 1 when a step fails.

set -u
set -o pipefail # a failing `hdfs dfs -ls` must not be hidden by tail/awk

# Prints a message to stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Logs a nutch invocation to stdout, then runs it with the same arguments.
# Returns the exit status of the nutch command.
run_nutch() {
  printf '%s\n' "${NUTCH_BIN} $*"
  "${NUTCH_BIN}" "$@"
}

# Check the argument count before touching $1..$4 (required under `set -u`).
if [[ "$#" -ne 4 ]]; then
  printf '%s\n' \
    "Usage: $0 <nutch directory> <crawl path> <link-db path> <number of rounds>" >&2
  # 255 is the status bash produced for the former, out-of-range `exit -1`.
  exit 255
fi

NUTCH_DIR="$1"
CRAWL_DIR="$2"
LINK_DB="$3"
ROUNDS="$4"

# ROUNDS is evaluated arithmetically below; accept plain decimal digits only.
[[ "${ROUNDS}" =~ ^[0-9]+$ ]] \
  || die "Number of rounds must be a non-negative integer, got: ${ROUNDS}"
# Force base 10 so values such as "08" are not parsed as (invalid) octal.
TOTAL_ROUNDS=$((10#${ROUNDS}))

# The script changes into NUTCH_DIR below, so a relative directory must be
# made absolute first; otherwise NUTCH_BIN would be resolved against the new
# working directory and no longer point at the nutch launcher.
if [[ "${NUTCH_DIR}" != /* ]]; then
  NUTCH_DIR="${PWD}/${NUTCH_DIR}"
fi

NUTCH_BIN="${NUTCH_DIR}/bin/nutch"
CRAWL_DB="${CRAWL_DIR}/db"
CRAWL_SEGMENTS="${CRAWL_DIR}/segments"

export JAVA_HOME="${JAVA_HOME:-/usr/lib/jvm/default-java}"

cd "${NUTCH_DIR}" || die "Cannot change directory to ${NUTCH_DIR}"

for ((round = 1; round <= TOTAL_ROUNDS; round++)); do
  # If generate fails, no new segment exists; the lookup below would then
  # return a segment from an earlier round and fetch it again. Stop iterating
  # and continue with the post-processing of what has been crawled so far.
  if ! run_nutch generate "${CRAWL_DB}" "${CRAWL_SEGMENTS}" \
    -topN 250000 -numFetchers 5; then
    printf '%s\n' \
      "generate failed in round ${round}; skipping remaining rounds" >&2
    break
  fi

  # The segment to process is taken from column 8 of the last line of the
  # `hdfs dfs -ls` listing of the segments directory.
  LAST_SEGMENT=$(hdfs dfs -ls "${CRAWL_SEGMENTS}" | tail -n 1 | awk '{print $8}') \
    || die "Could not list segments in ${CRAWL_SEGMENTS}"
  [[ -n "${LAST_SEGMENT}" ]] || die "No segment found in ${CRAWL_SEGMENTS}"

  run_nutch fetch "${LAST_SEGMENT}" -threads 20 \
    || die "fetch failed for ${LAST_SEGMENT}"
  run_nutch parse "${LAST_SEGMENT}" \
    || die "parse failed for ${LAST_SEGMENT}"
  run_nutch updatedb "${CRAWL_DB}" "${LAST_SEGMENT}" \
    || die "updatedb failed for ${LAST_SEGMENT}"
done

run_nutch invertlinks "${LINK_DB}" -dir "${CRAWL_SEGMENTS}" \
  || die "invertlinks failed"

# NUTCH_OPTS is set for the index step only, so it is logged and passed
# explicitly instead of going through run_nutch.
printf '%s\n' \
  "NUTCH_OPTS=-Delastic.cluster=elasticsearch ${NUTCH_BIN} index ${CRAWL_DB} -linkdb ${LINK_DB} -dir ${CRAWL_SEGMENTS}"
NUTCH_OPTS=-Delastic.cluster=elasticsearch "${NUTCH_BIN}" index "${CRAWL_DB}" \
  -linkdb "${LINK_DB}" -dir "${CRAWL_SEGMENTS}" \
  || die "index failed"

run_nutch clean "${CRAWL_DB}" || die "clean failed"

exit 0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement