Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ---- down.sh
- #!/bin/bash
- # dwnin WORD
- # Fetch the dictionary page for WORD into a temporary file in the current
- # directory.  Exactly one line is printed on stdout:
- #   a temp-file path - the page was downloaded and contains an entry
- #   /dev/null        - neg/WORD or words/WORD already exists, or the page
- #                      contains 'no_entry_found' (neg/WORD is then created)
- #   an empty line    - transient failure (wget/gtimeout error or empty body)
- # Progress messages go to stderr.
- # Environment: FAILDELAY (optional, default 0) - seconds to sleep after a
- # transient failure, so a caller that retries on an empty answer is throttled.
- dwnin () {
-   local word=$1 page rc
-   if [ -f "neg/$word" ] || [ -f "words/$word" ]; then
-     echo "$word ..is neg" >&2
-     echo /dev/null
-     return
-   fi
-   if ! page=$(mktemp ./dwn.XXXXXX); then
-     echo "$word mktemp failed" >&2
-     sleep "${FAILDELAY:-0}"
-     echo ""
-     return
-   fi
-   # export http_proxy=$(head -n $(($RANDOM % $(wc -l <proxy.txt))) proxy.txt | tail -1)
-   gtimeout -k 5 60 wget \
-     --user-agent="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" \
-     --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" \
-     --header="Accept-Language: en-US,en;q=0.8,pl;q=0.6" \
-     --referer="http://prirucka.ujc.cas.cz/" \
-     --timeout=10 -q -O "$page" "http://slowniki.lingea.pl/Czesko-polski/$word"
-   # Capture the download status immediately; any later command overwrites $?.
-   rc=$?
-   if [ "$rc" -gt 0 ] || [ ! -s "$page" ]; then
-     echo "$word $rc tmp" >&2
-     rm -f -- "$page"
-     sleep "${FAILDELAY:-0}"
-     echo ""
-   elif grep -E -q 'no_entry_found' "$page"; then
-     echo "$word neg" >&2
-     : > "neg/$word"
-     rm -f -- "$page"
-     echo /dev/null
-   else
-     echo "$word ..ok" >&2
-     echo "$page"
-   fi
- }
- # dwn WORD
- # Download one word via dwnin, calling it again for as long as it answers
- # with an empty line.  A non-empty downloaded page is moved to
- # defs/<md5 of the page> and words/WORD becomes a symlink to it.  Any other
- # answer records WORD as a miss by creating neg/WORD, unless words/WORD
- # already exists.
- dwn () {
-   local word=$1 page="" sum
-   echo "$word ..start"
-   while [ -z "$page" ]; do
-     page=$(dwnin "$word")
-   done
-   if [ -s "$page" ]; then
-     sum=$(md5sum "$page" | awk '{print $1}')
-     # Link only after the move succeeded, so words/ never gets a dangling link.
-     if mv -- "$page" "defs/$sum" && (cd words && ln -sf "../defs/$sum" "$word"); then
-       echo "$word ..ok" >&2
-     else
-       echo "$word ..store failed" >&2
-     fi
-   else
-     # dwnin also answers /dev/null for a word that already has words/WORD;
-     # such a word must not be flagged as a miss.
-     [ -f "words/$word" ] || touch "neg/$word"
-     if [ "$page" != /dev/null ]; then rm -f -- "$page"; fi
-   fi
- }
- # vrf
- # Filter the word list on stdin: print each non-empty word shorter than 16
- # characters for which neither neg/WORD nor words/WORD exists as a file.
- vrf () {
-   local word
-   awk 'length < 16 { print }' |
-     while read -r word; do
-       # An empty word can never be stored under neg/ or words/; drop it.
-       [ -n "$word" ] || continue
-       if [ ! -f "neg/$word" ] && [ ! -f "words/$word" ]; then
-         printf '%s\n' "$word"
-       fi
-     done
- }
- # Driver: export the worker functions and settings to child processes, fill
- # in defaults for unset or empty tunables, then pipe the filtered word list
- # from stdin into parallel, which runs dwn once per word.
- export -f dwnin
- export -f dwn
- export LD_PRELOAD
- : "${MAXPROC:=5}"   # value for parallel -j
- : "${DELAY:=1}"     # value for parallel --delay
- export FAILDELAY="${FAILDELAY:-0}"
- vrf | parallel --delay ${DELAY} -q --line-buffer -j${MAXPROC} dwn
- ---- defmake.sh
- #!/bin/sh
- # uruchamiać w words/
- # tylko na osx
- MACFIX='iconv -f utf-8-mac -t utf-8'
- # na u*x
- MACFIX=cat
- HSDICT=cs_CZ_u8
- for def in *; do
- echo ${def}
- TMPFILE=$(tempfile .)
- xmllint --html --xpath '//div[@id="Entry_update"]' ${def} 2>/dev/null | \
- sed -E -e 's/<input[^>]+>//g' \
- -e 's/span +class=.ssc_tt.>.<.span>//g' \
- -e 's/<br>/<br\/>/g' \
- -e 's/<img [^>]+>//g' \
- -e 's/<\/?(table|tr)[^>]*>//g' \
- -e 's/<(\/?)td[^>]*>/<\1div>/g' | \
- tidy -utf8 -wrap 0 -xml -asxml 2>/dev/null | \
- grep -v -E '(<!DOCTYPE|html>|head>|<meta|title>|body>)' \
- > ${TMPFILE}
- MD5=$(md5 -q ${TMPFILE})
- mv ${TMPFILE} ../df/${MD5}
- echo ${def} | ${MACFIX} | hunspell -s -d ${HSDICT} | awk 'NF > 0 {print $NF}' >> ../df/${MD5}.def
- done
- ---- bldmbp.sh
- #!/bin/sh
- # Run from inside words/.
- # Writes an HTML document to stdout.  For every *.def file in the current
- # directory it emits one <idx:entry> holding an <idx:orth> element per line
- # of that file (with the inflections printed by `wordforms` as <idx:infl>
- # elements), followed by the contents of the file of the same name without
- # the .def suffix.  Progress (the .def file name) goes to stderr.
- #
- # macOS only:
- MACFIX='iconv -f utf-8-mac -t utf-8'
- # Other Unix systems (this later assignment is the one in effect).
- # MACFIX is not referenced anywhere below.
- MACFIX=cat
- AFF=../../../../../../../../Library/Spelling/cs_CZ_u8.aff
- DIC=/Library/Spelling/cs_CZ_u8.dic
- cat << '__EOIN'
- <html>
- <head>
- <meta http-equiv="content-type"
- content="text/html; charset=utf-8" />
- <guide>
- <reference title="Title Page" type="title-page"
- filepos="0000000110" />
- </guide>
- </head>
- <body>
- __EOIN
- for def in *.def; do
-   # Skips the literal '*.def' when nothing matches.
-   [ -f "${def}" ] || continue
-   printf '%s\n' "${def}" >&2
-   echo '<idx:entry scriptable="yes">'
-   # Expects one headword per line.  The list is read on fd 3 so the
-   # commands in the loop body cannot consume it through stdin.
-   while read -r ort <&3 || [ -n "${ort}" ]; do
-     [ -n "${ort}" ] || continue
-     printf '<idx:orth value="%s">\n' "$(printf '%s\n' "${ort}" | xml esc)"
-     wordforms "${AFF}" "${DIC}" "${ort}" | xml esc | sed -e 's/^/<idx:infl value="/g' -e 's/$/" \/>/g'
-     echo '</idx:orth>'
-   done 3< "${def}"
-   cat -- "${def%.def}"
-   echo "</idx:entry>"
- done
- cat << '__EOOUT'
- </body>
- </html>
- __EOOUT
Advertisement
Add Comment
Please, Sign In to add comment