Guest User

Untitled

a guest
Jun 7th, 2016
87
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ---- down.sh
  2. #!/bin/bash
  3.  
  # dwnin WORD
  #
  # Fetch the Czech-Polish dictionary page for WORD and report the outcome as
  # one line on stdout:
  #   /dev/null  WORD already has a file under neg/ or words/, or the server
  #              answered with "no_entry_found" (neg/WORD is created then)
  #   (empty)    transient failure: wget/gtimeout returned non-zero or the
  #              body was empty; the caller is expected to retry
  #   <path>     temporary file in the current directory holding the page
  # Progress messages go to stderr.
  # Environment: FAILDELAY - seconds to pause after a transient failure
  #              (default 0), so that retries do not hammer the server.
  dwnin () {
    local word=$1
    local page rc

    if [[ -f neg/"$word" || -f words/"$word" ]]; then
      echo "$word ..is neg" >&2
      echo /dev/null
      return 0
    fi

    # mktemp replaces the non-portable `tempfile` helper; the file is still
    # created in the current directory.
    if ! page=$(mktemp ./dwn.XXXXXX); then
      echo ""
      echo "$word mktemp failed" >&2
      sleep "${FAILDELAY:-0}"
      return 1
    fi

#    export http_proxy=$(head -n $(($RANDOM % $(wc -l <proxy.txt))) proxy.txt  | tail -1)
    gtimeout -k 5 60 wget \
      --user-agent="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" \
      --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" \
      --header="Accept-Language: en-US,en;q=0.8,pl;q=0.6" \
      --referer="http://prirucka.ujc.cas.cz/" \
      --timeout=10 -q -O "${page}" "http://slowniki.lingea.pl/Czesko-polski/${word}"
    # Capture the status immediately: any later command overwrites $?.
    rc=$?

    if (( rc > 0 )) || [[ ! -s "${page}" ]]; then
      echo ""
      echo "$word" "$rc" tmp >&2
      rm -f -- "${page}"
      sleep "${FAILDELAY:-0}"
    elif grep -E -q 'no_entry_found' "${page}"; then
      echo "$word" neg >&2
      # Remember the miss so the word is never requested again.
      :> neg/"$word"
      rm -f -- "${page}"
      echo /dev/null
    else
      echo "$word ..ok" >&2
      echo "${page}"
    fi
  }
  36.  
  # dwn WORD
  #
  # Download the page for WORD (retrying dwnin until it gives a verdict) and
  # file the result:
  #   - a non-empty page is moved to defs/<md5 of the page> and linked as
  #     words/WORD -> ../defs/<md5>
  #   - otherwise WORD is recorded as a miss by touching neg/WORD
  # Prints "WORD ..start" on stdout; other progress goes to stderr.
  # Returns non-zero when the page could not be hashed or moved.
  dwn () {
    local word=$1
    local fetched= digest

    echo "$word ..start"

    # dwnin prints an empty line on transient failures; keep asking.
    while [[ -z "$fetched" ]]; do
      fetched=$(dwnin "$word")
    done

    if [[ -s "$fetched" ]]; then
      digest=$(md5sum "$fetched" | awk '{print $1}')
      if [[ -z "$digest" ]]; then
        # Without a digest the mv below would target defs/ itself.
        echo "$word ..md5 failed" >&2
        rm -f -- "$fetched"
        return 1
      fi

      mv -- "$fetched" defs/"$digest" || return
      (cd words && ln -sf -- ../defs/"$digest" "$word")
      echo "$word ..ok" >&2
    else
      # dwnin also answers /dev/null for a word that already sits in words/
      # (for instance a duplicate input line); such a word must not be
      # recorded as negative.
      if [[ ! -f words/"$word" ]]; then
        touch neg/"$word"
      fi
      if [[ "$fetched" != /dev/null ]]; then
        rm -f -- "$fetched"
      fi
    fi
  }
  55.  
  # vrf
  #
  # Filter candidate words read from stdin, one per line: keep lines shorter
  # than 16 characters that have no file under neg/ or words/ yet, and print
  # them on stdout. Empty lines are dropped.
  vrf () {
    local word

    awk 'length < 16 { print }' \
      | while read -r word; do
          # -r keeps backslashes intact; default IFS still trims blanks.
          [[ -n "$word" ]] || continue
          if [[ ! -f neg/"$word" && ! -f words/"$word" ]]; then
            # printf, not echo: a word such as "-n" must be printed verbatim.
            printf '%s\n' "$word"
          fi
        done
  }
  62.  
  # Pass LD_PRELOAD and the two worker functions on to the child shells that
  # parallel starts.
  export LD_PRELOAD
  export -f dwn dwnin

  # Tunables; each keeps a value supplied by the caller and falls back to the
  # default when unset or empty.
  #   MAXPROC   - value handed to parallel -j
  #   DELAY     - value handed to parallel --delay
  #   FAILDELAY - exported for the worker functions
  : "${MAXPROC:=5}"
  : "${DELAY:=1}"
  : "${FAILDELAY:=0}"
  export FAILDELAY

  # Words still to be fetched come from stdin via vrf; each one becomes a dwn call.
  vrf \
    | parallel --delay ${DELAY} -q --line-buffer -j${MAXPROC} dwn
  81.  
  82.  
  83. ---- defmake.sh
  84. #!/bin/sh
  85.  
  # Run from inside words/.
  #
  # For every entry of the current directory: extract the
  # <div id="Entry_update"> element from the saved page, strip inputs, images
  # and table markup, tidy the result into XML and store it as ../df/<md5>.
  # The hunspell stems of the entry name are appended to ../df/<md5>.def.

  # OS X only: normalise decomposed (utf-8-mac) file names
  MACFIX='iconv -f utf-8-mac -t utf-8'
  # other Unix systems (this later assignment is the one in effect)
  MACFIX=cat

  HSDICT=cs_CZ_u8


  for def in *; do

    # Skip the literal '*' left by an empty directory, and dangling links.
    [ -e "${def}" ] || continue

    printf '%s\n' "${def}"

    # mktemp replaces the non-portable `tempfile` helper.
    TMPFILE=$(mktemp ./defmake.XXXXXX) || exit 1

    xmllint --html --xpath '//div[@id="Entry_update"]' "${def}" 2>/dev/null | \
      sed -E -e 's/<input[^>]+>//g' \
          -e 's/span +class=.ssc_tt.>.<.span>//g' \
          -e 's/<br>/<br\/>/g' \
          -e 's/<img [^>]+>//g' \
          -e 's/<\/?(table|tr)[^>]*>//g' \
          -e 's/<(\/?)td[^>]*>/<\1div>/g' | \
      tidy -utf8 -wrap 0 -xml -asxml  2>/dev/null  | \
      grep -v -E '(<!DOCTYPE|html>|head>|<meta|title>|body>)' \
      > "${TMPFILE}"

    # `md5 -q` exists on OS X/BSD only; fall back to md5sum elsewhere.
    if command -v md5 >/dev/null 2>&1; then
      MD5=$(md5 -q "${TMPFILE}")
    else
      MD5=$(md5sum "${TMPFILE}" | awk '{print $1}')
    fi

    # Without a digest the mv below would target ../df/ itself.
    if [ -z "${MD5}" ]; then
      printf '%s: cannot compute md5\n' "${def}" >&2
      rm -f -- "${TMPFILE}"
      continue
    fi

    mv -- "${TMPFILE}" "../df/${MD5}"
    # MACFIX is left unquoted on purpose: it holds a command with arguments.
    printf '%s\n' "${def}" | ${MACFIX} | hunspell -s -d "${HSDICT}" | awk 'NF > 0 {print $NF}' >> "../df/${MD5}.def"

  done
  119.  
  120. ---- bldmbp.sh
  121. #!/bin/sh
  122.  
  # Run from inside words/.
  # NOTE(review): the loop below reads *.def files from the current directory,
  # while defmake.sh writes them to ../df/ - confirm the intended directory.
  #
  # Writes an HTML document to stdout with one <idx:entry> per *.def file:
  # every stem listed in the .def file becomes an <idx:orth> element carrying
  # the inflected forms reported by wordforms as <idx:infl> children, followed
  # by the contents of the file named like the .def file without its suffix.
  # File names being processed are logged to stderr.

  # OS X only
  MACFIX='iconv -f utf-8-mac -t utf-8'
  # other Unix systems (this later assignment is the one in effect;
  # MACFIX is not used further down in this script)
  MACFIX=cat

  AFF=../../../../../../../../Library/Spelling/cs_CZ_u8.aff
  DIC=/Library/Spelling/cs_CZ_u8.dic

  printf '%s\n' \
    '<html>' \
    '<head>' \
    '<meta http-equiv="content-type"' \
    'content="text/html; charset=utf-8" />' \
    '<guide>' \
    '<reference title="Title Page" type="title-page"' \
    'filepos="0000000110" />' \
    '</guide>' \
    '</head>' \
    '<body>'


  for def in *.def; do
    # Skip the literal '*.def' left when nothing matches.
    [ -e "${def}" ] || continue

    printf '%s\n' "${def}" >&2
    echo '<idx:entry scriptable="yes">'
    # One stem per line; read on fd 3 so the commands inside the loop keep
    # the script's own stdin.
    while read -r ort <&3; do
      [ -n "${ort}" ] || continue
      printf '<idx:orth value="%s">\n' "$(printf '%s\n' "${ort}" | xml esc)"
      wordforms "${AFF}" "${DIC}" "${ort}" | xml esc | sed -e 's/^/<idx:infl value="/g' -e 's/$/" \/>/g'
      echo '</idx:orth>'
    done 3< "${def}"
    # The definition body lives next to the stem list, without the suffix.
    cat -- "${def%.def}"
    echo "</idx:entry>"
  done

  printf '%s\n' \
    '</body>' \
    '</html>'
RAW Paste Data