Advertisement
Guest User

小説を読もう ダウソ スクリプト 緑の狐

a guest
Jul 21st, 2012
180
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Awk 14.46 KB | None | 0 0
  1. # 限定的にYahoo,bingキャッシュに対応してみたお。narou.dip.jpとncode.syosetu.comは404処理を少し改良。
  2. #google cacheは多くが流れが速いのとタイムスタンプでの処理判定ができないのでやめました。もう殆ど古いキャッシュ流れてるし。
  3. #後はnXXXXyy の形式で一行おきにload.v2.txtに記したリストを元に、自動で取得していきます。
  4. #dry_run=1で、「この小説をお気に入り登録している人はこんな小説も読んでいます」のリンクはダウンロードから除外されます。
  5. #通常はdry_run=0;なので、自分の好みの作品+それら作品から辿れる作品、の両方がダウンロードされる。
  6. #作:緑の狐 (c)2012 CC-SAライセンスです。改良したら公開してね!
  7. cat "load.v2.txt" | awk -v R="/cygdrive/c/archive/web/ncode-syosetu-com-lastday" '
  8. function __lib_math__hex_to_base10(s, this,r,i,A){
  9.   r=0;A[0]=split(toupper(s),A,"");for(i=1;i<=A[0];i++)r=(r*16)+index("123456789ABCDEF",A[i]);return r;
  10. }
  11. function __lib_network__urldecode(s,this,A,i,r){r="";A[0]=split(s,A,"");for(i=1;i<=A[0];i++){if(A[i]=="%"){
  12.  r=r""sprintf("%c",__lib_math__hex_to_base10(A[i+1]A[i+2]));i+=2;}else{r=r""A[i];}}return r;
  13. }
  14. BEGIN{
  15.   dry_run=0;
  16.   GLOBAL_FAKEUA="Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.9) Gecko/20100317 SeaMonkey/2.0.4";
  17.   LOAD="load.v2.txt";
  18.   IP_NAROU_DIP_JP="125.202.136.68";
  19. }
  20. function _log(s){print s >> "error.log";print s;}
  21. function _(s){print s}
  22. function grep( o, re, p, B, this, a,v ){
  23.  B[0]=0;if(o~"-v"){while((getline v < p)>0){if(!match(v,re))B[B[0]+=1]=v;}return B[0];};
  24.  if(o~"-o"){while((getline v < p)>0){a=v;while(match(a,re)){B[B[0]+=1]=substr(a,RSTART,RLENGTH);
  25.  a=substr(a,RSTART+RLENGTH);}};return B[0];};while((getline v < p)>0){if(match(v,re))B[B[0]+=1]=v;}return B[0];
  26. }
  27. function cmd( E, A, this,v){A[0]=0;while((E |getline v)>0)A[A[0]+=1]=v;A["RETURN_CODE"]=close(E);}
  28. function _realpath(p,tty, this,A){p=(p=="")?tty["pwd"]:p;cmd("cd \""tty["pwd"]"\" && realpath \""p"\"",A);return A[1];}
  29. function _pwd(tty){if(tty["shell"]=="")_virtual_console(tty);return tty["pwd"];}
  30. function _cd(p, tty, this,A,f){
  31.  if(tty["shell"]=="")_virtual_console(tty);p=(p=="")?"~":p;f=(substr(p,1,1)=="/");
  32.  if(system(((f)?"":"cd \""tty["pwd"]"\" && ")"[ -d \""p"\" ]")==0){
  33.  cmd(((f)?"":"cd \""tty["pwd"]"\" && ")"cd \""p"\" && pwd",A);tty["pwd"]=A[1];}return tty["pwd"];
  34. }
  35. function _mkdir(p,tty, this,f){if(p=="")return 1;
  36.  f=(substr(p,1,1)=="/");
  37.  if(tty["shell"]=="")_virtual_console(tty);
  38.  if(system(((f)?"":"cd \""tty["pwd"]"\" && ")"[ ! -e \""p"\" ]")==0){
  39.   return system(((f)?"":"cd \""tty["pwd"]"\" && ")"mkdir \""p"\"");
  40.  }else{
  41.  
  42.  }
  43. }
  44. function _virtual_console(tty, this){
  45.  tty["shell"]="sh";
  46.  tty["pwd"]="~";
  47.  tty["pwd"]=_realpath("~",tty);
  48. }
  49. function agrep( o, re, A, B, this, a, i,k ){
  50.  B[0]=0;k=0;if(o~"-v"){for(i=1;i<=A[0];i++){if(!match(A[i],re)) B[k+=1]=A[i];}B[0]=k;return k;};
  51.  if(o~"-o"){for(i=1;i<=A[0];i++){a=A[i];while(match(a,re)){B[B[0]+=1]=substr(a,RSTART,RLENGTH);a=substr(a,RSTART+RLENGTH);};
  52.  };B[0]=k;return k;};for(i=1;i<=A[0];i++){if(match(A[i],re))B[k+=1]=A[i];};B[0]=k;return k;
  53. }
  54. function dbg_printarray(ary , x , s,e, this , i ){x=(x=="")?"A":x;for(i=((s)?s:1);i<=((e)?e:ary[0]);i++){print x"["i"]=["ary[i]"]"}}
  55. function bugfix_try_to_get_page_from_anywhere( url , ROOT , file_path, this , A, fake , p ,B,C , i, x, fakename ){
  56.   cmd("wget -S -U \"\" -O \""file_path"\" \""url"\"",A);
  57.   if(A["RETURN_CODE"]){
  58.     #// 2048 == 404
  59.     # // try yahoo bing cache
  60.     p=url;
  61.     cmd("wget -S -U \""GLOBAL_FAKEUA"\" -O \""file_path".search\" \"http://search.yahoo.com/search?p="p"&fr=sfp&fr2=&iscqry=\"",A);
  62.     grep("-o" , "href=[\"]/[^\"]*[\"]" , file_path".search" , B);for(i=1;i<=B[0];i++)B[i]=substr(B[i],7,length(B[i])-7);
  63.     agrep( "" , "cache" , B, C);
  64.     if(C[0]==0){
  65.       _log("# Error : Failed to find yahoo bing cache. type=[no cache found] file_path=["file_path"] url=["url"]");
  66.       return 1;
  67.     }
  68.     x=C[1];
  69.     sub("^/[^*]*/[*][*]http%3a","http%3a",x);
  70.     x=__lib_network__urldecode(x);
  71.     cmd("wget -S -U \""GLOBAL_FAKEUA"\" -O \""file_path".cache.html\" \""x"\"",A);
  72.     if(A["RETURN_CODE"]){
  73.       _log("# Error : Failed to dig up yahoo bing cache. file_path=["file_path"] url=["url"]");
  74.       return 1;
  75.     }
  76.     # patch page to refer to cache
  77.     #fakename = substr(file_path, match( file_path, "[^/]*$" ), RLENGTH);
  78.     #print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" > file_path;
  79.     #print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" >> file_path;
  80.     #print "<object data=\""fakename".cache.html\" type=\"text/html\" width=\"100%\" height=\"100%\">" >> file_path;
  81.     #print "alt : <a href=\""fakename".cache.html\">"fakename".cache.html</a>" >> file_path;
  82.     #print "</object>" >> file_path;
  83.     #print "</body></html>" >> file_path;
  84.     #close(file_path);
  85.     system("touch \""file_path".generated\"");
  86.     system("mv \""file_path".cache.html\" \""file_path"\"");
  87.     return 0;
  88.   }
  89.   return 0;
  90. }
  91. function bugfix_try_to_get_leaf_from_anywhere(id, leaf_id , ROOT , leaf_path, this , A, fake , p ,B,C , i, url , fakename ){
  92.   cmd("wget -S -U \"\" -O \""leaf_path"\" \"http://ncode.syosetu.com/"id"/"leaf_id"/\"",A);
  93.   if(A["RETURN_CODE"]==2048){
  94.     # try narou.dip.jp
  95.     fake=""R"/http/ncode.syosetu.com/"id"/"id".zip";
  96.     A["RETURN_CODE"]=0;
  97.     if(system("[ -s \""fake"\" ]")){
  98.       # we do not have zip file, try to download
  99.       cmd("wget --header=\"Host: narou.dip.jp\" -S -U \"\" --post-data=\"ncode="id"\" -O \""fake"\" \"http://"IP_NAROU_DIP_JP"/download/send.php\"",A);
  100.     };
  101.     if(A["RETURN_CODE"]==0){
  102.       _("["id"] repaired, using narou.dip.jp cache");
  103.       if(system("unzip -p \""fake"\" \""toupper(id)"/"toupper(id)"-"leaf_id".txt\" > \""leaf_path".txt\" ")==0){
  104.         fakename = substr(leaf_path, match( leaf_path, "[^/]*$" ), RLENGTH);
  105.         # Now patch up the leaf
  106.         print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" > leaf_path;
  107.         print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" >> leaf_path;
  108.         print "<object data=\""fakename".txt\" type=\"text/plain\" width=\"100%\" height=\"100%\">" >> leaf_path;
  109.         print "alt : <a href=\""fakename".txt\">"fakename".txt</a>" >> leaf_path;
  110.         print "</object>" >> leaf_path;
  111.         print "</body></html>" >> leaf_path;
  112.         close(leaf_path);
  113.         system("touch \""leaf_path".generated\"");
  114.         return 0;
  115.       }
  116.       # seems like unzip failed
  117.       # commented out, as we added yahoo cache as alternative source
  118.       #return 1;
  119.     }
  120.     # // try yahoo bing cache
  121.     p="http://ncode.syosetu.com/"id"/"((leaf_id)?""leaf_id"/":"");
  122.     cmd("wget -S -U \""GLOBAL_FAKEUA"\" -O \""leaf_path".search\" \"http://search.yahoo.com/search?p="p"&fr=sfp&fr2=&iscqry=\"",A);
  123.     grep("-o","href=[\"]/[^\"]*[\"]",leaf_path".search",B);for(i=1;i<=B[0];i++)B[i]=substr(B[i],7,length(B[i])-7);
  124.     agrep( "", "cache", B, C);
  125.     url=C[1];
  126.     sub("^/[^*]*/[*][*]http%3a","http%3a",url);
  127.     url=__lib_network__urldecode(url);
  128.     cmd("wget -S -U \""GLOBAL_FAKEUA"\" -O \""leaf_path".cache.html\" \""url"\"",A);
  129.     if(A["RETURN_CODE"]){
  130.       _("# Error : Failed to dig up yahoo bing cache file id=["id"]leaf_id=["leaf_id"]url=["url"]");
  131.       return 1;
  132.     }
  133.     # patch leaf to refer to cache
  134.     print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" > leaf_path;
  135.     print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" >> leaf_path;
  136.     print "<object data=\""leaf_id".cache.html\" type=\"text/html\" width=\"100%\" height=\"100%\">" >> leaf_path;
  137.     print "alt : <a href=\""leaf_id".cache.html\">"leaf_id".cache.html</a>" >> leaf_path;
  138.     print "</object>" >> leaf_path;
  139.     print "</body></html>" >> leaf_path;
  140.     close(leaf_path);
  141.     system("touch \""leaf_path".generated\"");
  142.     return 0;
  143.   };
  144.   return 0;
  145. }
  146. function bugfix_check_leaf_and_fix(id, ROOT , this,B,D,i ,r  ){
  147.   r=0;
  148.   grep("-o","href=[\"][^\"]*[\"]",""ROOT"/index.tmp",B);
  149.   for(i=1;i<=B[0];i++)B[i]=substr(B[i],7,length(B[i])-7);
  150.   D[0]=0;agrep("","^/n[0-9]*[^/]*[/][0-9]*",B,D); #//collect data of leaf
  151.   for(i=1;i<=D[0];i++)D[i]=substr(D[i],2,length(D[i])-2);
  152.   # //D[x]=[n4092c/1]
  153.   for(i=1;i<=D[0];i++)sub("^"id"/","",D[i]);
  154.   # //D[x]=[1]
  155.   for(i=1;i<=D[0];i++){
  156.     if(system("[ -s \""ROOT"/"D[i]".html\" ]")){
  157.       # File is empty, or it does not exists, meaning leaf is missing
  158.       if(bugfix_try_to_get_leaf_from_anywhere(id, D[i], ROOT ,""ROOT"/"D[i]".html" )){
  159.         #_("Failed to obtain cache of ["id"] from anywhere...");
  160.         _log("Failed to obtain cache of ncode=["id"]leaf_id=["D[i]"] from anywhere...");
  161.         # indicate fail
  162.         r=r+1;
  163.       }
  164.       # Nothing we can do for now. we might try "redirect.txt" if it is available, this is TODO
  165.     }
  166.   };
  167.   #$ ROOT"/"D[i]".html" == ROOT"/1.html"
  168.   return r;
  169. }
  170. {
  171.   id=$1;
  172.   _("id=["id"]")
  173.   if(id!~"n[0-9]*"){
  174.    _("ID mismatch ["id"], skipping");
  175.    next;
  176.   };
  177.   if(system("[ -e \""R"/http/ncode.syosetu.com/"id"/complete.flag\" ]")==0){
  178.    _("["id"] aready exists, skipping");
  179.    next;
  180.   };
  181.   if(system("[ -e \""R"/http/ncode.syosetu.com/"id"/broken.flag\" ]")==0){
  182.    _("["id"] is broken, patch unavailable, skipping");
  183.    next;
  184.   };
  185.   # Check for narou.dip.jp zip file
  186.   if(system("[ -s \""R"/http/ncode.syosetu.com/"id"/"id".zip\" ]")==0){
  187.     _("["id"] aready exists, data available as narou.dip.jp zip file. skipping");
  188.     system("touch \""R"/http/ncode.syosetu.com/"id"/complete.flag\" ");
  189.     next;
  190.   }
  191.   # Check for index.tmp, if available, perform leaf check
  192.   if(system("[ -s \""R"/http/ncode.syosetu.com/"id"/index.tmp\" ]")==0){
  193.     if(bugfix_check_leaf_and_fix(id,R"/http/ncode.syosetu.com/"id )){
  194.       _("["id"] does not have a valid leaf. Trying to repair node from cache");
  195.       _log("["id"] is broken, and we were unable to find any patch available on the internet...");
  196.       system("touch \""R"/http/ncode.syosetu.com/"id"/broken.flag\" ");
  197.       next;
  198.     };
  199.   }
  200.   # we assume at this point, index.tmp is missing, meaning we do not have leaf index.
  201.   system("[ ! -d \""R"/http/ncode.syosetu.com/"id"\" ] && mkdir -p \""R"/http/ncode.syosetu.com/"id"\"");
  202.   out=R"/http/ncode.syosetu.com/"id"/index.tmp";
  203.   cmd("wget -S -U \"\" -O \""out"\" \"http://ncode.syosetu.com/"id"/\"",A);
  204.   if(A["RETURN_CODE"]==0){
  205.     # We have index.tmp , Try to obtain leaf idx
  206.     grep("-o","href=[\"][^\"]*[\"]", out ,B );for(i=1;i<=B[0];i++){B[i]=substr(B[i],7,length(B[i])-7);}
  207.     C[0]=0;agrep("","ncode.syosetu.com/n[0-9]",B,C); #//collect data of sub node
  208.     D[0]=0;agrep("","^/n[0-9]*[^/]*[/][0-9]*",B,D); #//collect data of leaf
  209.     # //C[x]= http://ncode.syosetu.com/nXXXXyy/
  210.     # //D[x]=[/n4092c/1/]
  211.     # Obtain leaf
  212.     for(i=1;i<=D[0];i++){
  213.       E[0]=split(D[i],E,"/");
  214.       sub_out=R"/http/ncode.syosetu.com/"id"/"E[3]".html"
  215.       cmd("wget -S -U \"\" -O \""sub_out"\" \"http://ncode.syosetu.com"D[i]"\"",A);
  216.       if(A["RETURN_CODE"]){
  217.         # We failed to get leaf. Maybe broken, or deleted while reading.. so we ask for cache, and dip server.
  218.         if(bugfix_try_to_get_leaf_from_anywhere(id, E[3], R"/http/ncode.syosetu.com/"id"" ,R"/http/ncode.syosetu.com/"id"/"E[3]".html" )){
  219.           # We failed. Nothing we can do.
  220.           _log("# Failed ncode=["id"]leaf_id=["E[3]"], reason=[patch not available anywhere] ");
  221.           system("touch \""R"/http/ncode.syosetu.com/"id"/broken.flag\" ");
  222.           next;
  223.         }
  224.         # We assume leaf was somehow patched, or else we will not reach here
  225.       }
  226.       # move to next leaf...
  227.     }
  228.     # Add to waiting list
  229.     for(i=1;i<=C[0];i++){
  230.       print substr(C[i],26,length(C[i])-26) >> "wait.v2.txt";
  231.     }
  232.   }else{
  233.     #//2048 == ncode.syosetu.com gave 404 Error, try narou.dip.jp
  234.     index_path=R"/http/ncode.syosetu.com/"id"/index.tmp";
  235.     narou_dip_zip=R"/http/ncode.syosetu.com/"id"/"id".zip";
  236.     cmd("wget --header=\"Host: narou.dip.jp\" -S -U \"\" --post-data=\"ncode="id"\" -O \""narou_dip_zip"\" \"http://"IP_NAROU_DIP_JP"/download/send.php\"",A);
  237.     if(A["RETURN_CODE"]){
  238.       _log("# Error occoured while requesting from [narou.dip.jp]CODE=["A["RETURN_CODE"]"]");
  239.       _log("# Failed ncode=["id"], reason=[narou.dip.jp does not have data] ");
  240.       # try yahoo bing cache, and anything else possible
  241.       # if(bugfix_try_to_get_page_from_anywhere( "http://ncode.syosetu.com/"id"/" , R"/http/ncode.syosetu.com/"id"" , R"/http/ncode.syosetu.com/"id"/index.tmp" )){
  242.       #   _log("Failed to obtain cache of ncode=["id"]type=[index] from anywhere...");
  243.       #   next;
  244.       # }
  245.       # TODO - figure out a way to obtain good index file from yahoo bing cache... TODO
  246.       next;
  247.     }
  248.     # assume we have zip from narou.dip.jp
  249.     _("["id"] repaired, using narou.dip.jp cache");
  250.     system("unzip -p \""narou_dip_zip"\" \""toupper(id)"/"toupper(id)"-list.txt\" > \""index_path".txt\" ");
  251.     # Now patch up the index.tmp file, by rebuilding bare minimum
  252.     print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" > index_path;
  253.     print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" >> index_path;
  254.     cmd("cat \""index_path".txt\"",A);
  255.     for(i=1;i<=A[0];i++){
  256.       if(A[i]!="")print "<a href=\"/"id"/"i"/\">"A[i]"</a><br>" >> index_path;
  257.     }
  258.     # ROOT/nXXXXyy/Z.html.txt
  259.     print "</body></html>" >> index_path;
  260.     close(index_path);
  261.     system("touch \""index_path".generated\"");
  262.     # build leaf redirect
  263.     for(i=1;i<=A[0];i++){
  264.       if(A[i]!=""){
  265.         if(bugfix_try_to_get_leaf_from_anywhere(id, i, R"/http/ncode.syosetu.com/"id"" ,R"/http/ncode.syosetu.com/"id"/"i".html" )){
  266.           # We failed. Nothing we can do.
  267.           _log("# Failed ncode=["id"]leaf_id=["i"], reason=[patch not available anywhere] ");
  268.           system("touch \""R"/http/ncode.syosetu.com/"id"/broken.flag\" ");
  269.           next;
  270.         }
  271.       };
  272.     }
  273.     # hopefully, index.tmp (or a bare minimum bogus file with index.tmp.generated present is available
  274.     # we also assume leaf is built,
  275.   }
  276. }
  277. END{
  278.   if(dry_run) exit;
  279.   system("cat load.v2.txt >> done.v2.txt");
  280.   system("cat done.v2.txt |sort |uniq > done.v2.txt.new");
  281.   system("mv done.v2.txt.new done.v2.txt");
  282.   #system("cat wait.v2.txt |sort |uniq > wait.v2.txt.new");
  283.   system("mv wait.v2.txt load.v2.txt");
  284.   system("bash ./cache-generator.sh &");
  285. }'
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement