Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Limited support added for Yahoo/Bing caches. Slightly improved 404 handling for narou.dip.jp and ncode.syosetu.com.
# Google cache support was dropped: most entries expire too quickly and there is no timestamp to drive the processing decision; most of the old cache entries are gone already anyway.
# The script automatically fetches the works listed one per line, in nXXXXyy form, in load.v2.txt.
# With dry_run=1, the "people who favourited this novel also read these novels" links are excluded from the download.
# The default is dry_run=0, so both your own favourite works and the works reachable from them are downloaded.
# Author: Midori no Kitsune, (c) 2012, CC-SA license. If you improve it, please publish it!
- cat "load.v2.txt" | awk -v R="/cygdrive/c/archive/web/ncode-syosetu-com-lastday" '
function __lib_math__hex_to_base10(s, this,r,i,A){
    # Convert a hexadecimal string s (any case) to its decimal value.
    # The digit table deliberately omits "0": index() returns 0 for characters
    # not in the table, which is exactly the value of the digit zero (and any
    # non-hex character likewise contributes 0, matching the original behavior).
    r=0;
    A[0]=split(toupper(s),A,"");
    i=1;
    while(i<=A[0]){
        r = r*16 + index("123456789ABCDEF",A[i]);
        i++;
    }
    return r;
}
function __lib_network__urldecode(s,this,A,i,r){
    # Decode %XX percent-escapes in s; all other characters pass through
    # unchanged ("+" is NOT treated as a space, as in the original).
    # NOTE: like the original, a trailing "%" reads past the split array,
    # where missing entries are "" and decode to %c of 0.
    r="";
    A[0]=split(s,A,"");
    i=1;
    while(i<=A[0]){
        if(A[i]=="%"){
            r = r sprintf("%c",__lib_math__hex_to_base10(A[i+1] A[i+2]));
            i += 3;
        }else{
            r = r A[i];
            i += 1;
        }
    }
    return r;
}
BEGIN{
    # dry_run=1 restricts downloads to the works explicitly listed in
    # load.v2.txt (and skips the END-block queue rotation); default 0.
    dry_run=0;
    # Browser-like User-Agent, used only when querying the Yahoo search cache.
    GLOBAL_FAKEUA="Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.9) Gecko/20100317 SeaMonkey/2.0.4";
    # NOTE(review): LOAD is never read anywhere; the END block hard-codes
    # the literal "load.v2.txt" instead.
    LOAD="load.v2.txt";
    # Hard-coded IP of the narou.dip.jp mirror; requests set the Host: header
    # explicitly (see the wget --header calls below).
    IP_NAROU_DIP_JP="125.202.136.68";
}
- function _log(s){print s >> "error.log";print s;}
- function _(s){print s}
function grep( o, re, p, B, this, a,v ){
    # Minimal grep over file p into B[1..B[0]] (B[0] = match count, returned).
    #   o ~ "-v": lines NOT matching re
    #   o ~ "-o": every matched substring (possibly several per line)
    #   else    : lines matching re
    # BUGFIX: the file is now close()d after reading; previously a second
    # grep of the same filename resumed at EOF and silently returned 0.
    B[0]=0;
    if(o~"-v"){
        while((getline v < p)>0){if(!match(v,re))B[B[0]+=1]=v;}
        close(p);
        return B[0];
    };
    if(o~"-o"){
        while((getline v < p)>0){
            a=v;
            while(match(a,re)){
                B[B[0]+=1]=substr(a,RSTART,RLENGTH);
                a=substr(a,RSTART+RLENGTH);
            }
        };
        close(p);
        return B[0];
    };
    while((getline v < p)>0){if(match(v,re))B[B[0]+=1]=v;}
    close(p);
    return B[0];
}
- function cmd( E, A, this,v){A[0]=0;while((E |getline v)>0)A[A[0]+=1]=v;A["RETURN_CODE"]=close(E);}
- function _realpath(p,tty, this,A){p=(p=="")?tty["pwd"]:p;cmd("cd \""tty["pwd"]"\" && realpath \""p"\"",A);return A[1];}
- function _pwd(tty){if(tty["shell"]=="")_virtual_console(tty);return tty["pwd"];}
function _cd(p, tty, this,A,f){
    # Emulate `cd` for the virtual console state in tty.
    # p=="" means "~"; f flags an absolute path (no cd-into-pwd prefix needed).
    if(tty["shell"]=="")_virtual_console(tty);p=(p=="")?"~":p;f=(substr(p,1,1)=="/");
    # Only move when the target is an existing directory, then record the
    # canonical pwd reported by the shell. Returns the (possibly unchanged) pwd.
    if(system(((f)?"":"cd \""tty["pwd"]"\" && ")"[ -d \""p"\" ]")==0){
        cmd(((f)?"":"cd \""tty["pwd"]"\" && ")"cd \""p"\" && pwd",A);tty["pwd"]=A[1];}return tty["pwd"];
}
function _mkdir(p,tty, this,f){
    # Create directory p (relative to the virtual console's pwd unless absolute).
    # Returns 1 for an empty p, mkdir's exit status when the directory is
    # created, and 0 when something already exists at p.
    # BUGFIX: the original had an empty else{} branch and fell off the end
    # returning "" in the already-exists case; normalized to an explicit
    # `return 0` (numerically identical for callers).
    # NOTE(review): this function has no callers in the visible script.
    if(p=="")return 1;
    if(tty["shell"]=="")_virtual_console(tty);
    f=(substr(p,1,1)=="/");
    if(system(((f)?"":"cd \""tty["pwd"]"\" && ")"[ ! -e \""p"\" ]")==0){
        return system(((f)?"":"cd \""tty["pwd"]"\" && ")"mkdir \""p"\"");
    }
    return 0;
}
function _virtual_console(tty, this){
    # Initialize the minimal "virtual console" state: a shell name and a
    # canonical current directory.
    tty["shell"]="sh";
    # Seed pwd with "~" first: _realpath reads tty["pwd"] to build its cd
    # prefix, so the order of these two assignments matters.
    tty["pwd"]="~";
    tty["pwd"]=_realpath("~",tty);
}
function agrep( o, re, A, B, this, a, i,k ){
    # In-memory grep over array A[1..A[0]] into B[1..B[0]] (count returned).
    #   o ~ "-v": entries NOT matching re
    #   o ~ "-o": every matched substring (possibly several per entry)
    #   else    : entries matching re
    # BUGFIX: the "-o" branch appended via B[B[0]+=1] but then executed
    # `B[0]=k; return k` with k still 0, wiping the count and always
    # reporting zero matches. It now tracks k like the other branches.
    B[0]=0;k=0;
    if(o~"-v"){
        for(i=1;i<=A[0];i++){if(!match(A[i],re)) B[k+=1]=A[i];}
        B[0]=k;return k;
    };
    if(o~"-o"){
        for(i=1;i<=A[0];i++){
            a=A[i];
            while(match(a,re)){
                B[k+=1]=substr(a,RSTART,RLENGTH);
                a=substr(a,RSTART+RLENGTH);
            }
        };
        B[0]=k;return k;
    };
    for(i=1;i<=A[0];i++){if(match(A[i],re))B[k+=1]=A[i];};
    B[0]=k;return k;
}
- function dbg_printarray(ary , x , s,e, this , i ){x=(x=="")?"A":x;for(i=((s)?s:1);i<=((e)?e:ary[0]);i++){print x"["i"]=["ary[i]"]"}}
function bugfix_try_to_get_page_from_anywhere( url , ROOT , file_path, this , A, fake , p ,B,C , i, x, fakename ){
    # Fetch url into file_path; if the direct download fails, try to recover
    # the page from the Yahoo (Bing-backed) search cache instead.
    # Returns 0 on success, 1 when no cached copy could be obtained.
    # NOTE(review): ROOT, fake and fakename are never used in this function.
    cmd("wget -S -U \"\" -O \""file_path"\" \""url"\"",A);
    if(A["RETURN_CODE"]){
        #// 2048 == 404 (raw wait status of wget exit code 8 -- server error)
        # // try yahoo bing cache: search for the URL itself
        p=url;
        cmd("wget -S -U \""GLOBAL_FAKEUA"\" -O \""file_path".search\" \"http://search.yahoo.com/search?p="p"&fr=sfp&fr2=&iscqry=\"",A);
        # Pull every href="/..." out of the results page, then strip the
        # leading `href="` (6 chars) and the closing quote.
        grep("-o" , "href=[\"]/[^\"]*[\"]" , file_path".search" , B);for(i=1;i<=B[0];i++)B[i]=substr(B[i],7,length(B[i])-7);
        # Keep only links that look like cache-viewer links.
        agrep( "" , "cache" , B, C);
        if(C[0]==0){
            _log("# Error : Failed to find yahoo bing cache. type=[no cache found] file_path=["file_path"] url=["url"]");
            return 1;
        }
        # Yahoo cache links embed the original URL after "**", percent-encoded;
        # strip the viewer prefix and decode.
        x=C[1];
        sub("^/[^*]*/[*][*]http%3a","http%3a",x);
        x=__lib_network__urldecode(x);
        cmd("wget -S -U \""GLOBAL_FAKEUA"\" -O \""file_path".cache.html\" \""x"\"",A);
        if(A["RETURN_CODE"]){
            _log("# Error : Failed to dig up yahoo bing cache. file_path=["file_path"] url=["url"]");
            return 1;
        }
        # patch page to refer to cache (disabled: the cached HTML is moved
        # into place directly below instead of being wrapped)
        #fakename = substr(file_path, match( file_path, "[^/]*$" ), RLENGTH);
        #print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" > file_path;
        #print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" >> file_path;
        #print "<object data=\""fakename".cache.html\" type=\"text/html\" width=\"100%\" height=\"100%\">" >> file_path;
        #print "alt : <a href=\""fakename".cache.html\">"fakename".cache.html</a>" >> file_path;
        #print "</object>" >> file_path;
        #print "</body></html>" >> file_path;
        #close(file_path);
        # .generated marks the file as rebuilt from cache, not the origin server.
        system("touch \""file_path".generated\"");
        system("mv \""file_path".cache.html\" \""file_path"\"");
        return 0;
    }
    return 0;
}
function bugfix_try_to_get_leaf_from_anywhere(id, leaf_id , ROOT , leaf_path, this , A, fake , p ,B,C , i, url , fakename ){
    # Obtain chapter ("leaf") leaf_id of novel id into leaf_path, trying:
    #   1. ncode.syosetu.com directly
    #   2. the narou.dip.jp mirror's per-novel zip archive
    #   3. the Yahoo (Bing-backed) search cache
    # Returns 0 on success, 1 when every source failed.
    # Reads globals R, IP_NAROU_DIP_JP, GLOBAL_FAKEUA. NOTE: ROOT is unused.
    cmd("wget -S -U \"\" -O \""leaf_path"\" \"http://ncode.syosetu.com/"id"/"leaf_id"/\"",A);
    if(A["RETURN_CODE"]==2048){ # 2048 == raw wait status of wget exit 8 (server error, e.g. 404)
        # try narou.dip.jp
        fake=""R"/http/ncode.syosetu.com/"id"/"id".zip";
        A["RETURN_CODE"]=0;
        if(system("[ -s \""fake"\" ]")){
            # we do not have zip file, try to download
            cmd("wget --header=\"Host: narou.dip.jp\" -S -U \"\" --post-data=\"ncode="id"\" -O \""fake"\" \"http://"IP_NAROU_DIP_JP"/download/send.php\"",A);
        };
        if(A["RETURN_CODE"]==0){
            _("["id"] repaired, using narou.dip.jp cache");
            if(system("unzip -p \""fake"\" \""toupper(id)"/"toupper(id)"-"leaf_id".txt\" > \""leaf_path".txt\" ")==0){
                fakename = substr(leaf_path, match( leaf_path, "[^/]*$" ), RLENGTH);
                # Now patch up the leaf: minimal HTML wrapper around the extracted text.
                print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" > leaf_path;
                print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" >> leaf_path;
                print "<object data=\""fakename".txt\" type=\"text/plain\" width=\"100%\" height=\"100%\">" >> leaf_path;
                print "alt : <a href=\""fakename".txt\">"fakename".txt</a>" >> leaf_path;
                print "</object>" >> leaf_path;
                print "</body></html>" >> leaf_path;
                close(leaf_path);
                system("touch \""leaf_path".generated\"");
                return 0;
            }
            # seems like unzip failed
            # commented out, as we added yahoo cache as alternative source
            #return 1;
        }
        # // try yahoo bing cache
        p="http://ncode.syosetu.com/"id"/"((leaf_id)?""leaf_id"/":"");
        cmd("wget -S -U \""GLOBAL_FAKEUA"\" -O \""leaf_path".search\" \"http://search.yahoo.com/search?p="p"&fr=sfp&fr2=&iscqry=\"",A);
        grep("-o","href=[\"]/[^\"]*[\"]",leaf_path".search",B);for(i=1;i<=B[0];i++)B[i]=substr(B[i],7,length(B[i])-7);
        agrep( "", "cache", B, C);
        if(C[0]==0){
            # BUGFIX: previously fell through with url=="" and handed a bogus URL
            # to wget; bail out explicitly like bugfix_try_to_get_page_from_anywhere.
            _log("# Error : Failed to find yahoo bing cache. type=[no cache found] ncode=["id"]leaf_id=["leaf_id"]");
            return 1;
        }
        url=C[1];
        # Yahoo cache links embed the original URL after "**", percent-encoded.
        sub("^/[^*]*/[*][*]http%3a","http%3a",url);
        url=__lib_network__urldecode(url);
        cmd("wget -S -U \""GLOBAL_FAKEUA"\" -O \""leaf_path".cache.html\" \""url"\"",A);
        if(A["RETURN_CODE"]){
            # CONSISTENCY: record in error.log too (was a plain _() print).
            _log("# Error : Failed to dig up yahoo bing cache file id=["id"]leaf_id=["leaf_id"]url=["url"]");
            return 1;
        }
        # patch leaf to refer to cache
        print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" > leaf_path;
        print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" >> leaf_path;
        print "<object data=\""leaf_id".cache.html\" type=\"text/html\" width=\"100%\" height=\"100%\">" >> leaf_path;
        print "alt : <a href=\""leaf_id".cache.html\">"leaf_id".cache.html</a>" >> leaf_path;
        print "</object>" >> leaf_path;
        print "</body></html>" >> leaf_path;
        close(leaf_path);
        system("touch \""leaf_path".generated\"");
        return 0;
    };
    return 0;
}
function bugfix_check_leaf_and_fix(id, ROOT , this,B,D,i ,r ){
    # Verify that every chapter ("leaf") linked from ROOT/index.tmp exists as a
    # non-empty ROOT/<leaf>.html, re-fetching missing ones from any source.
    # Returns the number of leaves that could NOT be repaired (0 == all good).
    r=0;
    grep("-o","href=[\"][^\"]*[\"]",""ROOT"/index.tmp",B);
    # Strip the `href="` prefix (6 chars) and the closing quote.
    for(i=1;i<=B[0];i++)B[i]=substr(B[i],7,length(B[i])-7);
    D[0]=0;agrep("","^/n[0-9]*[^/]*[/][0-9]*",B,D); #//collect data of leaf
    # Drop the leading and trailing "/" -- assumes hrefs look like "/n4092c/1/"
    # (a missing trailing slash would corrupt the leaf id; TODO confirm).
    for(i=1;i<=D[0];i++)D[i]=substr(D[i],2,length(D[i])-2);
    # //D[x]=[n4092c/1]
    for(i=1;i<=D[0];i++)sub("^"id"/","",D[i]);
    # //D[x]=[1]
    for(i=1;i<=D[0];i++){
        if(system("[ -s \""ROOT"/"D[i]".html\" ]")){
            # File is empty, or it does not exist, meaning leaf is missing.
            if(bugfix_try_to_get_leaf_from_anywhere(id, D[i], ROOT ,""ROOT"/"D[i]".html" )){
                #_("Failed to obtain cache of ["id"] from anywhere...");
                _log("Failed to obtain cache of ncode=["id"]leaf_id=["D[i]"] from anywhere...");
                # indicate fail
                r=r+1;
            }
            # Nothing we can do for now. we might try "redirect.txt" if it is available, this is TODO
        }
    };
    #$ ROOT"/"D[i]".html" == ROOT"/1.html"
    return r;
}
{
    # Main per-record action: each input line of load.v2.txt carries one
    # ncode (e.g. "n4092c") in $1. Downloads the novel's index and chapters,
    # repairing from mirrors/caches as needed, and queues discovered novels.
    id=$1;
    _("id=["id"]")
    # NOTE(review): "n[0-9]*" is unanchored, so this accepts any id merely
    # containing "n"; "^n[0-9]+" was probably intended.
    if(id!~"n[0-9]*"){
        _("ID mismatch ["id"], skipping");
        next;
    };
    # complete.flag: fully downloaded on a previous run.
    if(system("[ -e \""R"/http/ncode.syosetu.com/"id"/complete.flag\" ]")==0){
        _("["id"] aready exists, skipping");
        next;
    };
    # broken.flag: known unrecoverable; never retried.
    if(system("[ -e \""R"/http/ncode.syosetu.com/"id"/broken.flag\" ]")==0){
        _("["id"] is broken, patch unavailable, skipping");
        next;
    };
    # Check for narou.dip.jp zip file -- counts as a complete copy.
    if(system("[ -s \""R"/http/ncode.syosetu.com/"id"/"id".zip\" ]")==0){
        _("["id"] aready exists, data available as narou.dip.jp zip file. skipping");
        system("touch \""R"/http/ncode.syosetu.com/"id"/complete.flag\" ");
        next;
    }
    # Check for index.tmp, if available, perform leaf check
    if(system("[ -s \""R"/http/ncode.syosetu.com/"id"/index.tmp\" ]")==0){
        if(bugfix_check_leaf_and_fix(id,R"/http/ncode.syosetu.com/"id )){
            _("["id"] does not have a valid leaf. Trying to repair node from cache");
            _log("["id"] is broken, and we were unable to find any patch available on the internet...");
            system("touch \""R"/http/ncode.syosetu.com/"id"/broken.flag\" ");
            next;
        };
        # NOTE(review): on a successful leaf check we still fall through and
        # re-download index.tmp below; the "index.tmp is missing" assumption
        # on the next comment does not hold on this path -- a `next` here may
        # have been intended. Verify before changing.
    }
    # we assume at this point, index.tmp is missing, meaning we do not have leaf index.
    system("[ ! -d \""R"/http/ncode.syosetu.com/"id"\" ] && mkdir -p \""R"/http/ncode.syosetu.com/"id"\"");
    out=R"/http/ncode.syosetu.com/"id"/index.tmp";
    cmd("wget -S -U \"\" -O \""out"\" \"http://ncode.syosetu.com/"id"/\"",A);
    if(A["RETURN_CODE"]==0){
        # We have index.tmp , Try to obtain leaf idx
        grep("-o","href=[\"][^\"]*[\"]", out ,B );for(i=1;i<=B[0];i++){B[i]=substr(B[i],7,length(B[i])-7);}
        C[0]=0;agrep("","ncode.syosetu.com/n[0-9]",B,C); #//collect data of sub node
        D[0]=0;agrep("","^/n[0-9]*[^/]*[/][0-9]*",B,D); #//collect data of leaf
        # //C[x]= http://ncode.syosetu.com/nXXXXyy/
        # //D[x]=[/n4092c/1/]
        # Obtain leaf
        for(i=1;i<=D[0];i++){
            # "/n4092c/1/" splits on "/" to ["","n4092c","1",""]; E[3] is the
            # chapter number.
            E[0]=split(D[i],E,"/");
            sub_out=R"/http/ncode.syosetu.com/"id"/"E[3]".html"
            cmd("wget -S -U \"\" -O \""sub_out"\" \"http://ncode.syosetu.com"D[i]"\"",A);
            if(A["RETURN_CODE"]){
                # We failed to get leaf. Maybe broken, or deleted while reading.. so we ask for cache, and dip server.
                if(bugfix_try_to_get_leaf_from_anywhere(id, E[3], R"/http/ncode.syosetu.com/"id"" ,R"/http/ncode.syosetu.com/"id"/"E[3]".html" )){
                    # We failed. Nothing we can do.
                    _log("# Failed ncode=["id"]leaf_id=["E[3]"], reason=[patch not available anywhere] ");
                    system("touch \""R"/http/ncode.syosetu.com/"id"/broken.flag\" ");
                    next;
                }
                # We assume leaf was somehow patched, or else we will not reach here
            }
            # move to next leaf...
        }
        # Add to waiting list.
        # substr(...,26,len-26) strips "http://ncode.syosetu.com/" (25 chars)
        # plus the trailing "/" -- assumes links use exactly that http:// prefix.
        for(i=1;i<=C[0];i++){
            print substr(C[i],26,length(C[i])-26) >> "wait.v2.txt";
        }
    }else{
        #//2048 == ncode.syosetu.com gave 404 Error, try narou.dip.jp
        index_path=R"/http/ncode.syosetu.com/"id"/index.tmp";
        narou_dip_zip=R"/http/ncode.syosetu.com/"id"/"id".zip";
        cmd("wget --header=\"Host: narou.dip.jp\" -S -U \"\" --post-data=\"ncode="id"\" -O \""narou_dip_zip"\" \"http://"IP_NAROU_DIP_JP"/download/send.php\"",A);
        if(A["RETURN_CODE"]){
            _log("# Error occoured while requesting from [narou.dip.jp]CODE=["A["RETURN_CODE"]"]");
            _log("# Failed ncode=["id"], reason=[narou.dip.jp does not have data] ");
            # try yahoo bing cache, and anything else possible
            # if(bugfix_try_to_get_page_from_anywhere( "http://ncode.syosetu.com/"id"/" , R"/http/ncode.syosetu.com/"id"" , R"/http/ncode.syosetu.com/"id"/index.tmp" )){
            # _log("Failed to obtain cache of ncode=["id"]type=[index] from anywhere...");
            # next;
            # }
            # TODO - figure out a way to obtain good index file from yahoo bing cache... TODO
            next;
        }
        # assume we have zip from narou.dip.jp
        _("["id"] repaired, using narou.dip.jp cache");
        system("unzip -p \""narou_dip_zip"\" \""toupper(id)"/"toupper(id)"-list.txt\" > \""index_path".txt\" ");
        # Now patch up the index.tmp file, by rebuilding bare minimum
        print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" > index_path;
        print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" >> index_path;
        cmd("cat \""index_path".txt\"",A);
        # Rebuild chapter links -- assumes line i of the mirror's list file
        # corresponds to chapter i; TODO confirm against the zip layout.
        for(i=1;i<=A[0];i++){
            if(A[i]!="")print "<a href=\"/"id"/"i"/\">"A[i]"</a><br>" >> index_path;
        }
        # ROOT/nXXXXyy/Z.html.txt
        print "</body></html>" >> index_path;
        close(index_path);
        system("touch \""index_path".generated\"");
        # build leaf redirect
        for(i=1;i<=A[0];i++){
            if(A[i]!=""){
                if(bugfix_try_to_get_leaf_from_anywhere(id, i, R"/http/ncode.syosetu.com/"id"" ,R"/http/ncode.syosetu.com/"id"/"i".html" )){
                    # We failed. Nothing we can do.
                    _log("# Failed ncode=["id"]leaf_id=["i"], reason=[patch not available anywhere] ");
                    system("touch \""R"/http/ncode.syosetu.com/"id"/broken.flag\" ");
                    next;
                }
            };
        }
        # hopefully, index.tmp (or a bare minimum bogus file with index.tmp.generated present is available
        # we also assume leaf is built,
    }
}
END{
    # dry_run: leave the work queues untouched.
    if(dry_run) exit;
    # Mark everything processed and deduplicate the done list.
    system("cat load.v2.txt >> done.v2.txt");
    system("cat done.v2.txt |sort |uniq > done.v2.txt.new");
    system("mv done.v2.txt.new done.v2.txt");
    # NOTE(review): wait.v2.txt is NOT deduplicated (line below is commented
    # out) and may not exist when nothing new was discovered, in which case
    # the mv fails and the old load.v2.txt is kept for the next run.
    #system("cat wait.v2.txt |sort |uniq > wait.v2.txt.new");
    # Rotate the discovered-works queue into the input list for the next run.
    system("mv wait.v2.txt load.v2.txt");
    # Chain into the next pipeline stage in the background.
    system("bash ./cache-generator.sh &");
}'
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement