Advertisement
s243a

PT Bookmark Parser Ver 5.0

Dec 14th, 2018
276
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 19.48 KB | None | 0 0
  1. import sgmllib, urllib, urlparse, os, pickle, md5, base64
  2. import pdb, json #For debugging
  3. #https://stackoverflow.com/questions/3031045/how-come-string-maketrans-does-not-work-in-python-3-1
  4. from string import maketrans #Import might be slightly different in python 3
  5.  
  6. #https://stackoverflow.com/questions/9698614/super-raises-typeerror-must-be-type-not-classobj-for-new-style-class
  7. #https://stackoverflow.com/questions/9699591/instance-is-an-object-but-class-is-not-a-subclass-of-object-how-is-this-po/9699961#9699961
  8.  
  9. # https://linuxconfig.org/list-of-python-escape-sequence-characters-with-examples
  10. bashEncoding={"\a",r"\a", # alert (bell)
  11.               "\b",r"\b", # backspace
  12.  
  13.               "\f",r"\f", # form feed
  14.               "\n",r"\n", # new line
  15.               "\r",r"\r", # carriage return
  16.               "\t",r"\t", # horizontal tab
  17.               "\\",r"\\", # backslash
  18.               "\'",r"\'", # single quote
  19.               '\"',r'\"', # double quot#
  20.               "?",r"\?"}  # question mark
  21. def bashStrQuote(string,*unsafe):
  22.     out=BashString([])
  23.     for c in string:
  24.         out.append(bashChrQuote(c,unsafe))
  25.     return out
  26. def bashChrQuote(char,*unsafe):
  27.     if (len(unsafe)==0) or (char in unsafe):
  28.         if char in bashEnconding:
  29.             return bashEncoding[char]
  30.         else:
  31.             return char
  32.     else:
  33.         return char
  34. #              "\v",r"\v", # vertical tab        
  35. #              "\e",r"\e", #
  36. #              "\E",r"\E"' # an escape character
  37. #    \nnn   the eight-bit character whose value is the octal
  38. #           value nnn (one to three digits)
  39. #    \xHH   the eight-bit character whose value is the hexadecimal
  40. #           value HH (one or two hex digits)
  41. #    \uHHHH the Unicode (ISO/IEC 10646) character whose value is
  42. #           the hexadecimal value HHHH (one to four hex digits)
  43. #    \UHHHHHHHH the Unicode (ISO/IEC 10646) character whose value
  44. #               is the hexadecimal value HHHHHHHH (one to eight
  45. #               hex digits)
  46. #    \cx    a control-x character
  47. class BashString:
  48.     def __init__(self,string):
  49.         self.chars=[]
  50.         for c in string:
  51.             self.chars.append(string)
  52.     def append(self,c):
  53.         self.chars.append(c)
  54.     def __str__(self):
  55.         return ''.join(self.chars) #https://stackoverflow.com/questions/4435169/how-do-i-append-one-string-to-another-in-python
  56.                
  57. class SimplePathWatcher(object):
  58.     def __init__(self,root):
  59.         self.root=root
  60.         self.raw_data=None
  61.     def on_header(self,raw_data,obj=None,state=None):
  62.         if state=="More":
  63.             self.raw_data=self.raw_data+raw_data
  64.         else:
  65.             self.raw_data=raw_data
  66.     def on_enter(self):
  67.         pass
  68.     def on_exit(self):
  69.         pass
  70.     def get_Dir():
  71.         self.items[-1]['full']
  72. class DictList(list):
  73.     def __init__(self,*args):
  74.         super(DictList,self).__init__(args)
  75.     def append(self,**kw):
  76.         super(DictList,self).append(kw)
  77. alt_fn=lambda d,k,a: d.get(k,None) if d.get(k,None) is not None else a()
  78. def alt(d,k,a):
  79.     d[k]=alt_fn(d,k,a)
  80. intab = " _"
  81. outtab = "_~"
  82. transtab = maketrans(intab, outtab)
  83. def seq0(*args):
  84.     for arg in args:
  85.         arg()
  86.        
  87.          
  88. def setPathDefaults(obj):
  89.         obj.sep='/'
  90.         obj.MAX_PATH_LEN=200#typically is is 255 for the maxium path length in linux but we want to leave some room for the filename.
  91.         obj.MAX_NESTING=0 #0 Means no limit on the amount of nexted directories.    
  92. def getCallDel(kw,key,default,args):
  93.     fn=kw.get(key,default)
  94.     if fn is not None:
  95.         return fn(args)
  96.         del kw[key]
  97. def setKwAttr(obj,kw):
  98.     print(kw)
  99.     #https://stackoverflow.com/questions/5466618/too-many-values-to-unpack-iterating-over-a-dict-key-string-value-list
  100.     if hasattr(kw,'iteritems'): #Python 2 case
  101.         itterator=lambda kw: kw.iteritems()
  102.     else: #Python 3 case
  103.         itterator=lambda kw: kw.items()
  104.     print("kw="+str(kw))
  105.     for k,v in itterator(kw):
  106.         print("obj."+str(k)+"="+str(v))
  107.         setattr(obj,k,v)
  108.     #pdb.set_trace()
  109. def hasherDefulats():
  110.   return {\
  111.     'hasher':    md5.md5(),
  112.     'updateHash':lambda hasher,data: seq0(lambda: hasher.update("\n"),
  113.                                           lambda: hasher.update(data)),
  114.     'getDigest': lambda hasher: base64.b64encode(hasher.digest(),'~-')[0:22],
  115.     'copyHash':  lambda hasher: hasher.copy()}
  116.     #    
  117. def setHashWrapper(obj,kw,delete=True):
  118.     defaults=hasherDefulats()
  119.     for key in ('hasher','updateHash','getDigest','copyHash'):
  120.         val=kw.get(key,None)
  121.         if val is not None:
  122.             setattr(obj,key,val)
  123.             if delete==True:
  124.                 del kw[key]
  125.         else:
  126.             setattr(obj,key,defaults[key])
  127. class HashWrapper:
  128.     def __init__(self,kw,delete=True):
  129.         setHashWrapper(self,kw,delete=True)
  130.     def update(self,data):
  131.         return self.updateHash(self.hasher,data)
  132.     def digest(self):
  133.         return self.getDigest(self.hasher)
  134.     def copy(self):
  135.         kw={'hasher':self.copyHash(self.hasher),
  136.             'updateHash':self.updateHash,
  137.             'getDigest':self.getDigest,
  138.             'copyHash':self.copyHash}
  139.         return HashWrapper(kw)
  140. class DictList_For_EncodedPathWatcher(DictList):
  141.     def __init__(self,paths=[],**kw):
  142.         super(DictList,self).__init__(paths)
  143.         #setHasher(self)
  144.         self.hasher=HashWrapper(kw,delete=True)
  145.         if 'encoder' in kw.keys():
  146.             self.encoder=kw['encoder']
  147.         else:
  148.             self.encoder=lambda raw_name: urllib.quote(raw_name.translate(transtab), safe='()?-,'+r"'"+r'"') #+"'"+'"'      
  149.         #getCallDel(kw,'setHasher',setHasher,self)
  150.         self.before_append=kw.get('before_append',lambda path: None)
  151.         self.after_append=kw.get('after_append',lambda path: None)  
  152.         self.before_pop=kw.get('before_pop',lambda path: None)
  153.         self.after_pop=kw.get('after_pop',lambda path: None)    
  154.         self.root=kw.get('root',"/root/Downloads/pt")    
  155.  
  156.         getCallDel(kw,'setPathDefaults',setPathDefaults,self)
  157.  
  158.         setKwAttr(self,kw)      
  159.         for path in paths:
  160.             self.append(path)
  161.     def append(self,raw_name,**kw):
  162.         #alt(kw,'bname',self.updateBName(kw)) I think we need this
  163.         if raw_name is not None:
  164.             kw['raw_name']=raw_name  
  165.        
  166.         print("raw_name"+str(raw_name))
  167.  
  168.         alt(kw,'bname',lambda: self.encoder(kw['raw_name']))
  169.         if len(self)>0:
  170.             hasher=self[-1]['hasher'].copy()
  171.             alt(kw,'hasher',lambda: hasher)        
  172.             alt(kw,'nesting',lambda: self[-1]['nesting']+1)        
  173.             alt(kw,'full',lambda: self[-1]['full']+self.sep+kw['bname'])
  174.         else:
  175.             hasher=self.hasher.copy()
  176.             alt(kw,'hasher',lambda: hasher)        
  177.             alt(kw,'nesting',lambda: 1)        
  178.             alt(kw,'full',lambda: self.root+self.sep+kw['bname'])      
  179.         #if kw['nesting']>1:
  180.         folded=self.pathFold(kw)
  181.         #else
  182.         #    folded=False
  183.         if len(self)>0:
  184.             self.before_append(self[-1]['full'])
  185.         else:
  186.             pass #TODO, need to think about this
  187.         super(DictList,self).append(kw)
  188.         if folded:
  189.             self.mkLinks()  
  190.         self.after_append(self[-1]['full'])
  191.     def rename(ind=-1,**kw):
  192.         for key,value in kw:
  193.             self[-1][key]=value
  194.         #TODO, add logic for if one gives the bname here without the fullname or visa-versa
  195.     def pop(self):
  196.         last=self[-1]
  197.         self.before_pop(last['full'],self) #No point in return a result.
  198.         super(DictList,self).pop()
  199.        
  200.         if len(self)>0:
  201.             path=self[-1]['full']
  202.             self.after_pop(path,last)  
  203.         else: #This is probably pointless
  204.             path=self.root
  205.        
  206.         #return result #I don't think this does anytning useful.
  207.     def getFullname():
  208.         return self[-1].full                      
  209.     def pathFold(self,kw):
  210.         maxNesting=kw.get('maxNesting',self.MAX_NESTING)
  211.         root=kw.get('root',self.root)
  212.  
  213.         path=kw['full']
  214.         pathlen=len(path)
  215.         nesting=kw['nesting']
  216.         if (nesting<=maxNesting or maxNesting==0) and \
  217.            (pathlen<=self.MAX_PATH_LEN):
  218.             return False #No Folding Required.
  219.         else:
  220.             bname=kw['bname']
  221.             hasher=kw['hasher']
  222.             kw['bname']=bname+"-id"+hasher.digest()
  223.             kw['full']=self.root+self.sep+'1'+self.sep+kw['bname'] #TODO give more options for the wrap folder
  224.             return True
  225.             #mkLinks=kw.get('mkLinks',self.mkLinks)
  226.             #mkLinks(paths)  
  227.     def mkLinks(self):#new_fullpath,last_fullpath):
  228.            #dig=m.digest()
  229.            #dig2=base64.b64encode(s,'~-') #Alt chacters are tilda and dash like in freenet: https://github.com/freenet/wiki/wiki/Signed-Subspace-Key
  230.            #self.mkLinks(last.full,dig,dirname)
  231.            
  232.            new_fullpath = self[-1]['full']
  233.            last_fullpath=self[-2]['full']
  234.            bname=self[-1]['bname']
  235.            sep=self.sep
  236.            if not os.path.exists(new_fullpath):
  237.              print("Making directory "+new_fullpath)
  238.              os.makedirs(new_fullpath)
  239.            #TODO Possible bug producting duplicate paths
  240.            new_fullpath2=new_fullpath.replace("'",r"'\''")    #str(bashStrQuote(new_fullpath,"\'"))
  241.            last_fullpath2=last_fullpath.replace("'",r"'\''")   #str(bashStrQuote(last_fullpath,"\'"))
  242.            if not os.path.exists(new_fullpath+sep+"parent"):
  243.              last_fullpath_rel=os.popen("realpath --relative-to='"+new_fullpath2+"' '"\
  244.                                                                   +last_fullpath2+"'"\
  245.                                        ).read().rstrip('\n')
  246.              os.chdir(last_fullpath)    
  247.              print("last_fullpath="+last_fullpath2)
  248.              print("new_fullpath="+new_fullpath2)
  249.              print("last_fullpath_rel"+last_fullpath_rel)          
  250.              os.symlink(last_fullpath_rel,new_fullpath+sep+"parent")
  251.            print(str(last_fullpath+sep+bname))
  252.            if not os.path.exists(last_fullpath+sep+bname):
  253.              last_fullpath2=last_fullpath.replace("'",r"'\''")  #str(bashStrQuote(last_fullpath,"\'"))
  254.              new_fullpath2=new_fullpath.replace("'",r"'\''")    #str(bashStrQuote(new_fullpath,"\'"))  
  255.              new_fullpath_rel=os.popen("realpath --relative-to='"+last_fullpath2+"' '"\
  256.                                                                  +new_fullpath2+"'"\
  257.                                       ).read().rstrip('\n')
  258.              os.chdir(new_fullpath)    
  259.              os.symlink(new_fullpath_rel,last_fullpath+sep+bname)
  260.            #paths.rename(-1,bname=bname,full=new_fullpath)  
  261. #    def set_HashFn(hasher=None,updateHash=None,getDigest=None):
  262. #        if hasher is not None: self.hasher=hasher
  263. #        if updateHash is not None: self.updateHash=updateHash
  264. #        if getDigest is not None: self.getDigest=getDigest
  265. #        if copyHash is not None: self.copyHash=copyHash
  266. #        return self
  267.  
  268.  
  269. class EncodedPathWatcher(SimplePathWatcher):
  270.     def __init__(self,root,**kw):
  271.         #https://stackoverflow.com/questions/11179008/python-inheritance-typeerror-object-init-takes-no-parameters
  272.         super(EncodedPathWatcher,self).__init__(root)    
  273.         self.paths=DictList_For_EncodedPathWatcher([],**kw)
  274.     def on_enter(self,raw_data=None,bname=None,items=None,obj=None):
  275.         if raw_data is None:
  276.             raw_data=self.raw_data
  277.         self.paths.append(raw_data)
  278.     #def mkLinks(self,last,dirname,m)
  279.  
  280.     def on_exit(self):
  281.         self.paths.pop()
  282.     def encode(self,bname):
  283.         return self.encoder(self.bname,self)
  284.     def defaultEncoder(self,bname):
  285.         return urllib.quote(self.bname, safe='()?-,'+r"'"+r'"') #+"'"+'"'
  286.     #def defaultSymLast(
  287. def storeList(path,obj):
  288.         items=obj.items
  289.         print('storeList')
  290.         os.chdir(str(path))
  291.         print(os.getcwd())
  292.         pickle.dump(items,open( "list.p", "wb" ))
  293.         bname=os.path.basename(path)
  294.         if bname in ():
  295.             print(json.dumps(items,default=jdefault))
  296.             print("store at "+str(path))
  297.             pdb.set_trace()
  298.         obj.items=[]
  299. def mkdir(path):
  300.     if not os.path.exists(str(path)):
  301.         print("Making directory "+str(path))
  302.         os.makedirs(str(path))  
  303. #https://pythontips.com/2013/08/08/storing-and-loading-data-with-json/        
  304. def jdefault(o):
  305.     return o.__dict__        
  306. def loadList(path,obj):
  307.     items=obj.items
  308.     print('loadList')
  309.     print('path='+str(path))
  310.     os.chdir(str(path))
  311.     print(str(path))
  312.     print(os.getcwd())
  313.    
  314.     obj.items=pickle.load(open( "list.p", "rb" ))  
  315.     print(json.dumps(items,default=jdefault))  
  316.     #pdb.set_trace()
  317. class HTML_Link:
  318.     def __init__(self,**kw):
  319.         self.text=kw.get("text"," ")
  320.         self.href=kw.get("href",'/')
  321.         self.add_date=kw.get("add_date",'0')
  322.         self.linkType=kw.get("linkType",None)
  323.  
  324.     def toHTML(self,endSep=''):
  325.        
  326.         if self.linkType is None:
  327.             str='<A HREF="'+self.href+'" ADD_DATE="'+self.add_date+'">'+self.text+"</A>"+endSep
  328.             return str
  329.         elif self.linkType.upper()=="FOLDER":
  330.             str='<A HREF="'+self.href+'">'+self.text+"</A>"          
  331.             return 'Folder: '+str+endSep
  332. class Section:
  333.     def __init__(self,text):
  334.         self.text=text.replace('\n', ' ')
  335.     def toHTML(self,endSep=''):
  336.         return "<H3>"+self.text+"</H3><hr>"
  337. def linkToLastHTML(obj,path,last,paths=None):
  338.     if paths is not None:
  339.         sep=paths.sep
  340.     else:
  341.         sep="/"
  342.     items=obj.items
  343.     last_path=last['full']
  344.     #TODO improve security here and make the code more portable.
  345.     href=urllib.quote(os.popen("realpath --relative-to='"+str(bashStrQuote(path,"\'"))+"' '"\
  346.                                                          +str(bashStrQuote(last_path,"\'"))+"'"\
  347.                               ).read().rstrip('\n'))
  348.     href=href+sep+"index.html"
  349.     print("href="+href)
  350.     text=last['raw_name']
  351.     linkType='FOLDER'
  352.     items.append(HTML_Link(href=href,text=text,linkType=linkType))
  353.     print(items[-1].toHTML())
  354.     #pdb.set_trace()
  355. class BookMarkParser(sgmllib.SGMLParser):
  356.     def __init__(self,**kw):
  357.         sgmllib.SGMLParser.__init__(self)
  358.         self.STATE="__init__"
  359.         self.items=[]
  360.         self.watcher=kw.get('watcher',None) #This is the main watcher, we may add others
  361.  
  362.         if self.watcher is None:
  363.             self.watcher=EncodedPathWatcher(root="/root/Downloads/pt",
  364.                 before_append=lambda path: storeList(path,self),
  365.                 after_append=lambda path: mkdir(path),
  366.                 before_pop=lambda path,paths: HTMLWriter(self,paths).writeList(path),
  367.                 after_pop=lambda path,last: \
  368.                             seq0(lambda: loadList(path,self),
  369.                                  lambda: linkToLastHTML(self,path,last)
  370.                                 )                
  371.             )
  372.             #self.watcher.enterCB=lambda dir: self.storeList(dir)
  373.         self.watchers=[self.watcher]        
  374.        
  375.  
  376.     def NotifyHeaderWatchers(self,data):
  377.         for aWatcher in self.watchers:
  378.             aWatcher.on_header(data,obj=self)
  379.  
  380.     def start_h3(self, attributes):
  381.         print('start_H3')
  382.         self.STATE='Started H3'
  383.         for name, value in attributes:
  384.             print(name+"="+value)
  385.             if (value == 'FOLDED') or (name == 'folded'):
  386.                 self.STATE='FOLDED'
  387.     def handle_data(self,data):
  388.         print('handleData')
  389.         print("self.STATE="+self.STATE)
  390.         if self.STATE=='FOLDED':
  391.             self.NotifyHeaderWatchers(data)
  392.  
  393.             #https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory-in-python
  394.  
  395.             self.STATE="Seeking DL"
  396.  
  397.         if self.STATE=="A":
  398.            
  399.             self.A.text=data
  400.         if self.STATE=="DD":
  401.             data2=data.split('\n')[0].strip()
  402.             if len(data2)>0:
  403.                 self.items.append(Section(data2))
  404.     def end_h3(self): #Probably redundant
  405.         print('end_H3')
  406.         self.FOLDED=False
  407.     def start_dl(self, atributes):
  408.         if self.STATE!="__init__":
  409.             print('start_DL')
  410.             for watcher in self.watchers:
  411.                 watcher.on_enter()  
  412.             self.items=[]
  413.     def end_dl(self):
  414.         if self.STATE!="__init__":
  415.             print('end_DL')
  416.             for watcher in self.watchers:
  417.                 watcher.on_exit()          
  418.     def start_a(self,atributes):
  419.         if self.STATE!="__init__":
  420.             print('start_A')
  421.             self.A=HTML_Link()
  422.             for key,value in atributes:
  423.                 setattr(self.A,key,value)
  424.         self.STATE='A'
  425.     def end_a(self):
  426.         if self.STATE!="__init__":
  427.             print('end_A')
  428.             self.items.append(self.A)
  429.             self.A=None
  430.             self.STATE='Ended A'
  431.     def do_dd(self, atributes):
  432.         if self.STATE!="__init__":
  433.             print('do_DD')
  434.             self.STATE="DD"
  435.     def do_dt(self, atributes):
  436.         if self.STATE!="__init__":
  437.             print('do_DT')
  438.             self.STATE="DT"
  439. class HTMLWriter:
  440.     def __init__(self,obj,paths=None,maxParents=0):
  441.         self.items=obj.items
  442.         self.paths=paths
  443.         self.maxParents=0
  444.     def writeHTMLHeader(self,f):
  445.         print('WriteHTMLHeader')
  446.         f.write("%s\n" % '<!DOCTYPE html>')
  447.         f.write("%s\n" % '<html>')
  448.         f.write("%s\n" % '<body>')
  449.     def writePath(self,f):
  450.         f.write("<b>Reverse Path:</b> ")
  451.         count=1
  452.         paths=self.paths
  453.         f.write('<a href=".">'+paths[-1]['bname']+"</a>"+" | ")
  454.         href=".."
  455.         for i in range(len(paths)-2,-1,-1):
  456.             f.write('<a href="'+href+'/index.html">'+paths[i]['bname']+"</a>")
  457.             count=count+1
  458.             if (count>= self.maxParents) and (self.maxParents!=0):
  459.                 f.write("<br>")
  460.                 break
  461.             else:
  462.                 href=href+"/.." #TODO make this more platform independent
  463.                 f.write(" |")
  464.         f.write("<hr><br>")
  465.     def writeHTMLFooter(self,f):
  466.         print('writeHTMLFooter')
  467.         f.write("%s\n" % '</body>')        
  468.         f.write("%s\n" % '</html>')  
  469.     def writeList(self,path):
  470.         print('writeList')
  471.         os.chdir(str(path))
  472.         bname=os.path.basename(path)
  473.         if bname in ():
  474.             print("writeList "+bname)
  475.             print(json.dumps(self.items,default=jdefault))  
  476.             pdb.set_trace()
  477.             debug_break=True
  478.         else:
  479.             debug_break=False
  480.         with open('index.html', 'w') as f:
  481.             self.writeHTMLHeader(f)
  482.             self.writePath(f)
  483.             for item in self.items:
  484.                 if item is not None:
  485.                     out_str="%s\n" % item.toHTML('<br>')                        
  486.                 else:
  487.                     out_str="<b>Empty Item!!!!</b>"
  488.                 f.write(out_str)
  489.                 if debug_break:
  490.                     print("out_str="+out_str)
  491.                     pdb.set_trace()
  492.             self.writeHTMLFooter(f)        
  493.        
  494. p = BookMarkParser()
  495. filename='/root/Downloads/pearltrees_export.html'
  496. f = open(filename, "r")
  497. BUFSIZE = 8192
  498. while True:
  499.     #data = f.read(BUFSIZE)
  500.     data=f.readline()
  501.     print('data='+str(data))
  502.     if not data: break
  503.     p.feed(data)
  504. p.close(  )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement