Advertisement
s243a

PT Bookmark Parser Ver 4.0

Dec 12th, 2018
334
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 16.28 KB | None | 0 0
  1. import sgmllib, urllib, urlparse, os, pickle, md5, base64
  2. import pdb, json #For debugging
  3. #https://stackoverflow.com/questions/3031045/how-come-string-maketrans-does-not-work-in-python-3-1
  4. from string import maketrans #Import might be slightly different in python 3
  5.  
  6. #https://stackoverflow.com/questions/9698614/super-raises-typeerror-must-be-type-not-classobj-for-new-style-class
  7. #https://stackoverflow.com/questions/9699591/instance-is-an-object-but-class-is-not-a-subclass-of-object-how-is-this-po/9699961#9699961
  8. class SimplePathWatcher(object):
  9.     def __init__(self,root):
  10.         self.root=root
  11.         self.raw_data=None
  12.     def on_header(self,raw_data,obj=None,state=None):
  13.         if state=="More":
  14.             self.raw_data=self.raw_data+raw_data
  15.         else:
  16.             self.raw_data=raw_data
  17.     def on_enter(self):
  18.         pass
  19.     def on_exit(self):
  20.         pass
  21.     def get_Dir():
  22.         self.items[-1]['full']
  23. class DictList(list):
  24.     def __init__(self,*args):
  25.         super(DictList,self).__init__(args)
  26.     def append(self,**kw):
  27.         super(DictList,self).append(kw)
  28. alt_fn=lambda d,k,a: d.get(k,None) if d.get(k,None) is not None else a()
  29. def alt(d,k,a):
  30.     d[k]=alt_fn(d,k,a)
  31. intab = " _"
  32. outtab = "_~"
  33. transtab = maketrans(intab, outtab)
  34. def seq0(*args):
  35.     for arg in args:
  36.         arg()
  37.        
  38.          
  39. def setPathDefaults(obj):
  40.         obj.sep='/'
  41.         obj.MAX_PATH_LEN=200#typically is is 255 for the maxium path length in linux but we want to leave some room for the filename.
  42.         obj.MAX_NESTING=0 #0 Means no limit on the amount of nexted directories.    
  43. def getCallDel(kw,key,default,args):
  44.     fn=kw.get(key,default)
  45.     if fn is not None:
  46.         return fn(args)
  47.         del kw[key]
  48. def setKwAttr(obj,kw):
  49.     print(kw)
  50.     #https://stackoverflow.com/questions/5466618/too-many-values-to-unpack-iterating-over-a-dict-key-string-value-list
  51.     if hasattr(kw,'iteritems'): #Python 2 case
  52.         itterator=lambda kw: kw.iteritems()
  53.     else: #Python 3 case
  54.         itterator=lambda kw: kw.items()
  55.     print("kw="+str(kw))
  56.     for k,v in itterator(kw):
  57.         print("obj."+str(k)+"="+str(v))
  58.         setattr(obj,k,v)
  59.     #pdb.set_trace()
  60. def hasherDefulats():
  61.   return {\
  62.     'hasher':    md5.md5(),
  63.     'updateHash':lambda hasher,data: seq0(lambda: hasher.update("\n"),
  64.                                           lambda: hasher.update(data)),
  65.     'getDigest': lambda hasher: base64.b64encode(hasher.digest(),'~-')[0:22],
  66.     'copyHash':  lambda hasher: hasher.copy()}
  67.     #    
  68. def setHashWrapper(obj,kw,delete=True):
  69.     defaults=hasherDefulats()
  70.     for key in ('hasher','updateHash','getDigest','copyHash'):
  71.         val=kw.get(key,None)
  72.         if val is not None:
  73.             setattr(obj,key,val)
  74.             if delete==True:
  75.                 del kw[key]
  76.         else:
  77.             setattr(obj,key,defaults[key])
  78. class HashWrapper:
  79.     def __init__(self,kw,delete=True):
  80.         setHashWrapper(self,kw,delete=True)
  81.     def update(self,data):
  82.         return self.updateHash(self.hasher,data)
  83.     def digest(self):
  84.         return self.getDigest(self.hasher)
  85.     def copy(self):
  86.         kw={'hasher':self.copyHash(self.hasher),
  87.             'updateHash':self.updateHash,
  88.             'getDigest':self.getDigest,
  89.             'copyHash':self.copyHash}
  90.         return HashWrapper(kw)
  91. class DictList_For_EncodedPathWatcher(DictList):
  92.     def __init__(self,paths=[],**kw):
  93.         super(DictList,self).__init__(paths)
  94.         #setHasher(self)
  95.         self.hasher=HashWrapper(kw,delete=True)
  96.         if 'encoder' in kw.keys():
  97.             self.encoder=kw['encoder']
  98.         else:
  99.             self.encoder=lambda raw_name: urllib.quote(raw_name.translate(transtab), safe='()?-,'+"'"+'"')      
  100.         #getCallDel(kw,'setHasher',setHasher,self)
  101.         self.before_append=kw.get('before_append',lambda path: None)
  102.         self.after_append=kw.get('after_append',lambda path: None)  
  103.         self.before_pop=kw.get('before_pop',lambda path: None)
  104.         self.after_pop=kw.get('after_pop',lambda path: None)    
  105.         self.root=kw.get('root',"/root/Downloads/pt")    
  106.  
  107.         getCallDel(kw,'setPathDefaults',setPathDefaults,self)
  108.  
  109.         setKwAttr(self,kw)      
  110.         for path in paths:
  111.             self.append(path)
  112.     def append(self,raw_name,**kw):
  113.         #alt(kw,'bname',self.updateBName(kw)) I think we need this
  114.         if raw_name is not None:
  115.             kw['raw_name']=raw_name  
  116.        
  117.         print("raw_name"+str(raw_name))
  118.  
  119.         alt(kw,'bname',lambda: self.encoder(kw['raw_name']))
  120.         if len(self)>0:
  121.             hasher=self[-1]['hasher'].copy()
  122.             alt(kw,'hasher',lambda: hasher)        
  123.             alt(kw,'nesting',lambda: self[-1]['nesting']+1)        
  124.             alt(kw,'full',lambda: self[-1]['full']+self.sep+kw['bname'])
  125.         else:
  126.             hasher=self.hasher.copy()
  127.             alt(kw,'hasher',lambda: hasher)        
  128.             alt(kw,'nesting',lambda: 1)        
  129.             alt(kw,'full',lambda: self.root+self.sep+kw['bname'])      
  130.         #if kw['nesting']>1:
  131.         folded=self.pathFold(kw)
  132.         #else
  133.         #    folded=False
  134.         if len(self)>0:
  135.             self.before_append(self[-1]['full'])
  136.         else:
  137.             pass #TODO, need to think about this
  138.         super(DictList,self).append(kw)
  139.         if folded:
  140.             self.mkLinks()  
  141.         self.after_append(self[-1]['full'])
  142.     def rename(ind=-1,**kw):
  143.         for key,value in kw:
  144.             self[-1][key]=value
  145.         #TODO, add logic for if one gives the bname here without the fullname or visa-versa
  146.     def pop(self):
  147.         last=self[-1]
  148.         self.before_pop(last['full'],self) #No point in return a result.
  149.         super(DictList,self).pop()
  150.        
  151.         if len(self)>0:
  152.             path=self[-1]['full']
  153.         else:
  154.             path=self.root
  155.         self.after_pop(path,last)  
  156.         #return result #I don't think this does anytning useful.
  157.     def getFullname():
  158.         return self[-1].full                      
  159.     def pathFold(self,kw):
  160.         maxNesting=kw.get('maxNesting',self.MAX_NESTING)
  161.         root=kw.get('root',self.root)
  162.  
  163.         path=kw['full']
  164.         pathlen=len(path)
  165.         nesting=kw['nesting']
  166.         if (nesting<=maxNesting or maxNesting==0) and \
  167.            (pathlen<=self.MAX_PATH_LEN):
  168.             return False #No Folding Required.
  169.         else:
  170.             bname=kw['bname']
  171.             hasher=kw['hasher']
  172.             kw['bname']=bname+"-id"+hasher.digest()
  173.             kw['full']=self.root+self.sep+'1'+self.sep+kw['bname'] #TODO give more options for the wrap folder
  174.             return True
  175.             #mkLinks=kw.get('mkLinks',self.mkLinks)
  176.             #mkLinks(paths)  
  177.     def mkLinks(self):#new_fullpath,last_fullpath):
  178.            #dig=m.digest()
  179.            #dig2=base64.b64encode(s,'~-') #Alt chacters are tilda and dash like in freenet: https://github.com/freenet/wiki/wiki/Signed-Subspace-Key
  180.            #self.mkLinks(last.full,dig,dirname)
  181.            
  182.            new_fullpath = self[-1]['full']
  183.            last_fullpath=self[-2]['full']
  184.            bname=self[-1]['bname']
  185.            sep=self.sep
  186.            if not os.path.exists(new_fullpath):
  187.              print("Making directory "+new_fullpath)
  188.              os.makedirs(new_fullpath)
  189.            #TODO Possible bug producting duplicate paths
  190.            if not os.path.exists(new_fullpath+sep+"parent"):
  191.              os.symlink(last_fullpath,new_fullpath+sep+"parent")
  192.            print(str(last_fullpath+sep+bname))
  193.            if not os.path.exists(last_fullpath+sep+bname):
  194.              os.symlink(new_fullpath,last_fullpath+sep+bname)
  195.            #paths.rename(-1,bname=bname,full=new_fullpath)  
  196. #    def set_HashFn(hasher=None,updateHash=None,getDigest=None):
  197. #        if hasher is not None: self.hasher=hasher
  198. #        if updateHash is not None: self.updateHash=updateHash
  199. #        if getDigest is not None: self.getDigest=getDigest
  200. #        if copyHash is not None: self.copyHash=copyHash
  201. #        return self
  202.  
  203.  
  204. class EncodedPathWatcher(SimplePathWatcher):
  205.     def __init__(self,root,**kw):
  206.         #https://stackoverflow.com/questions/11179008/python-inheritance-typeerror-object-init-takes-no-parameters
  207.         super(EncodedPathWatcher,self).__init__(root)    
  208.         self.paths=DictList_For_EncodedPathWatcher([],**kw)
  209.     def on_enter(self,raw_data=None,bname=None,items=None,obj=None):
  210.         if raw_data is None:
  211.             raw_data=self.raw_data
  212.         self.paths.append(raw_data)
  213.     #def mkLinks(self,last,dirname,m)
  214.  
  215.     def on_exit(self):
  216.         self.paths.pop()
  217.     def encode(self,bname):
  218.         return self.encoder(self.bname,self)
  219.     def defaultEncoder(self,bname):
  220.         return urllib.quote(self.bname, safe='()?-,'+"'"+'"')
  221.     #def defaultSymLast(
  222. def storeList(path,obj):
  223.         items=obj.items
  224.         print('storeList')
  225.         os.chdir(str(path))
  226.         print(os.getcwd())
  227.         pickle.dump(items,open( "list.p", "wb" ))
  228.         bname=os.path.basename(path)
  229.         if bname in ():
  230.             print(json.dumps(items,default=jdefault))
  231.             print("store at "+str(path))
  232.             pdb.set_trace()
  233.         obj.items=[]
  234. def mkdir(path):
  235.     if not os.path.exists(str(path)):
  236.         print("Making directory "+str(path))
  237.         os.makedirs(str(path))  
  238. #https://pythontips.com/2013/08/08/storing-and-loading-data-with-json/        
  239. def jdefault(o):
  240.     return o.__dict__        
  241. def loadList(path,obj):
  242.     items=obj.items
  243.     print('loadList')
  244.     print('path='+str(path))
  245.     os.chdir(str(path))
  246.     print(str(path))
  247.     print(os.getcwd())
  248.    
  249.     obj.items=pickle.load(open( "list.p", "rb" ))  
  250.     print(json.dumps(items,default=jdefault))  
  251.     #pdb.set_trace()
  252. class HTML_Link:
  253.     def __init__(self,**kw):
  254.         self.text=kw.get("text"," ")
  255.         self.href=kw.get("href",'/')
  256.         self.add_date=kw.get("add_date",'0')
  257.         self.linkType=kw.get("linkType",None)
  258.  
  259.     def toHTML(self,endSep=''):
  260.        
  261.         if self.linkType is None:
  262.             str='<A HREF="'+self.href+'" ADD_DATE="'+self.add_date+'">'+self.text+"</A>"+endSep
  263.             return str
  264.         elif self.linkType.upper()=="FOLDER":
  265.             str='<A HREF="'+self.href+'">'+self.text+"</A>"          
  266.             return 'Folder: '+str+endSep
  267. class Section:
  268.     def __init__(self,text):
  269.         self.text=text.replace('\n', ' ')
  270.     def toHTML(self,endSep=''):
  271.         return "<H3>"+self.text+"</H3><hr>"
  272. def linkToLastHTML(obj,path,last,paths=None):
  273.     if paths is not None:
  274.         sep=paths.sep
  275.     else:
  276.         sep="/"
  277.     items=obj.items
  278.     last_path=last['full']
  279.     #TODO improve security here and make the code more portable.
  280.     href=urllib.quote(os.popen("realpath --relative-to='"+path+"' '"+last_path+"'").read().rstrip('\n'))
  281.     href=href+sep+"index.html"
  282.     print("href="+href)
  283.     text=last['raw_name']
  284.     linkType='FOLDER'
  285.     items.append(HTML_Link(href=href,text=text,linkType=linkType))
  286.     print(items[-1].toHTML())
  287.     #pdb.set_trace()
  288. class BookMarkParser(sgmllib.SGMLParser):
  289.     def __init__(self,**kw):
  290.         sgmllib.SGMLParser.__init__(self)
  291.         self.STATE="__init__"
  292.         self.items=[]
  293.         self.watcher=kw.get('watcher',None) #This is the main watcher, we may add others
  294.  
  295.         if self.watcher is None:
  296.             self.watcher=EncodedPathWatcher(root="/root/Downloads/pt",
  297.                 before_append=lambda path: storeList(path,self),
  298.                 after_append=lambda path: mkdir(path),
  299.                 before_pop=lambda path,paths: HTMLWriter(self,paths).writeList(path),
  300.                 after_pop=lambda path,last: \
  301.                             seq0(lambda: loadList(path,self),
  302.                                  lambda: linkToLastHTML(self,path,last)
  303.                                 )                
  304.             )
  305.             #self.watcher.enterCB=lambda dir: self.storeList(dir)
  306.         self.watchers=[self.watcher]        
  307.        
  308.  
  309.     def NotifyHeaderWatchers(self,data):
  310.         for aWatcher in self.watchers:
  311.             aWatcher.on_header(data,obj=self)
  312.  
  313.     def start_h3(self, attributes):
  314.         print('start_H3')
  315.         self.STATE='Started H3'
  316.         for name, value in attributes:
  317.             print(name+"="+value)
  318.             if (value == 'FOLDED') or (name == 'folded'):
  319.                 self.STATE='FOLDED'
  320.     def handle_data(self,data):
  321.         print('handleData')
  322.         print("self.STATE="+self.STATE)
  323.         if self.STATE=='FOLDED':
  324.             self.NotifyHeaderWatchers(data)
  325.  
  326.             #https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory-in-python
  327.  
  328.             self.STATE="Seeking DL"
  329.  
  330.         if self.STATE=="A":
  331.            
  332.             self.A.text=data
  333.         if self.STATE=="DD":
  334.             data2=data.split('\n')[0].strip()
  335.             if len(data2)>0:
  336.                 self.items.append(Section(data2))
  337.     def end_h3(self): #Probably redundant
  338.         print('end_H3')
  339.         self.FOLDED=False
  340.     def start_dl(self, atributes):
  341.         if self.STATE!="__init__":
  342.             print('start_DL')
  343.             for watcher in self.watchers:
  344.                 watcher.on_enter()  
  345.             self.items=[]
  346.     def end_dl(self):
  347.         if self.STATE!="__init__":
  348.             print('end_DL')
  349.             for watcher in self.watchers:
  350.                 watcher.on_exit()          
  351.     def start_a(self,atributes):
  352.         if self.STATE!="__init__":
  353.             print('start_A')
  354.             self.A=HTML_Link()
  355.             for key,value in atributes:
  356.                 setattr(self.A,key,value)
  357.         self.STATE='A'
  358.     def end_a(self):
  359.         if self.STATE!="__init__":
  360.             print('end_A')
  361.             self.items.append(self.A)
  362.             self.A=None
  363.             self.STATE='Ended A'
  364.     def do_dd(self, atributes):
  365.         if self.STATE!="__init__":
  366.             print('do_DD')
  367.             self.STATE="DD"
  368.     def do_dt(self, atributes):
  369.         if self.STATE!="__init__":
  370.             print('do_DT')
  371.             self.STATE="DT"
  372. class HTMLWriter:
  373.     def __init__(self,obj,paths=None,maxParents=0):
  374.         self.items=obj.items
  375.         self.paths=paths
  376.         self.maxParents=0
  377.     def writeHTMLHeader(self,f):
  378.         print('WriteHTMLHeader')
  379.         f.write("%s\n" % '<!DOCTYPE html>')
  380.         f.write("%s\n" % '<html>')
  381.         f.write("%s\n" % '<body>')
  382.     def writePath(self,f):
  383.         f.write("<b>Reverse Path:</b> ")
  384.         count=1
  385.         paths=self.paths
  386.         f.write('<a href=".">'+paths[-1]['bname']+"</a>"+" | ")
  387.         href=".."
  388.         for i in range(len(paths)-2,-1,-1):
  389.             f.write('<a href="'+href+'/index.html">'+paths[i]['bname']+"</a>")
  390.             count=count+1
  391.             if (count>= self.maxParents) and (self.maxParents!=0):
  392.                 f.write("<br>")
  393.                 break
  394.             else:
  395.                 href=href+"/.." #TODO make this more platform independent
  396.                 f.write(" |")
  397.         f.write("<hr><br>")
  398.     def writeHTMLFooter(self,f):
  399.         print('writeHTMLFooter')
  400.         f.write("%s\n" % '</body>')        
  401.         f.write("%s\n" % '</html>')  
  402.     def writeList(self,path):
  403.         print('writeList')
  404.         os.chdir(str(path))
  405.         bname=os.path.basename(path)
  406.         if bname in ():
  407.             print("writeList "+bname)
  408.             print(json.dumps(self.items,default=jdefault))  
  409.             pdb.set_trace()
  410.             debug_break=True
  411.         else:
  412.             debug_break=False
  413.         with open('index.html', 'w') as f:
  414.             self.writeHTMLHeader(f)
  415.             self.writePath(f)
  416.             for item in self.items:
  417.                 if item is not None:
  418.                     out_str="%s\n" % item.toHTML('<br>')                        
  419.                 else:
  420.                     out_str="<b>Empty Item!!!!</b>"
  421.                 f.write(out_str)
  422.                 if debug_break:
  423.                     print("out_str="+out_str)
  424.                     pdb.set_trace()
  425.             self.writeHTMLFooter(f)        
  426.        
  427. p = BookMarkParser()
  428. filename='/root/Downloads/pearltrees_export.html'
  429. f = open(filename, "r")
  430. BUFSIZE = 8192
  431. while True:
  432.     #data = f.read(BUFSIZE)
  433.     data=f.readline()
  434.     print('data='+str(data))
  435.     if not data: break
  436.     p.feed(data)
  437. p.close(  )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement