Advertisement
s243a

PT Bookmark Parser Ver 2.0

Dec 10th, 2018
239
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 13.69 KB | None | 0 0
  1. import sgmllib, urllib, urlparse, os, pickle, md5, base64
  2. import pdb #For debugging
  3. #https://stackoverflow.com/questions/3031045/how-come-string-maketrans-does-not-work-in-python-3-1
  4. from string import maketrans #Import might be slightly different in python 3
  5. class HTML_Link:
  6.     def __init__(self,**kw):
  7.         self.text=kw.get("text"," ")
  8.         self.href=kw.get("href",'/')
  9.         self.add_date=kw.get("add_date",'0')
  10.         self.linkType=kw.get("linkType",None)
  11.  
  12.     def toHTML(self,endSep=''):
  13.        
  14.         if self.linkType is None:
  15.             str='<A HREF="'+self.href+'" ADD_DATE="'+self.add_date+'">'+self.text+"</A>"+endSep
  16.             return str
  17.         elif self.linkType.upper()=="FOLDER":
  18.             str='<A HREF="'+self.href+'">'+self.text+"</A>"          
  19.             return '<b>Folder:</b>'+str
  20. class Section:
  21.     def __init__(self,text):
  22.         self.text=text
  23.     def toHTML(self,endSep=''):
  24.         return "<H3>"+self.text+"</H3>"
  25. #https://stackoverflow.com/questions/9698614/super-raises-typeerror-must-be-type-not-classobj-for-new-style-class
  26. #https://stackoverflow.com/questions/9699591/instance-is-an-object-but-class-is-not-a-subclass-of-object-how-is-this-po/9699961#9699961
  27. class SimplePathWatcher(object):
  28.     def __init__(self,root):
  29.         self.root=root
  30.         self.raw_data=None
  31.     def on_Header(self,raw_data,obj=None,state=None):
  32.         if state=="More":
  33.             self.raw_data=self.raw_data+raw_data
  34.         else:
  35.             self.raw_data=raw_data
  36.     def on_Enter(self):
  37.         pass
  38.     def on_Exit(self):
  39.         pass
  40.     def get_Dir():
  41.         self.items[-1]['full']
  42. class DictList(list):
  43.     def __init__(self,*args):
  44.         super(DictList,self).__init__(args)
  45.     def append(self,**kw):
  46.         super(DictList,self).append(kw)
  47. alt_fn=lambda d,k,a: d.get(k,None) if d.get(k,None) is not None else a()
  48. def alt(d,k,a):
  49.     d[k]=alt_fn(d,k,a)
# Character translation table built with maketrans: each character of
# ``intab`` maps to the character at the same position in ``outtab``, i.e.
# a space becomes "_" and an existing "_" becomes "~".
intab = " _"
outtab = "_~"
transtab = maketrans(intab, outtab)
  53. def seq0(*args):
  54.     for arg in args:
  55.         arg()
  56.        
  57.          
  58. def setPathDefaults(obj):
  59.         obj.sep='/'
  60.         obj.MAX_PATH_LEN=200#typically is is 255 for the maxium path length in linux but we want to leave some room for the filename.
  61.         obj.MAX_NESTING=0 #0 Means no limit on the amount of nexted directories.    
  62. def getCallDel(kw,key,default,args):
  63.     fn=kw.get(key,default)
  64.     if fn is not None:
  65.         return fn(args)
  66.         del kw[key]
  67. def setKwAttr(obj,kw):
  68.     print(kw)
  69.     #https://stackoverflow.com/questions/5466618/too-many-values-to-unpack-iterating-over-a-dict-key-string-value-list
  70.     if hasattr(kw,'iteritems'): #Python 2 case
  71.         itterator=lambda kw: kw.iteritems()
  72.     else: #Python 3 case
  73.         itterator=lambda kw: kw.items()
  74.     print("kw="+str(kw))
  75.     for k,v in itterator(kw):
  76.         print("obj."+str(k)+"="+str(v))
  77.         setattr(obj,k,v)
  78.     #pdb.set_trace()
  79. def hasherDefulats():
  80.   return {\
  81.     'hasher':    md5.md5(),
  82.     'updateHash':lambda hasher,data: seq0(lambda: hasher.update("\n"),
  83.                                           lambda: hasher.update(data)),
  84.     'getDigest': lambda hasher: base64.b64encode(hasher.digest(),'~-'),
  85.     'copyHash':  lambda hasher: hasher.copy()}
  86.     #    
  87. def setHashWrapper(obj,kw,delete=True):
  88.     defaults=hasherDefulats()
  89.     for key in ('hasher','updateHash','getDigest','copyHash'):
  90.         val=kw.get(key,None)
  91.         if val is not None:
  92.             setattr(obj,key,val)
  93.             if delete==True:
  94.                 del kw[key]
  95.         else:
  96.             setattr(obj,key,defaults[key])
  97. class HashWrapper:
  98.     def __init__(self,kw,delete=True):
  99.         setHashWrapper(self,kw,delete=True)
  100.     def update(self,data):
  101.         return self.updateHash(self.hasher,data)
  102.     def digest(self):
  103.         return self.getDigest(self.hasher)
  104.     def copy(self):
  105.         kw={'hasher':self.copyHash(self.hasher),
  106.             'updateHash':self.updateHash,
  107.             'getDigest':self.getDigest,
  108.             'copyHash':self.copyHash}
  109.         return HashWrapper(kw)
  110. class DictList_For_EncodedPathWatcher(DictList):
  111.     def __init__(self,paths=[],**kw):
  112.         super(DictList,self).__init__(paths)
  113.         #setHasher(self)
  114.         self.hasher=HashWrapper(kw,delete=True)
  115.         if 'encoder' in kw.keys():
  116.             self.encoder=kw['encoder']
  117.         else:
  118.             self.encoder=lambda raw_name: urllib.quote(raw_name.translate(transtab), safe='()?-,'+"'"+'"')      
  119.         #getCallDel(kw,'setHasher',setHasher,self)
  120.         self.before_append=kw.get('before_append',lambda path: None)
  121.         self.after_append=kw.get('after_append',lambda path: None)  
  122.         self.before_pop=kw.get('before_pop',lambda path: None)
  123.         self.after_pop=kw.get('after_pop',lambda path: None)    
  124.         self.root=kw.get('root',"/root/Downloads/pt")    
  125.  
  126.         getCallDel(kw,'setPathDefaults',setPathDefaults,self)
  127.  
  128.         setKwAttr(self,kw)      
  129.         for path in paths:
  130.             self.append(path)
  131.     def append(self,raw_name,**kw):
  132.         #alt(kw,'bname',self.updateBName(kw)) I think we need this
  133.         if raw_name is not None:
  134.             kw['raw_name']=raw_name  
  135.        
  136.         print("raw_name"+str(raw_name))
  137.  
  138.         alt(kw,'bname',lambda: self.encoder(kw['raw_name']))
  139.         if len(self)>0:
  140.             hasher=self[-1]['hasher'].copy()
  141.             alt(kw,'hasher',lambda: hasher)        
  142.             alt(kw,'nesting',lambda: self[-1]['nesting']+1)        
  143.             alt(kw,'full',lambda: self[-1]['full']+self.sep+kw['bname'])
  144.         else:
  145.             hasher=self.hasher.copy()
  146.             alt(kw,'hasher',lambda: hasher)        
  147.             alt(kw,'nesting',lambda: 1)        
  148.             alt(kw,'full',lambda: self.root+self.sep+kw['bname'])      
  149.         #if kw['nesting']>1:
  150.         folded=self.pathFold(kw)
  151.         #else
  152.         #    folded=False
  153.         if len(self)>0:
  154.             self.before_append(self[-1]['full'])
  155.         else:
  156.             pass #TODO, need to think about this
  157.         super(DictList,self).append(kw)
  158.         if folded:
  159.             self.mkLinks()  
  160.         self.on_append(self[-1]['full'])
  161.     def rename(ind=-1,**kw):
  162.         for key,value in kw:
  163.             self[-1][key]=value
  164.         #TODO, add logic for if one gives the bname here without the fullname or visa-versa
  165.     def pop(self):
  166.         self.before_pop(self[-1]['full']) #No point in return a result.
  167.         super(DictList,self).pop()
  168.         self.after_pop(self[-1]['full'])
  169.         #return result #I don't think this does anytning useful.
  170.     def getFullname():
  171.         return self[-1].full                      
  172.     def pathFold(self,kw):
  173.         maxNesting=kw.get('maxNesting',self.MAX_NESTING)
  174.         root=kw.get('root',self.root)
  175.  
  176.         path=kw['full']
  177.         pathlen=len(path)
  178.         nesting=kw['nesting']
  179.         if (nesting<=maxNesting or maxNesting==0) and \
  180.            (pathlen<=self.MAX_PATH_LEN):
  181.             return False #No Folding Required.
  182.         else:
  183.             bname=kw['bname']
  184.             hasher=kw['hasher']
  185.             kw['bname']=bname+"-id"+hasher.digest()
  186.             kw['full']=self.root+self.sep+kw['bname']
  187.             return True
  188.             #mkLinks=kw.get('mkLinks',self.mkLinks)
  189.             #mkLinks(paths)  
  190.     def mkLinks(self):#new_fullpath,last_fullpath):
  191.            #dig=m.digest()
  192.            #dig2=base64.b64encode(s,'~-') #Alt chacters are tilda and dash like in freenet: https://github.com/freenet/wiki/wiki/Signed-Subspace-Key
  193.            #self.mkLinks(last.full,dig,dirname)
  194.            
  195.            new_fullpath = self[-1]['full']
  196.            last_fullpath=self[-2]['full']
  197.            bname=self[-1]['bname']
  198.            sep=self.sep
  199.            if not os.path.exists(new_fullpath):
  200.              print("Making directory "+new_fullpath)
  201.              os.makedirs(new_fullpath)
  202.            #TODO Possible bug producting duplicate paths
  203.            if not os.path.exists(new_fullpath+sep+"parent"):
  204.              os.symlink(last_fullpath,new_fullpath+sep+"parent")
  205.            print(str(last_fullpath+sep+bname))
  206.            if not os.path.exists(last_fullpath+sep+bname):
  207.              os.symlink(new_fullpath,last_fullpath+sep+bname)
  208.            #paths.rename(-1,bname=bname,full=new_fullpath)  
  209. #    def set_HashFn(hasher=None,updateHash=None,getDigest=None):
  210. #        if hasher is not None: self.hasher=hasher
  211. #        if updateHash is not None: self.updateHash=updateHash
  212. #        if getDigest is not None: self.getDigest=getDigest
  213. #        if copyHash is not None: self.copyHash=copyHash
  214. #        return self
  215.  
  216.  
  217. class EncodedPathWatcher(SimplePathWatcher):
  218.     def __init__(self,root,**kw):
  219.         #https://stackoverflow.com/questions/11179008/python-inheritance-typeerror-object-init-takes-no-parameters
  220.         super(EncodedPathWatcher,self).__init__(root)    
  221.         self.paths=DictList_For_EncodedPathWatcher([],**kw)
  222.     def on_enter(self,raw_data=None,bname=None,items=None,obj=None):
  223.         if raw_data is None:
  224.             raw_data=self.raw_data
  225.         self.paths.append(raw_data)
  226.     #def mkLinks(self,last,dirname,m)
  227.  
  228.     def on_exit(self):
  229.         self.paths.pop()
  230.     def encode(self,bname):
  231.         return self.encoder(self.bname,self)
  232.     def defaultEncoder(self,bname):
  233.         return urllib.quote(self.bname, safe='()?-,'+"'"+'"')
  234.     #def defaultSymLast(
  235. def storeList(path,getItems):
  236.         print('storeList')
  237.         os.chdir(str(path))
  238.         print(os.getcwd())
  239.         pickle.dump(getItems(),open( "list.p", "wb" ))
  240. def mkdir(path):
  241.     if not os.path.exists(str(path)):
  242.         print("Making directory "+str(path))
  243.         os.makedirs(str(path))  
  244. def loadList(path,items):
  245.     print('loadList')
  246.     print('path='+str(path))
  247.     os.chdir(str(path))
  248.     print(str(path))
  249.     print(os.getcwd())
  250.     items=pickle.load(open( "list.p", "rb" ))      
  251. class BookMarkParser(sgmllib.SGMLParser):
  252.     def __init__(self,**kw):
  253.         sgmllib.SGMLParser.__init__(self)
  254.         self.STATE="__init__"
  255.         self.items=[]
  256.         self.watcher=kw.get('watcher',None) #This is the main watcher, we may add others
  257.  
  258.         if self.watcher is None:
  259.             self.watcher=EncodedPathWatcher(root="/root/Downloads/pt",
  260.                 before_append=lambda path: storeList(path,lambda: self.items),
  261.                 on_append=lambda path: mkdir(path),
  262.                 before_pop=lambda path: HTMLWriter(self.items).writeList(path),
  263.                 after_pop=lambda path: loadList(path,self.items)
  264.             )
  265.             #self.watcher.enterCB=lambda dir: self.storeList(dir)
  266.         self.watchers=[self.watcher]        
  267.        
  268.  
  269.     def NotifyHeaderWatchers(self,data):
  270.         for aWatcher in self.watchers:
  271.             aWatcher.on_Header(data,obj=self)
  272.  
  273.     def start_h3(self, attributes):
  274.         print('start_H3')
  275.         self.STATE='Started H3'
  276.         for name, value in attributes:
  277.             print(name+"="+value)
  278.             if (value == 'FOLDED') or (name == 'folded'):
  279.                 self.STATE='FOLDED'
  280.     def handle_data(self,data):
  281.         print('handleData')
  282.         print("self.STATE="+self.STATE)
  283.         if self.STATE=='FOLDED':
  284.             self.NotifyHeaderWatchers(data)
  285.  
  286.             #https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory-in-python
  287.  
  288.             self.STATE="Seeking DL"
  289.  
  290.         if self.STATE=="A":
  291.            
  292.             self.A.text=data
  293.         if self.STATE=="DD":
  294.             self.items.append(Section(data.split('\n')[0]))
  295.     def end_h3(self): #Probably redundant
  296.         print('end_H3')
  297.         self.FOLDED=False
  298.     def start_dl(self, atributes):
  299.         if self.STATE!="__init__":
  300.             print('start_DL')
  301.             for watcher in self.watchers:
  302.                 watcher.on_enter()  
  303.             self.items=[]
  304.     def end_dl(self):
  305.         if self.STATE!="__init__":
  306.             print('end_DL')
  307.             for watcher in self.watchers:
  308.                 watcher.on_exit()          
  309.     def start_a(self,atributes):
  310.         if self.STATE!="__init__":
  311.             print('start_A')
  312.             self.A=HTML_Link()
  313.             for key,value in atributes:
  314.                 setattr(self.A,key,value)
  315.         self.STATE='A'
  316.     def end_a(self):
  317.         if self.STATE!="__init__":
  318.             print('end_A')
  319.             self.items.append(self.A)
  320.             self.A=None
  321.             self.STATE='Ended A'
  322.     def do_dd(self, atributes):
  323.         if self.STATE!="__init__":
  324.             print('do_DD')
  325.             self.STATE="DD"
  326.     def do_dt(self, atributes):
  327.         if self.STATE!="__init__":
  328.             print('do_DT')
  329.             self.STATE="DT"
  330. class HTMLWriter:
  331.     def __init__(self,items):
  332.         self.items=items
  333.     def writeHTMLHeader(self,f):
  334.         print('WriteHTMLHeader')
  335.         f.write("%s\n" % '<!DOCTYPE html>')
  336.         f.write("%s\n" % '<html>')
  337.         f.write("%s\n" % '<body>')
  338.     def writeHTMLFooter(self,f):
  339.         print('writeHTMLFooter')
  340.         f.write("%s\n" % '</body>')        
  341.         f.write("%s\n" % '</html>')  
  342.     def writeList(self,path):
  343.         print('writeList')
  344.         os.chdir(str(path))
  345.         with open('index.html', 'w') as f:
  346.             self.writeHTMLHeader(f)
  347.             for item in self.items:
  348.                 if item is not None:
  349.                     f.write("%s\n" % item.toHTML('<br>'))
  350.                 else:
  351.                     f.write("<b>Empty Item!!!!</b>")
  352.                
  353.             self.writeHTMLFooter(f)        
  354.        
  355. p = BookMarkParser()
  356. filename='/root/Downloads/pearltrees_export.html'
  357. f = open(filename, "r")
  358. BUFSIZE = 8192
  359. while True:
  360.     #data = f.read(BUFSIZE)
  361.     data=f.readline()
  362.     print('data='+str(data))
  363.     if not data: break
  364.     p.feed(data)
  365. p.close(  )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement