Advertisement
s243a

PT Bookmark Parser4

Dec 9th, 2018
259
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.85 KB | None | 0 0
import hashlib
import md5
import os
import pickle
import sgmllib
import urllib
import urlparse
  2. class HTML_Link:
  3.     def __init__(self,**kw):
  4.         self.text=kw.get("text"," ")
  5.         self.href=kw.get("href",'/')
  6.         self.add_date=kw.get("add_date",'0')
  7.         self.linkType=kw.get("linkType",None)
  8.  
  9.     def toHTML(self):
  10.        
  11.         if self.linkType is None:
  12.             str='<A HREF="'+self.href+'" ADD_DATE="'+self.add_date+'">'+self.text+"</A>"
  13.             return str
  14.         elif self.linkType.upper()=="FOLDER":
  15.             str='<A HREF="'+self.href+'">'+self.text+"</A>"          
  16.             return '<b>Folder:</b>'+str
  17. class Section:
  18.     def __init__(self,text):
  19.         self.text=text
  20.     def toHTML(self):
  21.         return "<H3>"+self.text+"</H3>"
  22. class SimpePathWatcher:
  23.     def __init__(self,root):
  24.         self.root=root
  25.         self.bname=None
  26.     def on_Header(self,bname):
  27.         self.bname=bname
  28.     def on_Enter(self):
  29.         pass
  30.     def on_Exit(self):
  31.         pass
  32.     def get_Dir():
  33.         self.items[-1]['full']
  34. class DictList(list):
  35.     def __init__(self,*args):
  36.         super(DictList,self).__init__(args)
  37.     def append(self,**kw):
  38.         super(DictList,self).append(kw)
  39. alt=lambda d,k,a: d.get(k,None) if d.get(k,None) is not None else b
  40. intab = " _"
  41. outtab = "_~"
  42. trantab = maketrans(intab, outtab)
  43. def seq0(*args):
  44.     for arg in args:
  45.         arg()
  46. class DictList_For_EncodedPathWatcher(DictList):
  47.     def __init__(self,*args):
  48.         super(DictList,self).__init__(args)
  49.         self.hasher=md5()
  50.         self.updateHash=lambda hasher,data: seq0(lambda: hasher.update("\n"),
  51.                                                 lambda: hasher.update(data))
  52.         self.getDigest=lambda hasher: hasher.digest()
  53.         self.copyHash=lambda hasher: hasher.copy()
  54.         self.encoder=lambda raw_name: urllib.quote(raw_name.translate(transtab), safe='')
  55.         self.sep=kw.get('sep','/')
  56.         self.MAX_PATH_LEN=kw.get('MAX_PATH_LEN',200)#typically is is 255 for the maxium path length in linux but we want to leave some room for the filename.
  57.         self.MAX_NESTING=KW.GET('MAX_NESTING',0) #0 Means no limit on the amount of nexted directories.
  58.     def set_HashFn(hasher=None,updateHash=None,getDigest=None):
  59.         if hasher is not None: self.hasher=hasher
  60.         if updateHash is not None: self.updateHash=updateHash
  61.         if getDigest is not None: self.getDigest=getDigest
  62.         if copyHash is not None: self.copyHash=copyHash
  63.         return self
  64.     def append(self,**kw):
  65.         #alt(kw,'bname',self.updateBName(kw)) I think we need this
  66.         if 'raw_name' not in d: #Rawname is the unencoded "folder name"
  67.             kw['raw_name']=kw['bname']
  68.         else:
  69.             alt(kw,'bname',self.encoder(kw['raw_name'])
  70.         kw['hasher']=alt(kw,'hasher',self.updateHash(self[-1]['hasher'],kw['raw_name'))        
  71.         kw['nesting']=alt(kw,'nesting',self[-1]+1)        
  72.         kw['full']=alt(kw,'full',self[-1].full+self.sep+kw['bname'])
  73.  
  74.         super(DictList,self).append(kw)  
  75.     def __updateMD5(hasher,kw):
  76.         self.updateHash(self.hasher,'\n')
  77.         self.updatehash(self.hasher,kw['bname'])
  78.         kw['hasher']=self.copyHash(self.hasher,hasher)
  79.     def rename(ind=-1,**kw):
  80.         for key,value in kw
  81.             self[-1][key]=value
  82.         #TODO, add logic for if one gives the bname here without the fullname or visa-versa
  83.     def pop():
  84.         super(DictList,self).pop()
  85.         self.hasher=self.copyHash(self[-1][hasher])
  86.     def getFullname():
  87.         return self[-1].full
  88.     def
  89. class EncodedPathWatcher(SimplePathWatcher):
  90.     def __init__(self,root,**kw):
  91.         super(SimplePathWatcher,self).__init__(root)                
  92.         self.items=DictList_For_EncodedPathWatcher()
  93.         #self.sep=kw.get('seperator','/')
  94.         self.writePath,self.fname=kw.get('writePath',(False,encodepath.txt))
  95.         #self.encoder=kw.get('encoder',self.defaultEncoder)
  96.         #self.symLastCB=kw.get('symLast',defualtSymLast)
  97.         #self.SymLastWhen=kw.get('SymLastWhen','SOMETIMES')
  98.         #self.MAX_PATH_LEN=kw.get('MAX_PATH_LEN',200)#typically is is 255 for the maxium path length in linux but we want to leave some room for the filename.
  99.         #self.MAX_NESTING=KW.GET('MAX_NESTING',0) #0 Means no limit on the amount of nexted directories.
  100.         self.hashFn=
  101.     def on_Enter(self,bname=None,items=None,obj=None):
  102.         if bname is None: bname=obj.bname #probably not useful and can be ommited
  103.         if items is None: items=obj.items #probably not useful and can be ommited
  104.  
  105.         self.items.append(bname=self.bname)
  106.     def pathFold(self,paths,**kw):
  107.         maxNesting=kw.get('maxNesting',self.MAX_NESTING)
  108.         root=kw.get('root',self.root)
  109.  
  110.         path=paths[-1].full: pathlen=len(path)
  111.         nesting=paths[-1]['nesting']
  112.         if (nesting<=maxNesting or maxNesting==0) and \
  113.            (pathlen<=self.MAX_PATH_LEN):
  114.             pass #No Folding Required.
  115.         else
  116.             mkLinks=kw.get('mkLinks',self.mkLinks)
  117.             mkLinks(paths)
  118.      
  119.    
  120.     #def mkLinks(self,last,dirname,m)
  121.     def mkLinks(self,paths)
  122.            #dig=m.digest()
  123.            #dig2=base64.b64encode(s,'~-') #Alt chacters are tilda and dash like in freenet: https://github.com/freenet/wiki/wiki/Signed-Subspace-Key
  124.            #self.mkLinks(last.full,dig,dirname)
  125.            bname=dirname+"-id"+paths['hasher'].digest()
  126.            new_fullpath=self.root+paths.sep+dirname2
  127.            last_fullpath=paths[-2].full
  128.            sep=paths.sep
  129.            if not os.path.exists(newdir):
  130.              print("Making directory "+new_fullpath)
  131.              os.makedirs(new_fullpath)
  132.            os.symlink(last_fullpath,new_fullpath+sep+"parent")
  133.            os.symlink(newdir,last_fullpath+sep+bname)
  134.            paths.rename(-1,bname=bname,full=new_fullpath)    
  135.     def on_Exit(self):
  136.         self.items.pop()
  137.     def encode(self,bname):
  138.         return self.encoder(self.bname,self)
  139.     def defaultEncoder(self,bname):
  140.         return urllib.quote(self.bname, safe='')
  141.     def defaultSymLast(
  142. class BookMarkParser(sgmllib.SGMLParser):
  143.     def __init__(self,**kw):
  144.         sgmllib.SGMLParser.__init__(self)
  145.         self.STATE="__init__"
  146.         self.DL=[]
  147.         self.items=[]
  148.         #self.wd=kw.get('wd','/root/Downloads/pt') #lets put this info in a watcher instead
  149.         self.watcher=kw.get('watcher',None) #This is the main watcher, we may add others
  150.  
  151.         if watcher is None:
  152.             self.watcher=EncodedPathWatcher(/root/Downloads/pt,
  153.                                             pickle=True) #Pickle stores binary representation of times when leaving directory
  154.         self.watchers=[self.watcher]        
  155.         if not os.path.exists(self.wd):
  156.             print("Making directory "+self.wd)
  157.             os.makedirs(self.wd)
  158.        
  159.     def writeHTMLHeader(self,f):
  160.         print('WriteHTMLHeader')
  161.         f.write("%s\n" % '<!DOCTYPE html>')
  162.         f.write("%s\n" % '<html>')
  163.         f.write("%s\n" % '<body>')
  164.     def writeHTMLFooter(self,f):
  165.         print('writeHTMLFooter')
  166.         f.write("%s\n" % '</body>')        
  167.         f.write("%s\n" % '</html>')  
  168.     def writeList(self):
  169.         print('writeList')
  170.         with open('index.html', 'w') as f:
  171.             self.writeHTMLHeader(f)
  172.             for item in self.items:
  173.                 if item is not None:
  174.                     f.write("%s\n" % item.toHTML())
  175.                 else:
  176.                     f.write("<b>Empty Item!!!!</b>")
  177.                
  178.             self.writeHTMLFooter(f)
  179.     def NotifyHeaderWatchers(data)
  180.         for aWatcher in self.watchers:
  181.             aWatcher.onHeader(data,self)
  182.         #The watcher might not use the callback self.storeList (e.g. storring stuff on a remote directory)
  183.         #self.watcher.storeList(self.items,lambda x: self.storeList(x))
  184.  
  185.     def start_h3(self, attributes):
  186.         print('start_H3')
  187.         self.STATE='Started H3'
  188.         for name, value in attributes:
  189.             print(name+"="+value)
  190.             if (value == 'FOLDED') or (name == 'folded'):
  191.                 self.STATE='FOLDED'
  192.     def handle_data(self,data):
  193.         print('handleData')
  194.         print("self.STATE="+self.STATE)
  195.         if self.STATE=='FOLDED':
  196.             #dirname = urllib.quote(data, safe='')
  197.             #dirname2= urllib.quote(dirname, safe='')
  198.             self.NotifyHeaderWatchers(data)
  199.             #self.items.append(HTML_Link(href=dirname2+"/index.html",linkType='FOLDER',text=data))
  200.  
  201.             self.wd=os.path.join(self.wd,dirname)
  202.             print("self.wd="+self.wd)
  203.             #https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory-in-python
  204.             if not os.path.exists(self.wd):
  205.                 print("Making directory "+self.wd)
  206.                 os.makedirs(self.wd)
  207.             print("Entering:" + str(self.wd))
  208.             self.STATE="Seeking DL"
  209.  
  210.         if self.STATE=="A":
  211.            
  212.             self.A.text=data
  213.         if self.STATE=="DD":
  214.             self.items.append(Section(data.split('\n')[0]))
  215.     def end_h3(self): #Probably redundant
  216.         print('end_H3')
  217.         self.FOLDED=False
  218.     def start_dl(self, atributes):
  219.         print('start_DL')
  220.         for item in self.items
  221.             item.on_enter(lambda dir: self.storeList(dir))
  222.         self.storeList()        
  223.         self.items=[]
  224.         #print(self.DL)
  225.         #print(self.wd)
  226.         self.enterDir()
  227.         #
  228.     def storeList(self,dir):
  229.         print('storeList')
  230.         os.chdir(dir)
  231.         print(os.getcwd())
  232.         pickle.dump(self.items,open( "list.p", "wb" ))
  233.  
  234.     def enterDir():
  235.         for watcher in self.watchers:
  236.             watcher.on_Enter(bname,self)
  237.         #self.DL.append(os.path.basename(self.wd)) #Maybe append the full path here instead of the basename    
  238.  
  239.     def end_dl(self):
  240.         print('end_DL')
  241.         self.writeList()
  242.         self.DL.pop
  243.         self.wd=os.path.join(self.wd,"..")
  244.         self.loadList()
  245.     def loadList(self):
  246.         print('loadList')
  247.         os.chdir(self.wd)
  248.         print(self.wd)
  249.         print(os.getcwd())
  250.         self.items=pickle.load(open( "list.p", "rb" ))        
  251.     def start_a(self,atributes):
  252.         print('start_A')
  253.         self.A=HTML_Link()
  254.         for key,value in atributes:
  255.            setattr(self.A,key,value)
  256.         self.STATE='A'
  257.     def end_a(self):
  258.         print('end_A')
  259.         self.items.append(self.A)
  260.         self.A=None
  261.         self.STATE='Ended A'
  262.     def do_dd(self, atributes):
  263.         print('do_DD')
  264.         self.STATE="DD"
  265.     def do_dt(self, atributes):
  266.         print('do_DT')
  267.         self.STATE="DT"
  268. p = BookMarkParser()
  269. filename='/root/Downloads/pearltrees_export.html'
  270. f = open(filename, "r")
  271. BUFSIZE = 8192
  272. while True:
  273.     #data = f.read(BUFSIZE)
  274.     data=f.readline()
  275.     print('data='+str(data))
  276.     if not data: break
  277.     p.feed(data)
  278. p.close(  )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement