Advertisement
s243a

PT Bookmark Parser3

Dec 7th, 2018
297
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.21 KB | None | 0 0
  1. import sgmllib, urllib, urlparse, os, pickle, md5
  2. class HTML_Link:
  3.     def __init__(self,**kw):
  4.         self.text=kw.get("text"," ")
  5.         self.href=kw.get("href",'/')
  6.         self.add_date=kw.get("add_date",'0')
  7.         self.linkType=kw.get("linkType",None)
  8.  
  9.     def toHTML(self):
  10.        
  11.         if self.linkType is None:
  12.             str='<A HREF="'+self.href+'" ADD_DATE="'+self.add_date+'">'+self.text+"</A>"
  13.             return str
  14.         elif self.linkType.upper()=="FOLDER":
  15.             str='<A HREF="'+self.href+'">'+self.text+"</A>"          
  16.             return '<b>Folder:</b>'+str
  17. class Section:
  18.     def __init__(self,text):
  19.         self.text=text
  20.     def toHTML(self):
  21.         return "<H3>"+self.text+"</H3>"
  22. class SimpePathWatcher:
  23.     __init__(self,root):
  24.         self.root=root
  25.         self.bname=None
  26.     def On_Header(self,bname)
  27.         self.bname=bname
  28.     def On_Enter(self):
  29.         pass
  30.     def On_Exit(self):
  31.         pass
  32.     def get_Dir()
  33.         self.items[-1]['full']
  34. class DictList(list):
  35.     __init__(*args):
  36.         super(DictList,args).__init__(for arg in `args)
  37.     def append(**kw):
  38.         super(DictList,args).append(kw)
  39.    
  40. class EncodedPathWatcher(SimplePathWatcher):
  41.     __init__(self,root,**kw):
  42.         super(SimplePathWatcher,self).__init__(root)                
  43.         self.items=DictList()
  44.         self.sep=kw.get('seperator','/')
  45.         self.writePath,self.fname=kw.get('writePath',(False,encodepath.txt))
  46.         self.encoder=kw.get('encoder',self.defaultEncoder)
  47.         self.symLastCB=kw.get('symLast',defualtSymLast)
  48.         self.SymLastWhen=kw.get('SymLastWhen','SOMETIMES')
  49.         self.MAX_PATH_LEN=kw.get('MAX_PATH_LEN',200)#typically is is 255 for the maxium path length in linux but we want to leave some room for the filename.
  50.         self.MAX_NESTING=KW.GET('MAX_NESTING',0) #0 Means no limit on the amount of nexted directories.
  51.         self.hashFn=
  52.     def On_Enter(self):
  53.         dirname = self.encode(self.bname)#Self.bname is from superclass (i.e. SimplePathWatcher)
  54.         #dirname2= urllib.quote(dirname, safe='') #Maybe use this if we want to write links
  55.  
  56.         if self.item.len()=0:
  57.             m=md5.new()
  58.             pathLen=0
  59.             nesting=0
  60.         else
  61.             last=self.items[-1]
  62.             pathLen=last['pathLen']
  63.             nesting=last.nesting+1
  64.             m=last['md4'].copy()
  65.         m.update("\n")
  66.         m.update(self.bname) #We're hashing the unencoded name.        
  67.         if (nesting<=self.MAX_NESTING or self.MAX_NESTING==0) and \
  68.            (pathlen<=self.MAX_PATH_LEN):
  69.             if self.item.len()==0:
  70.                 fullpath=self.root+sep+dirname
  71.             else:      
  72.                 fullpath=last['full']+self.sep+dirname
  73.             self.items.append({'bname':dirname,
  74.                                'full':fullpath,
  75.                                'nesting':last['nesting']+1,
  76.                                'md5':m)
  77.         else
  78.             (dirname2,newdir)=mkLinks(last,dirname,m)
  79.             self.items.append({'bname':dirname2,
  80.                                'full':newdir,
  81.                                'nesting':last.nesting+1,
  82.                                'pathLen':fullpath.len()
  83.                                'md5':m})          
  84.     def mkLinks(self,last,dirname,m)
  85.            dig=m.digest()
  86.            dig2=base64.b64encode(s,'~-') #Alt chacters are tilda and dash like in freenet: https://github.com/freenet/wiki/wiki/Signed-Subspace-Key
  87.            self.mkLinks(last.full,dig,dirname)
  88.            dirname2=dirname+"-md5"+dig2
  89.            newdir=self.root+self.sep+dirname2
  90.            if not os.path.exists(newdir):
  91.              print("Making directory "+self.wd)
  92.              os.makedirs(newdir)
  93.            os.symlink(last.full,newdir)
  94.            os.symlink(newdir,last.full+self.sep+dirname)
  95.            return dirname2,newdir)        
  96.     def On_Exit(self):
  97.         self.items.pop()
  98.     def encode(self,bname):
  99.         return self.encoder(self.bname,self)
  100.     def defaultEncoder(self,bname):
  101.         return urllib.quote(self.bname, safe='')
  102.     def defaultSymLast(
  103. class BookMarkParser(sgmllib.SGMLParser):
  104.     def __init__(self,**kw):
  105.         sgmllib.SGMLParser.__init__(self)
  106.         self.STATE="__init__"
  107.         self.DL=[]
  108.         self.items=[]
  109.         #self.wd=kw.get('wd','/root/Downloads/pt') #lets put this info in a watcher instead
  110.         self.watcher=kw.get('watcher',None)
  111.         if watcher is None:
  112.             self.watcher=EncodedPathWatcher(/root/Downloads/pt)
  113.         if not os.path.exists(self.wd):
  114.             print("Making directory "+self.wd)
  115.             os.makedirs(self.wd)
  116.        
  117.     def writeHTMLHeader(self,f):
  118.         print('WriteHTMLHeader')
  119.         f.write("%s\n" % '<!DOCTYPE html>')
  120.         f.write("%s\n" % '<html>')
  121.         f.write("%s\n" % '<body>')
  122.     def writeHTMLFooter(self,f):
  123.         print('writeHTMLFooter')
  124.         f.write("%s\n" % '</body>')        
  125.         f.write("%s\n" % '</html>')  
  126.     def writeList(self):
  127.         print('writeList')
  128.         with open('index.html', 'w') as f:
  129.             self.writeHTMLHeader(f)
  130.             for item in self.items:
  131.                 if item is not None:
  132.                     f.write("%s\n" % item.toHTML())
  133.                 else:
  134.                     f.write("<b>Empty Item!!!!</b>")
  135.                
  136.             self.writeHTMLFooter(f)
  137.     def storeList(self):
  138.         print('storeList')
  139.         os.chdir(self.wd)
  140.         print(os.getcwd())
  141.         self.wd=os.getcwd()
  142.         pickle.dump(self.items,open( "list.p", "wb" ))
  143.     def loadList(self):
  144.         print('loadList')
  145.         os.chdir(self.wd)
  146.         print(self.wd)
  147.         print(os.getcwd())
  148.         self.items=pickle.load(open( "list.p", "rb" ))
  149.     def start_h3(self, attributes):
  150.         print('start_H3')
  151.         self.STATE='Started H3'
  152.         for name, value in attributes:
  153.             print(name+"="+value)
  154.             if (value == 'FOLDED') or (name == 'folded'):
  155.                 self.STATE='FOLDED'
  156.     def handle_data(self,data):
  157.         print('handleData')
  158.         print("self.STATE="+self.STATE)
  159.         if self.STATE=='FOLDED':
  160.             #dirname = urllib.quote(data, safe='')
  161.             #dirname2= urllib.quote(dirname, safe='')
  162.             self.NotifyHeaderWatchers(data)
  163.             self.items.append(HTML_Link(href=dirname2+"/index.html",linkType='FOLDER',text=data))
  164.             self.storeList()
  165.             self.wd=os.path.join(self.wd,dirname)
  166.             print("self.wd="+self.wd)
  167.             #https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory-in-python
  168.             if not os.path.exists(self.wd):
  169.                 print("Making directory "+self.wd)
  170.                 os.makedirs(self.wd)
  171.             print("Entering:" + str(self.wd))
  172.             self.STATE="Seeking DL"
  173.  
  174.         if self.STATE=="A":
  175.            
  176.             self.A.text=data
  177.         if self.STATE=="DD":
  178.             self.items.append(Section(data.split('\n')[0]))
  179.     def end_h3(self): #Probably redundant
  180.         print('end_H3')
  181.         self.FOLDED=False        
  182.     def start_dl(self, atributes):
  183.         print('start_DL')
  184.         self.items=[]
  185.         print(self.DL)
  186.         print(self.wd)
  187.         self.DL.append(os.path.basename(self.wd)) #Maybe append the full path here instead of the basename
  188.     def end_dl(self):
  189.         print('end_DL')
  190.         self.writeList()
  191.         self.DL.pop
  192.         self.wd=os.path.join(self.wd,"..")
  193.         self.loadList()
  194.        
  195.     def start_a(self,atributes):
  196.         print('start_A')
  197.         self.A=HTML_Link()
  198.         for key,value in atributes:
  199.            setattr(self.A,key,value)
  200.         self.STATE='A'
  201.     def end_a(self):
  202.         print('end_A')
  203.         self.items.append(self.A)
  204.         self.A=None
  205.         self.STATE='Ended A'
  206.     def do_dd(self, atributes):
  207.         print('do_DD')
  208.         self.STATE="DD"
  209.     def do_dt(self, atributes):
  210.         print('do_DT')
  211.         self.STATE="DT"
  212. p = BookMarkParser()
  213. filename='/root/Downloads/pearltrees_export.html'
  214. f = open(filename, "r")
  215. BUFSIZE = 8192
  216. while True:
  217.     #data = f.read(BUFSIZE)
  218.     data=f.readline()
  219.     print('data='+str(data))
  220.     if not data: break
  221.     p.feed(data)
  222. p.close(  )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement