Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import hashlib
import md5
import os
import pickle
import sgmllib
import urllib
import urlparse
class HTML_Link:
    """A single bookmark anchor that can render itself as an ``<A>`` element.

    Keyword arguments (all optional):
        text:     visible link text (default ``" "``).
        href:     link target (default ``'/'``).
        add_date: value written to the ``ADD_DATE`` attribute (default ``'0'``).
        linkType: ``None`` for an ordinary bookmark, ``"FOLDER"`` (any case)
                  for a link that points at a sub-folder index.
    """

    def __init__(self, **kw):
        self.text = kw.get("text", " ")
        self.href = kw.get("href", '/')
        self.add_date = kw.get("add_date", '0')
        self.linkType = kw.get("linkType", None)

    def toHTML(self):
        """Return the HTML markup for this link.

        Folder links are rendered without ``ADD_DATE`` and prefixed with a
        bold "Folder:" label.  Every other link type -- including values of
        ``linkType`` this class does not know about -- is rendered as an
        ordinary bookmark, so the method always returns a string.
        """
        kind = self.linkType
        # str(...) lets a non-string linkType fall through to the plain
        # bookmark branch instead of raising AttributeError on .upper().
        if kind is not None and str(kind).upper() == "FOLDER":
            anchor = '<A HREF="' + self.href + '">' + self.text + "</A>"
            return '<b>Folder:</b>' + anchor
        return ('<A HREF="' + self.href + '" ADD_DATE="' + self.add_date + '">'
                + self.text + "</A>")
class Section:
    """A section heading that renders as an ``<H3>`` element."""

    def __init__(self, text):
        # Heading text, emitted verbatim between the H3 tags.
        self.text = text

    def toHTML(self):
        """Return the heading wrapped in ``<H3>...</H3>``."""
        pieces = ("<H3>", self.text, "</H3>")
        return "".join(pieces)
class SimpePathWatcher(object):
    """Minimal watcher that remembers a root directory and the last header.

    The class name keeps its original (misspelled) form so existing
    references continue to work; the correctly spelled alias
    ``SimplePathWatcher`` is defined right after the class.

    Attributes:
        root:  directory the watcher was created for.
        bname: the most recent name passed to ``on_Header`` (``None`` until
               the first header is seen).
        items: list of per-directory dicts; subclasses are expected to
               replace it.  Each entry must provide a ``'full'`` key for
               ``get_Dir`` to work.
    """

    def __init__(self, root):
        self.root = root
        self.bname = None
        # Initialised here so get_Dir() works on the base class too.
        self.items = []

    def on_Header(self, bname):
        """Record ``bname`` as the name of the folder header just seen."""
        self.bname = bname

    def on_Enter(self, bname=None, items=None, obj=None):
        """Hook called when a directory is entered; the base class ignores it.

        The optional parameters mirror the signature used by
        ``EncodedPathWatcher.on_Enter`` so both can be called the same way.
        """
        pass

    def on_Exit(self):
        """Hook called when a directory is left; the base class ignores it."""
        pass

    def get_Dir(self):
        """Return the ``'full'`` path of the innermost entry in ``self.items``.

        When no entry has been recorded yet, ``self.root`` is returned.
        """
        if not self.items:
            return self.root
        return self.items[-1]['full']


# Correctly spelled name, as used by the subclass declaration in this file.
SimplePathWatcher = SimpePathWatcher
class DictList(list):
    """A list whose ``append`` takes keyword arguments and stores them as a dict."""

    def __init__(self, *args):
        # The positional arguments themselves become the initial elements.
        list.__init__(self, args)

    def append(self, **kw):
        """Append one dict built from the given keyword arguments."""
        entry = kw
        list.append(self, entry)
def alt(d, k, a):
    """Return ``d[k]`` unless it is missing or ``None``; otherwise return ``a``.

    Falsy values other than ``None`` (``0``, ``''``, ``[]``) are kept.
    Note that ``a`` is evaluated by the caller before this function runs,
    whether or not it ends up being used.
    """
    value = d.get(k)
    return value if value is not None else a
# Character translation applied to raw folder names before URL-quoting:
# a space becomes "_" and an existing "_" becomes "~".
try:
    from string import maketrans  # Python 2 location
except ImportError:
    maketrans = str.maketrans  # Python 3 location
intab = " _"
outtab = "_~"
trantab = maketrans(intab, outtab)
def seq0(*args):
    """Call each zero-argument callable in ``args`` in order; return ``None``.

    Used to sequence several side effects where only a single expression is
    allowed (for example inside a ``lambda``).
    """
    pending = list(args)
    while pending:
        step = pending.pop(0)
        step()
class DictList_For_EncodedPathWatcher(DictList):
    """Stack of directory entries, one dict per nesting level.

    Each entry appended through ``append`` ends up with these keys:
        raw_name: the unencoded folder name.
        bname:    the name used on disk for this level.
        hasher:   a snapshot of the running hash after this level's
                  ``raw_name`` was fed into it.
        nesting:  depth of the entry (parent's depth + 1; the first entry is 1).
        full:     parent's ``full`` + ``sep`` + ``bname`` (just ``bname`` for
                  the first entry).

    Keyword arguments (all optional):
        sep:          path separator, default ``'/'``.
        MAX_PATH_LEN: default 200.  Typically the maximum path length on
                      Linux is 255, but room is left for the file name.
        MAX_NESTING:  default 0, meaning no limit on nested directories.

    NOTE(review): the original code was unfinished here; the hashing of
    ``raw_name`` (rather than ``bname``) and the depth of the first entry
    are reconstructions -- confirm against the intended design.
    """

    def __init__(self, *args, **kw):
        super(DictList_For_EncodedPathWatcher, self).__init__(*args)
        self.hasher = hashlib.md5()
        self.updateHash = self._defaultUpdateHash
        self.getDigest = lambda hasher: hasher.digest()
        self.copyHash = lambda hasher: hasher.copy()
        self.encoder = lambda raw_name: urllib.quote(raw_name.translate(trantab), safe='')
        self.sep = kw.get('sep', '/')
        self.MAX_PATH_LEN = kw.get('MAX_PATH_LEN', 200)
        self.MAX_NESTING = kw.get('MAX_NESTING', 0)
        # Hash state for "no directory entered", restored when the list empties.
        self._rootHasher = self.copyHash(self.hasher)

    @staticmethod
    def _defaultUpdateHash(hasher, data):
        """Feed a newline separator followed by ``data`` into ``hasher``."""
        if not isinstance(data, bytes):
            # Hash objects only accept byte strings.
            data = data.encode('utf-8')
        hasher.update(b"\n")
        hasher.update(data)

    def set_HashFn(self, hasher=None, updateHash=None, getDigest=None, copyHash=None):
        """Replace any of the hashing primitives; ``None`` leaves one unchanged.

        Returns ``self`` so calls can be chained.
        """
        if updateHash is not None:
            self.updateHash = updateHash
        if getDigest is not None:
            self.getDigest = getDigest
        if copyHash is not None:
            self.copyHash = copyHash
        if hasher is not None:
            # Assigned last so the snapshot uses the new copyHash, if any.
            self.hasher = hasher
            self._rootHasher = self.copyHash(hasher)
        return self

    def append(self, **kw):
        """Push a new entry, filling in every key the caller did not supply.

        Either ``bname`` or ``raw_name`` must be given.  With only ``bname``,
        it is also used as ``raw_name``; with ``raw_name`` but no ``bname``,
        ``bname`` is produced by ``self.encoder``.

        Raises:
            KeyError: if neither ``bname`` nor ``raw_name`` is supplied.
        """
        if 'raw_name' not in kw:  # raw_name is the unencoded "folder name"
            kw['raw_name'] = kw['bname']
        elif kw.get('bname') is None:
            kw['bname'] = self.encoder(kw['raw_name'])

        parent = self[-1] if len(self) else None

        if kw.get('hasher') is None:
            self.__updateMD5(kw)
        else:
            # Caller supplied the snapshot; keep the running hash in step.
            self.hasher = self.copyHash(kw['hasher'])
        if kw.get('nesting') is None:
            kw['nesting'] = (parent['nesting'] + 1) if parent is not None else 1
        if kw.get('full') is None:
            if parent is not None:
                kw['full'] = parent['full'] + self.sep + kw['bname']
            else:
                kw['full'] = kw['bname']
        super(DictList_For_EncodedPathWatcher, self).append(**kw)

    def __updateMD5(self, kw):
        """Advance the running hash by ``kw['raw_name']`` and store a snapshot."""
        self.updateHash(self.hasher, kw['raw_name'])
        kw['hasher'] = self.copyHash(self.hasher)

    def rename(self, ind=-1, **kw):
        """Overwrite the given keys of the entry at index ``ind`` (default: last)."""
        # TODO: add logic for when bname is given without the full name or vice versa.
        self[ind].update(kw)

    def pop(self):
        """Remove and return the last entry, rewinding the running hash.

        Raises:
            IndexError: if the list is empty.
        """
        removed = super(DictList_For_EncodedPathWatcher, self).pop()
        if len(self):
            self.hasher = self.copyHash(self[-1]['hasher'])
        else:
            self.hasher = self.copyHash(self._rootHasher)
        return removed

    def getFullname(self):
        """Return the ``full`` path of the last entry."""
        return self[-1]['full']
# NOTE: `SimpePathWatcher` (sic) is the spelling under which the base class is
# defined in this file.
class EncodedPathWatcher(SimpePathWatcher):
    """Watcher that tracks the current folder path as a stack of encoded names.

    Keyword arguments (all optional):
        writePath: a ``(flag, filename)`` pair, default
                   ``(False, 'encodepath.txt')``; stored as ``self.writePath``
                   and ``self.fname``.
        encoder:   callable taking a folder name and returning its encoded
                   form; defaults to ``self.defaultEncoder``.
        hashFn:    stored as ``self.hashFn``.  NOTE(review): the original
                   assignment was left unfinished, so nothing uses it yet.

    Other keyword arguments are accepted and ignored.
    """

    def __init__(self, root, **kw):
        # Called explicitly because the base class may be an old-style class,
        # for which super() does not work.
        SimpePathWatcher.__init__(self, root)
        self.items = DictList_For_EncodedPathWatcher()
        self.writePath, self.fname = kw.get('writePath', (False, 'encodepath.txt'))
        self.encoder = kw.get('encoder', self.defaultEncoder)
        self.hashFn = kw.get('hashFn', None)

    def on_Enter(self, bname=None, items=None, obj=None):
        """Push a new entry for the directory being entered.

        ``bname`` defaults to ``obj.bname`` when ``obj`` is given, otherwise
        to the name recorded by the last ``on_Header`` call.  ``items`` is
        accepted for signature compatibility and is not used.
        """
        if bname is None:
            bname = obj.bname if obj is not None else self.bname
        self.items.append(bname=bname)

    def pathFold(self, paths, **kw):
        """Fold the last entry of ``paths`` if it is too deep or too long.

        Keyword arguments:
            maxNesting: depth limit (0 = unlimited); defaults to
                        ``paths.MAX_NESTING``.
            mkLinks:    callable invoked with ``paths`` when folding is
                        required; defaults to ``self.mkLinks``.
        """
        maxNesting = kw.get('maxNesting', getattr(paths, 'MAX_NESTING', 0))
        maxPathLen = getattr(paths, 'MAX_PATH_LEN', 200)
        last = paths[-1]
        within_depth = maxNesting == 0 or last['nesting'] <= maxNesting
        within_length = len(last['full']) <= maxPathLen
        if within_depth and within_length:
            return  # No folding required.
        mkLinks = kw.get('mkLinks', self.mkLinks)
        mkLinks(paths)

    def mkLinks(self, paths):
        """Relocate the last entry to a hash-named directory directly under root.

        The new directory is named ``<bname>-id<hex digest>``.  Two symlinks
        tie it to its logical parent: ``<new>/parent`` points at the parent
        and ``<parent>/<new name>`` points at the new directory.  The entry
        is then renamed so later children are created below the new location.

        NOTE(review): reconstructed from unfinished code; the hex digest
        replaces a raw binary digest, which can contain bytes that are not
        valid in a file name.
        """
        last = paths[-1]
        sep = paths.sep
        bname = last['bname'] + "-id" + last['hasher'].hexdigest()
        new_fullpath = self.root + sep + bname
        if len(paths) > 1:
            # os.path.join leaves an already-absolute 'full' untouched.
            last_fullpath = os.path.join(self.root, paths[-2]['full'])
        else:
            last_fullpath = self.root
        if not os.path.exists(new_fullpath):
            print("Making directory " + new_fullpath)
            os.makedirs(new_fullpath)
        back_link = new_fullpath + sep + "parent"
        if not os.path.lexists(back_link):
            os.symlink(last_fullpath, back_link)
        forward_link = last_fullpath + sep + bname
        if not os.path.lexists(forward_link):
            os.symlink(new_fullpath, forward_link)
        paths.rename(-1, bname=bname, full=new_fullpath)

    def on_Exit(self):
        """Drop the entry of the directory being left."""
        self.items.pop()

    def encode(self, bname):
        """Return ``bname`` encoded with the configured encoder."""
        return self.encoder(bname)

    def defaultEncoder(self, bname):
        """Percent-encode ``bname``, leaving no character unescaped as 'safe'."""
        return urllib.quote(bname, safe='')

    def defaultSymLast(self, *args, **kw):
        """Placeholder: the original definition was left unfinished."""
        raise NotImplementedError("defaultSymLast was never implemented")
class BookMarkParser(sgmllib.SGMLParser):
    """Parse a bookmark export and mirror its folders as directories.

    Every ``<DL>`` list is written out as an ``index.html`` in the directory
    of the folder it belongs to; a folder header (``<H3 FOLDED ...>``)
    followed by a ``<DL>`` creates a sub-directory named after the
    percent-encoded header text.

    Keyword arguments (all optional):
        wd:      root output directory, default ``'/root/Downloads/pt'``.
        watcher: main watcher notified of headers and of entering/leaving
                 folders; defaults to an ``EncodedPathWatcher`` on ``wd``.

    NOTE(review): the original was mid-refactoring and did not run; the
    directory handling below is a reconstruction.  In particular the list of
    the enclosing folder is now kept on an in-memory stack (``self.DL``)
    instead of being re-read from ``list.p``, and the process working
    directory is no longer changed.
    """

    def __init__(self, **kw):
        sgmllib.SGMLParser.__init__(self)
        self.STATE = "__init__"
        self.FOLDED = False
        # Stack of (previous wd, previous items, entered_folder) per open <DL>.
        self.DL = []
        self.items = []
        self.A = None
        # (header text, directory name) of a folder header awaiting its <DL>.
        self.pendingFolder = None
        self.wd = kw.get('wd', '/root/Downloads/pt')
        self.watcher = kw.get('watcher', None)  # This is the main watcher, we may add others
        if self.watcher is None:
            self.watcher = EncodedPathWatcher(self.wd, pickle=True)
        self.watchers = [self.watcher]
        if not os.path.exists(self.wd):
            print("Making directory " + self.wd)
            os.makedirs(self.wd)

    def writeHTMLHeader(self, f):
        """Write the opening HTML boilerplate to the file object ``f``."""
        print('WriteHTMLHeader')
        f.write("%s\n" % '<!DOCTYPE html>')
        f.write("%s\n" % '<html>')
        f.write("%s\n" % '<body>')

    def writeHTMLFooter(self, f):
        """Write the closing HTML boilerplate to the file object ``f``."""
        print('writeHTMLFooter')
        f.write("%s\n" % '</body>')
        f.write("%s\n" % '</html>')

    def writeList(self):
        """Write the current items to ``index.html`` inside ``self.wd``."""
        print('writeList')
        with open(os.path.join(self.wd, 'index.html'), 'w') as f:
            self.writeHTMLHeader(f)
            for item in self.items:
                if item is not None:
                    f.write("%s\n" % item.toHTML())
                else:
                    f.write("<b>Empty Item!!!!</b>")
            self.writeHTMLFooter(f)

    def NotifyHeaderWatchers(self, data):
        """Tell every watcher that a folder header named ``data`` was seen."""
        for aWatcher in self.watchers:
            aWatcher.on_Header(data)

    def start_h3(self, attributes):
        """Handle ``<H3>``; a FOLDED attribute marks it as a folder header."""
        print('start_H3')
        self.STATE = 'Started H3'
        for name, value in attributes:
            print(name + "=" + value)
            if (value == 'FOLDED') or (name == 'folded'):
                self.STATE = 'FOLDED'

    def handle_data(self, data):
        """Route character data according to the current parser state."""
        print('handleData')
        print("self.STATE=" + self.STATE)
        if self.STATE == 'FOLDED':
            dirname = urllib.quote(data, safe='')
            # Quoted twice for the link: the directory name on disk contains
            # literal '%' characters that must themselves be escaped in a URL.
            dirname2 = urllib.quote(dirname, safe='')
            self.NotifyHeaderWatchers(data)
            self.items.append(HTML_Link(href=dirname2 + "/index.html",
                                        linkType='FOLDER', text=data))
            # The directory is only entered once the folder's <DL> starts.
            self.pendingFolder = (data, dirname)
            self.STATE = "Seeking DL"
        if self.STATE == "A":
            self.A.text = data
        if self.STATE == "DD":
            self.items.append(Section(data.split('\n')[0]))

    def end_h3(self):  # Probably redundant
        """Handle ``</H3>``."""
        print('end_H3')
        self.FOLDED = False

    def start_dl(self, atributes):
        """Handle ``<DL>``: save the enclosing list and start a fresh one.

        If a folder header is pending, its directory is created and entered
        and the watchers are notified.
        """
        print('start_DL')
        self.storeList(self.wd)
        previous_wd = self.wd
        pending = self.pendingFolder
        self.pendingFolder = None
        if pending is not None:
            raw_name, dirname = pending
            self.wd = os.path.join(self.wd, dirname)
            print("self.wd=" + self.wd)
            if not os.path.exists(self.wd):
                print("Making directory " + self.wd)
                os.makedirs(self.wd)
            print("Entering:" + str(self.wd))
            self.enterDir(raw_name)
        self.DL.append((previous_wd, self.items, pending is not None))
        self.items = []

    def storeList(self, dir):
        """Pickle the current items to ``list.p`` inside directory ``dir``."""
        print('storeList')
        with open(os.path.join(dir, "list.p"), "wb") as fh:
            pickle.dump(self.items, fh)

    def enterDir(self, bname=None):
        """Tell every watcher that the folder named ``bname`` was entered."""
        for watcher in self.watchers:
            watcher.on_Enter(bname)

    def end_dl(self):
        """Handle ``</DL>``: write this list's index and return to the parent."""
        print('end_DL')
        self.writeList()
        if not self.DL:
            return  # Unbalanced </DL>; there is nothing to return to.
        previous_wd, previous_items, entered = self.DL.pop()
        if entered:
            for watcher in self.watchers:
                watcher.on_Exit()
        self.wd = previous_wd
        self.items = previous_items

    def loadList(self):
        """Replace the current items with those pickled in ``self.wd/list.p``.

        NOTE: unpickling executes code embedded in the file; only use this on
        ``list.p`` files written by ``storeList``.
        """
        print('loadList')
        print(self.wd)
        with open(os.path.join(self.wd, "list.p"), "rb") as fh:
            self.items = pickle.load(fh)

    def start_a(self, atributes):
        """Handle ``<A>``: start a new link and copy the tag's attributes onto it."""
        print('start_A')
        self.A = HTML_Link()
        for key, value in atributes:
            setattr(self.A, key, value)
        self.STATE = 'A'

    def end_a(self):
        """Handle ``</A>``: add the finished link to the current list."""
        print('end_A')
        if self.A is not None:
            self.items.append(self.A)
        self.A = None
        self.STATE = 'Ended A'

    def do_dd(self, atributes):
        """Handle ``<DD>``: following data becomes a Section."""
        print('do_DD')
        self.STATE = "DD"

    def do_dt(self, atributes):
        """Handle ``<DT>``."""
        print('do_DT')
        self.STATE = "DT"
# Driver: feed the exported bookmark file to the parser one line at a time.
p = BookMarkParser()
filename = '/root/Downloads/pearltrees_export.html'
BUFSIZE = 8192  # kept for chunked reads (f.read(BUFSIZE)); unused when reading by line
# The with-block guarantees the input file is closed even if parsing fails.
with open(filename, "r") as f:
    for data in f:
        print('data=' + str(data))
        p.feed(data)
p.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement