Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sgmllib, urllib, urlparse, os, pickle, md5, base64
- import pdb #For debugging
- #https://stackoverflow.com/questions/3031045/how-come-string-maketrans-does-not-work-in-python-3-1
- from string import maketrans #Import might be slightly different in python 3
class HTML_Link:
    """A single bookmark link that can render itself as an HTML anchor.

    Keyword arguments (all optional):
        text: visible link text (default ``" "``).
        href: link target (default ``'/'``).
        add_date: value written into the ``ADD_DATE`` attribute (default ``'0'``).
        linkType: ``None`` for an ordinary link; ``"folder"`` (any letter case)
            renders the link with a bold ``Folder:`` prefix.
    """

    def __init__(self, **kw):
        self.text = kw.get("text", " ")
        self.href = kw.get("href", '/')
        self.add_date = kw.get("add_date", '0')
        self.linkType = kw.get("linkType", None)

    def toHTML(self, endSep=''):
        """Return the HTML markup for this link.

        ``endSep`` is appended after an ordinary link; the folder form does
        not use it.
        """
        link_type = self.linkType
        if link_type is not None and link_type.upper() == "FOLDER":
            anchor = '<A HREF="' + self.href + '">' + self.text + "</A>"
            return '<b>Folder:</b>' + anchor
        # Ordinary link. An unrecognised linkType also lands here instead of
        # falling off the end of the method and returning None.
        return ('<A HREF="' + self.href + '" ADD_DATE="' + self.add_date + '">'
                + self.text + "</A>" + endSep)
class Section:
    """A heading entry that renders as an ``<H3>`` element."""

    def __init__(self, text):
        self.text = text

    def toHTML(self, endSep=''):
        """Return ``<H3>text</H3>``; ``endSep`` is accepted but not used."""
        pieces = ("<H3>", self.text, "</H3>")
        return "".join(pieces)
- #https://stackoverflow.com/questions/9698614/super-raises-typeerror-must-be-type-not-classobj-for-new-style-class
- #https://stackoverflow.com/questions/9699591/instance-is-an-object-but-class-is-not-a-subclass-of-object-how-is-this-po/9699961#9699961
class SimplePathWatcher(object):
    """Minimal watcher that remembers the most recent header text.

    ``root`` is stored as given. ``raw_data`` holds the last header text
    received through ``on_Header`` (``None`` until the first call).
    """

    def __init__(self, root):
        self.root = root
        self.raw_data = None

    def on_Header(self, raw_data, obj=None, state=None):
        """Record header text.

        With ``state == "More"`` the text is appended to what is already
        buffered; otherwise it replaces the buffer. ``obj`` is not used here.
        """
        if state == "More" and self.raw_data is not None:
            self.raw_data = self.raw_data + raw_data
        else:
            # Also covers "More" arriving before any header was buffered,
            # which used to fail on ``None + raw_data``.
            self.raw_data = raw_data

    def on_Enter(self):
        """Hook with no behaviour in this base class."""
        pass

    def on_Exit(self):
        """Hook with no behaviour in this base class."""
        pass

    def get_Dir(self):
        """Return the ``'full'`` value of the last entry in ``self.items``.

        NOTE(review): this class never assigns ``self.items``; presumably a
        subclass or the caller provides it -- confirm before relying on this.
        """
        return self.items[-1]['full']
class DictList(list):
    """A list whose ``append`` takes keyword arguments and stores them as a dict."""

    def __init__(self, *entries):
        # The positional arguments become the initial elements.
        base = super(DictList, self)
        base.__init__(entries)

    def append(self, **fields):
        """Append the keyword arguments as one dict element."""
        base = super(DictList, self)
        base.append(fields)
def alt_fn(d, k, a):
    """Return ``d[k]`` unless it is missing or None, in which case return ``a()``."""
    current = d.get(k, None)
    if current is None:
        return a()
    return current


def alt(d, k, a):
    """Store ``alt_fn(d, k, a)`` back into ``d[k]``."""
    chosen = alt_fn(d, k, a)
    d[k] = chosen
# Character translation table: a space maps to "_" and a literal "_" maps
# to "~" (position i of intab is replaced by position i of outtab).
intab, outtab = " _", "_~"
transtab = maketrans(intab, outtab)
def seq0(*args):
    """Call every argument in order with no parameters; results are discarded."""
    for thunk in args:
        thunk()
def setPathDefaults(obj):
    """Set the default path-handling attributes on ``obj``."""
    defaults = (
        # Path component separator.
        ('sep', '/'),
        # Typically the maximum path length on Linux is 255, but we want to
        # leave some room for the filename.
        ('MAX_PATH_LEN', 200),
        # 0 means no limit on the number of nested directories.
        ('MAX_NESTING', 0),
    )
    for attr_name, attr_value in defaults:
        setattr(obj, attr_name, attr_value)
def getCallDel(kw, key, default, args):
    """Remove ``key`` from ``kw`` and call the function found there.

    The function is ``kw[key]`` when the key is present, otherwise
    ``default``. If the resulting function is not None it is called with
    ``args`` as its single argument and its result is returned; otherwise
    None is returned.

    The key is always removed from ``kw`` (when present). Previously the
    early ``return`` skipped the deletion, and a missing key with a None
    default raised KeyError.
    """
    fn = kw.pop(key, default)
    if fn is None:
        return None
    return fn(args)
def setKwAttr(obj, kw):
    """Copy every key/value pair of ``kw`` onto ``obj`` as attributes.

    Prints the mapping and each assignment as it goes.
    """
    print(kw)
    # https://stackoverflow.com/questions/5466618/too-many-values-to-unpack-iterating-over-a-dict-key-string-value-list
    # Python 2 mappings expose iteritems(); Python 3 ones only have items().
    if hasattr(kw, 'iteritems'):
        pairs = kw.iteritems
    else:
        pairs = kw.items
    print("kw=" + str(kw))
    for attr_name, attr_value in pairs():
        print("obj." + str(attr_name) + "=" + str(attr_value))
        setattr(obj, attr_name, attr_value)
def hasherDefulats():
    """Return the default hashing callbacks as a dict.

    Keys:
        'hasher': a new ``md5.md5()`` object.
        'updateHash': feeds "\\n" and then the data into a hasher.
        'getDigest': base64-encodes the hasher digest with ``'~-'`` as the
            alternative characters.
        'copyHash': returns ``hasher.copy()``.
    """
    def _update(hasher, data):
        # A newline is hashed ahead of every piece of data.
        hasher.update("\n")
        hasher.update(data)

    def _digest(hasher):
        return base64.b64encode(hasher.digest(), '~-')

    def _copy(hasher):
        return hasher.copy()

    return {
        'hasher': md5.md5(),
        'updateHash': _update,
        'getDigest': _digest,
        'copyHash': _copy,
    }
- #
def setHashWrapper(obj, kw, delete=True):
    """Install the four hashing callbacks on ``obj``.

    For each of 'hasher', 'updateHash', 'getDigest' and 'copyHash', a
    non-None value in ``kw`` is used; otherwise the value from
    ``hasherDefulats()`` is used. When ``delete`` equals True, keys taken
    from ``kw`` are removed from it.
    """
    fallback = hasherDefulats()
    for name in ('hasher', 'updateHash', 'getDigest', 'copyHash'):
        supplied = kw.get(name, None)
        if supplied is None:
            setattr(obj, name, fallback[name])
            continue
        setattr(obj, name, supplied)
        # Equality comparison kept as in the original, so only values equal
        # to True trigger the deletion.
        if delete == True:
            del kw[name]
class HashWrapper:
    """Bundle a hasher object with the callbacks that operate on it.

    The attributes ``hasher``, ``updateHash``, ``getDigest`` and ``copyHash``
    are installed by ``setHashWrapper`` from ``kw``.
    """

    def __init__(self, kw, delete=True):
        # Forward the caller's choice; it used to be hard-coded to True,
        # which made the ``delete`` parameter meaningless.
        setHashWrapper(self, kw, delete=delete)

    def update(self, data):
        """Feed ``data`` to the hasher through ``updateHash``."""
        return self.updateHash(self.hasher, data)

    def digest(self):
        """Return ``getDigest`` applied to the hasher."""
        return self.getDigest(self.hasher)

    def copy(self):
        """Return a new HashWrapper around ``copyHash(hasher)`` with the same callbacks."""
        kw = {'hasher': self.copyHash(self.hasher),
              'updateHash': self.updateHash,
              'getDigest': self.getDigest,
              'copyHash': self.copyHash}
        return HashWrapper(kw)
class DictList_For_EncodedPathWatcher(DictList):
    """Stack of path entries, one dict per directory level.

    ``append`` builds an entry dict holding ``bname`` (encoded name),
    ``hasher``, ``nesting`` and ``full`` (full path), plus ``raw_name`` when
    one was given, and fires the configured hooks around each push and pop.

    Keyword arguments to the constructor:
        encoder: ``raw_name -> bname`` function; the default translates the
            name with ``transtab`` and URL-quotes it.
        before_append / on_append / after_append / before_pop / after_pop:
            one-argument hooks called with a full path; each defaults to a
            no-op.
        root: base directory (default ``"/root/Downloads/pt"``).
        hasher / updateHash / getDigest / copyHash: consumed by HashWrapper.
        setPathDefaults: function called with this object to set ``sep``,
            ``MAX_PATH_LEN`` and ``MAX_NESTING``.
    Any keyword left in ``kw`` is also set as an attribute by ``setKwAttr``.
    """

    def __init__(self, paths=None, **kw):
        # Start empty. The list used to be seeded with the raw names and then
        # re-appended below, so append() ended up indexing into a string.
        super(DictList, self).__init__()
        self.hasher = HashWrapper(kw, delete=True)
        if 'encoder' in kw:
            self.encoder = kw['encoder']
        else:
            self.encoder = lambda raw_name: urllib.quote(raw_name.translate(transtab), safe='()?-,' + "'" + '"')
        self.before_append = kw.get('before_append', lambda path: None)
        self.after_append = kw.get('after_append', lambda path: None)
        # append() calls on_append unconditionally, so it needs a default.
        self.on_append = kw.get('on_append', lambda path: None)
        self.before_pop = kw.get('before_pop', lambda path: None)
        self.after_pop = kw.get('after_pop', lambda path: None)
        self.root = kw.get('root', "/root/Downloads/pt")
        getCallDel(kw, 'setPathDefaults', setPathDefaults, self)
        setKwAttr(self, kw)
        # ``paths=None`` replaces the shared mutable default ``[]``.
        for path in (paths or ()):
            self.append(path)

    def append(self, raw_name, **kw):
        """Push a new entry for ``raw_name``.

        Keys already present (and not None) in ``kw`` are kept; missing ones
        are derived from the previous entry, or from ``root`` when the stack
        is empty. ``before_append`` receives the previous entry's path;
        ``on_append`` and then ``after_append`` receive the new entry's path.
        """
        if raw_name is not None:
            kw['raw_name'] = raw_name
        print("raw_name" + str(raw_name))
        alt(kw, 'bname', lambda: self.encoder(kw['raw_name']))
        # NOTE(review): the copied hasher is never updated with this entry's
        # name anywhere in this class -- confirm whether an update() call
        # was intended here.
        if len(self) > 0:
            parent = self[-1]
            hasher = parent['hasher'].copy()
            alt(kw, 'hasher', lambda: hasher)
            alt(kw, 'nesting', lambda: parent['nesting'] + 1)
            alt(kw, 'full', lambda: parent['full'] + self.sep + kw['bname'])
        else:
            hasher = self.hasher.copy()
            alt(kw, 'hasher', lambda: hasher)
            alt(kw, 'nesting', lambda: 1)
            alt(kw, 'full', lambda: self.root + self.sep + kw['bname'])
        folded = self.pathFold(kw)
        if len(self) > 0:
            self.before_append(self[-1]['full'])
        # TODO: decide what before_append should receive for the first entry.
        super(DictList, self).append(kw)
        if folded:
            self.mkLinks()
        self.on_append(self[-1]['full'])
        # after_append was stored by __init__ but never invoked.
        self.after_append(self[-1]['full'])

    def rename(self, ind=-1, **kw):
        """Overwrite keys of the entry at index ``ind`` with the given values."""
        # TODO: add logic for when bname is given without full, or vice versa.
        for key, value in kw.items():
            self[ind][key] = value

    def pop(self):
        """Remove and return the last entry, firing the pop hooks.

        ``before_pop`` receives the path of the entry being removed.
        ``after_pop`` receives the path of the new last entry and is skipped
        when the stack became empty (it used to raise IndexError there).
        """
        self.before_pop(self[-1]['full'])
        removed = super(DictList, self).pop()
        if len(self) > 0:
            self.after_pop(self[-1]['full'])
        return removed

    def getFullname(self):
        """Return the full path of the last entry."""
        return self[-1]['full']

    def pathFold(self, kw):
        """Relocate an entry under the root when it is too deep or too long.

        Returns False when nesting is within ``maxNesting`` (0 means no
        limit) and the path length is within ``MAX_PATH_LEN``. Otherwise the
        entry's ``bname`` gets ``"-id" + digest`` appended, ``full`` is
        rebuilt directly under the root, and True is returned.
        """
        maxNesting = kw.get('maxNesting', self.MAX_NESTING)
        root = kw.get('root', self.root)
        nesting = kw['nesting']
        within_nesting = (maxNesting == 0) or (nesting <= maxNesting)
        within_length = len(kw['full']) <= self.MAX_PATH_LEN
        if within_nesting and within_length:
            return False  # No folding required.
        kw['bname'] = kw['bname'] + "-id" + kw['hasher'].digest()
        # Honour a per-entry 'root' override; it was read but then ignored.
        kw['full'] = root + self.sep + kw['bname']
        # NOTE(review): 'nesting' is left unchanged, so children of a folded
        # entry exceed maxNesting as well -- confirm that is intended.
        return True

    def mkLinks(self):
        """Create the folded directory and link it with its logical parent.

        Makes the last entry's directory, a ``parent`` symlink inside it
        pointing at the previous entry's directory, and a symlink named
        after the entry inside the previous entry's directory.
        """
        new_fullpath = self[-1]['full']
        # With a single entry there is no previous one; use the root then
        # (``self[-2]`` raised IndexError in that case).
        last_fullpath = self[-2]['full'] if len(self) > 1 else self.root
        bname = self[-1]['bname']
        sep = self.sep
        if not os.path.exists(new_fullpath):
            print("Making directory " + new_fullpath)
            os.makedirs(new_fullpath)
        # TODO: possible bug producing duplicate paths.
        # lexists() also sees dangling symlinks, which exists() reports as
        # absent and os.symlink() would then refuse to overwrite.
        parent_link = new_fullpath + sep + "parent"
        if not os.path.lexists(parent_link):
            os.symlink(last_fullpath, parent_link)
        child_link = last_fullpath + sep + bname
        print(str(child_link))
        if not os.path.lexists(child_link):
            os.symlink(new_fullpath, child_link)
class EncodedPathWatcher(SimplePathWatcher):
    """Watcher that keeps a DictList_For_EncodedPathWatcher stack in ``self.paths``.

    ``on_enter`` pushes the most recently recorded header text as a new path
    entry and ``on_exit`` pops it. Extra keyword arguments are passed to the
    path stack's constructor.
    """

    def __init__(self, root, **kw):
        # https://stackoverflow.com/questions/11179008/python-inheritance-typeerror-object-init-takes-no-parameters
        super(EncodedPathWatcher, self).__init__(root)
        # Forward the root so the path stack does not silently fall back to
        # its own built-in default; None keeps that default.
        if root is not None:
            kw['root'] = root
        self.paths = DictList_For_EncodedPathWatcher([], **kw)

    def on_enter(self, raw_data=None, bname=None, items=None, obj=None):
        """Push a path entry for ``raw_data`` (default: the last header text).

        ``bname``, ``items`` and ``obj`` are accepted but not used.
        """
        if raw_data is None:
            raw_data = self.raw_data
        self.paths.append(raw_data)

    def on_exit(self):
        """Pop the current path entry."""
        self.paths.pop()

    def encode(self, bname):
        """Encode ``bname`` with the path stack's encoder.

        The original read ``self.encoder`` and ``self.bname``, neither of
        which this class defines.
        """
        return self.paths.encoder(bname)

    def defaultEncoder(self, bname):
        """URL-quote ``bname``, leaving ``()?-,'"`` unescaped."""
        return urllib.quote(bname, safe='()?-,' + "'" + '"')
- #def defaultSymLast(
def storeList(path, getItems):
    """Pickle ``getItems()`` into ``list.p`` inside directory ``path``.

    Changes the process working directory to ``path`` as a side effect and
    prints progress information.
    """
    print('storeList')
    os.chdir(str(path))
    print(os.getcwd())
    # Fetch the items before opening, so a failing callback does not leave a
    # truncated file behind; the ``with`` block closes the handle, which was
    # previously never closed.
    items = getItems()
    with open("list.p", "wb") as handle:
        pickle.dump(items, handle)
def mkdir(path):
    """Create directory ``path`` (with parents) unless it already exists."""
    target = str(path)
    if os.path.exists(target):
        return
    print("Making directory " + target)
    os.makedirs(target)
def loadList(path, items):
    """Load ``list.p`` from directory ``path`` into ``items``.

    The contents of ``items`` are replaced in place so the caller's list
    really changes; the original rebound the local name and the loaded data
    was lost. The loaded list is also returned. Changes the process working
    directory to ``path`` as a side effect and prints progress information.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on files
    written by ``storeList``.
    """
    print('loadList')
    print('path=' + str(path))
    os.chdir(str(path))
    print(str(path))
    print(os.getcwd())
    with open("list.p", "rb") as handle:
        loaded = pickle.load(handle)
    if items is not None:
        items[:] = loaded
    return loaded
class BookMarkParser(sgmllib.SGMLParser):
    """SGML parser for a bookmark export built from H3 / DL / A / DD / DT tags.

    ``STATE`` tracks the tag seen most recently, ``items`` collects the
    HTML_Link and Section objects of the list being read, and every watcher
    in ``watchers`` is told about folder headers and about DL open/close.
    A ``watcher`` keyword argument replaces the default EncodedPathWatcher.
    """

    def __init__(self, **kw):
        sgmllib.SGMLParser.__init__(self)
        self.STATE = "__init__"
        self.items = []
        # This is the main watcher, we may add others.
        self.watcher = kw.get('watcher', None)
        if self.watcher is None:
            hooks = dict(
                before_append=lambda path: storeList(path, lambda: self.items),
                on_append=lambda path: mkdir(path),
                before_pop=lambda path: HTMLWriter(self.items).writeList(path),
                after_pop=lambda path: loadList(path, self.items),
            )
            self.watcher = EncodedPathWatcher(root="/root/Downloads/pt", **hooks)
        self.watchers = [self.watcher]

    def NotifyHeaderWatchers(self, data):
        """Pass header text to ``on_Header`` of every watcher."""
        for observer in self.watchers:
            observer.on_Header(data, obj=self)

    def start_h3(self, attributes):
        """Enter 'Started H3', or 'FOLDED' when a folded attribute is present."""
        print('start_H3')
        self.STATE = 'Started H3'
        for name, value in attributes:
            print(name + "=" + value)
            if name == 'folded' or value == 'FOLDED':
                self.STATE = 'FOLDED'

    def handle_data(self, data):
        """Route text according to the current state."""
        print('handleData')
        print("self.STATE=" + self.STATE)
        state = self.STATE
        if state == 'FOLDED':
            # Folder title: hand it to the watchers, then wait for the DL.
            self.NotifyHeaderWatchers(data)
            # https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory-in-python
            self.STATE = "Seeking DL"
        elif state == "A":
            self.A.text = data
        elif state == "DD":
            # Only the first line of the text is kept.
            first_line = data.split('\n')[0]
            self.items.append(Section(first_line))

    def end_h3(self):  # Probably redundant
        print('end_H3')
        self.FOLDED = False

    def start_dl(self, atributes):
        """Tell watchers a list opened and start a fresh ``items`` list."""
        if self.STATE == "__init__":
            return
        print('start_DL')
        for observer in self.watchers:
            observer.on_enter()
        self.items = []

    def end_dl(self):
        """Tell watchers the current list closed."""
        if self.STATE == "__init__":
            return
        print('end_DL')
        for observer in self.watchers:
            observer.on_exit()

    def start_a(self, atributes):
        """Begin a new HTML_Link and copy the tag attributes onto it."""
        if self.STATE == "__init__":
            return
        print('start_A')
        link = HTML_Link()
        self.A = link
        for key, value in atributes:
            setattr(link, key, value)
        self.STATE = 'A'

    def end_a(self):
        """Store the finished link in ``items``."""
        if self.STATE == "__init__":
            return
        print('end_A')
        self.items.append(self.A)
        self.A = None
        self.STATE = 'Ended A'

    def do_dd(self, atributes):
        if self.STATE == "__init__":
            return
        print('do_DD')
        self.STATE = "DD"

    def do_dt(self, atributes):
        if self.STATE == "__init__":
            return
        print('do_DT')
        self.STATE = "DT"
class HTMLWriter:
    """Writes a list of items (objects with ``toHTML``) to ``index.html``."""

    def __init__(self, items):
        self.items = items

    def writeHTMLHeader(self, f):
        """Write the doctype and the opening html/body tags, one per line."""
        print('WriteHTMLHeader')
        for line in ('<!DOCTYPE html>', '<html>', '<body>'):
            f.write("%s\n" % line)

    def writeHTMLFooter(self, f):
        """Write the closing body/html tags, one per line."""
        print('writeHTMLFooter')
        for line in ('</body>', '</html>'):
            f.write("%s\n" % line)

    def writeList(self, path):
        """Write ``index.html`` in directory ``path``.

        Changes the working directory to ``path``. Each item is rendered with
        ``toHTML('<br>')``; a None item produces an "Empty Item" marker.
        """
        print('writeList')
        os.chdir(str(path))
        with open('index.html', 'w') as out:
            self.writeHTMLHeader(out)
            for entry in self.items:
                if entry is None:
                    out.write("<b>Empty Item!!!!</b>")
                    continue
                out.write("%s\n" % entry.toHTML('<br>'))
            self.writeHTMLFooter(out)
# Driver: feed the export file to the parser one line at a time.
p = BookMarkParser()
filename = '/root/Downloads/pearltrees_export.html'
BUFSIZE = 8192  # Only used by the commented-out chunked read below.
# The ``with`` block closes the file even when parsing raises; the handle
# was previously left open.
with open(filename, "r") as f:
    while True:
        #data = f.read(BUFSIZE)
        data = f.readline()
        print('data=' + str(data))
        if not data:
            break
        p.feed(data)
p.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement