Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class SqliteUnicode:
    """Accent- and case-insensitive text helpers for SQLite.

    The methods take and return plain values (strings or None), which
    suggests they are registered as SQLite user functions / collations;
    the registration itself is not part of this class.

    Attributes:
        collation: base letter(s) -> string of accented variants.
        collation_ci: character -> string holding every character that
            should be considered equal to it (both cases, base letter
            included).  Used to build regex character classes.
        remove_accents: accented character -> unaccented replacement,
            plus a few punctuation characters mapped to "".
        reg_exp: cache of compiled regexes, keyed by LIKE pattern.
        re1 .. re5: precompiled patterns used by collate().

    The code only uses constructs that exist on both Python 2 and 3.
    """

    def __init__(self):
        self.collation = {
            u"a": u"àáâãäåą",
            u"e": u"eèéêëęε",
            u"i": u"iìíîï",
            u"o": u"òóôõöøő",
            u"u": u"uûü",
            u"c": u"cç",
            u"l": u"ł",
            u"n": u"ñńŋ",
            u"r": u"ř",
            u"s": u"sš",
            u"y": u"ýÿ",
            u"z": u"źżž",
            u"ae": u"æÆ",
            u"oe": u"œŒ",
        }
        self.collation_ci = {}
        self.reg_exp = {}
        # Accented character -> base letter(s).  Starts empty on purpose:
        # seeding it with a copy of `collation` made plain letters such as
        # "a" expand to their whole accent list in replace_accents().
        self.remove_accents = {}

        for base, variants in self.collation.items():
            members = variants + variants.upper()
            if len(base) == 1:
                # Several rows (a, o, l, n, ...) do not list the base letter
                # itself; add it so that "a" and "à" really are equivalent.
                # Digraph keys ("ae", "oe") cannot go in a character class.
                members = base + base.upper() + members
            for letter in members:
                self.collation_ci[letter] = members
            for letter in variants:
                self.remove_accents[letter] = base

        # remove some other characters
        for letter in u"()[]<>…":
            self.remove_accents[letter] = ""

        # Regular expressions used by collate(); compiled once to improve
        # performance.
        self.re1 = re.compile(u"[àâä]")
        self.re2 = re.compile(u"[éèêë]")
        self.re3 = re.compile(u"[îï]")
        self.re4 = re.compile(u"[ôö]")
        self.re5 = re.compile(u"[ûü]")

    def replace_accents(self, string):
        """Return `string` with accented characters replaced.

        Each character found in `self.remove_accents` is replaced by its
        mapped value (e.g. é -> e, œ -> oe, "(" -> ""); every other
        character is kept unchanged.
        """
        return "".join(self.remove_accents.get(ch, ch) for ch in string)

    def convert_to_regex(self, data):
        """Translate the LIKE pattern `data` into a compiled regex.

        "%" becomes ".*"; characters listed in `self.collation_ci` become
        a character class of all their equivalents; everything else is
        matched literally.  "_" is NOT treated as a wildcard.  Unless the
        pattern ends with "%", the regex is anchored at the end of the
        string.  Results are cached in `self.reg_exp`.

        Raises:
            re.error: if the generated expression cannot be compiled.
        """
        cached = self.reg_exp.get(data)
        if cached is not None:
            return cached

        # i: ignore case, s: "." also matches newlines, u: Unicode-aware
        # case folding (without it Python 2 only folds ASCII letters).
        parts = ["(?isu)"]
        for c in data:
            if c == "%":
                parts.append(".*")
            elif c in self.collation_ci:
                parts.append("[" + self.collation_ci[c] + "]")
            else:
                # re.escape covers every regex metacharacter (?, +, ^, {,
                # |, \ ...), not just a hand-picked subset.
                parts.append(re.escape(c))
        if not data.endswith("%"):
            # \Z matches only at the very end of the string, whereas "$"
            # also matches just before a newline.
            parts.append("\\Z")
        reg_string = "".join(parts)

        try:
            comp_reg = re.compile(reg_string)
        except re.error:
            print("compile error %s" % reg_string)
            raise
        self.reg_exp[data] = comp_reg
        return comp_reg

    def like(self, a, b):
        """Return True if value `b` matches LIKE pattern `a`.

        Matching is done with the regex built by convert_to_regex().
        Returns False when either argument is None or when matching fails
        with an exception (the failure is printed).

        Timing note kept from the original author, on a 80000 rows table:
        the original like takes 0.032s, this one 0.25s (about 8x).
        """
        if a is None or b is None:
            return False
        try:
            return self.convert_to_regex(a).match(b) is not None
        except Exception:
            print("echec de like :  %s / %s" % (a, b))
            utils.printExcept()
            return False

    def like2(self, a, b):
        """Return True if `b` starts with pattern `a`, ignoring case/accents.

        Both arguments are lower-cased and passed through
        replace_accents(); every "%" is then removed from the pattern and
        the result is compared with the beginning of the value.
        Returns False when either argument is None or on any exception.
        """
        if a is None or b is None:
            return False
        try:
            a = self.replace_accents(a.lower())
            b = self.replace_accents(b.lower())
            a = a.replace("%", "")
            return b.startswith(a)
        except Exception:
            print("echec de like :  %s / %s" % (a, b))
            utils.printExcept()
            return False

    def regexp(self, a, b):
        """Return True if regex `a` matches at the start of `b`.

        Returns False when either argument is None.
        """
        if a is None or b is None:
            return False
        return re.match(a, b) is not None

    def _fold_for_collation(self, text):
        """Lower-case `text` and strip the accents handled by re1..re5."""
        if isinstance(text, bytes):
            # Byte strings are decoded as UTF-8; text strings are used
            # as they are.
            text = text.decode("utf_8")
        text = text.lower()
        substitutions = (
            (self.re1, "a"),
            (self.re2, "e"),
            (self.re3, "i"),
            (self.re4, "o"),
            (self.re5, "u"),
        )
        for pattern, plain in substitutions:
            text = pattern.sub(plain, text)
        return text

    def collate(self, string1, string2):
        """Compare two strings ignoring case and a few common accents.

        Returns -1, 0 or 1 depending on whether the folded `string1`
        sorts before, equal to or after the folded `string2`.
        """
        key1 = self._fold_for_collation(string1)
        key2 = self._fold_for_collation(string2)
        # Portable replacement for cmp(), which Python 3 removed.
        return (key1 > key2) - (key1 < key2)

    def concat_ws(self, *args):
        """Join args[1:] with separator args[0], skipping empty values.

        Falsy values (None, but also "" and 0) are left out.  Returns
        None when called without arguments or with a None separator.
        """
        if not args or args[0] is None:
            return None
        separator = args[0]
        return separator.join([item for item in args[1:] if item])

    def clean_commas(self, data1):
        """Return the digit runs found in `data1`, joined by commas.

        A falsy `data1` (None or "") is returned unchanged.
        """
        if not data1:
            return data1
        return ",".join(re.findall("[0-9]+", data1))

    def date2year(self, date1):
        """Extract the year from a day-month-year date string.

        Anything after the first comma is dropped, then the text is split
        on "-" (or on "/" when there is no "-").  When that yields three
        parts the third one is returned; otherwise the text before the
        comma is returned as is.  None is returned unchanged.
        """
        if date1 is None:
            return date1
        date1 = date1.split(",")[0]  # Delete all after a comma
        fields = date1.split("-")
        if len(fields) < 2:
            fields = date1.split("/")
        if len(fields) == 3:
            return fields[2]
        return date1

    def date2ymd(self, date1):
        """Convert a d-m-y (or d/m/y) date to y-m-d to allow sorting.

        Day and month are zero-padded to two digits and the original
        separator is kept.  Input that does not split into three parts,
        and None, are returned unchanged.
        """
        if date1 is None:
            return date1
        separator = "-"
        fields = date1.split(separator)
        if len(fields) < 3:
            separator = "/"
            fields = date1.split(separator)
        if len(fields) != 3:
            return date1
        day, month, year = fields
        if len(day) == 1:
            day = "0" + day
        if len(month) == 1:
            month = "0" + month
        return year + separator + month + separator + day
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement