Advertisement
Guest User

mmapped csv

a guest
Nov 23rd, 2014
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.78 KB | None | 0 0
import codecs
import cStringIO
import csv
import mmap
import os
  4.  
  5. class CsvIter(Object):
  6.  
  7.     def __init__(self, csvFileName):
  8.    
  9.         self.csvFileName = csvFileName
  10.         self.fileEncoding_ = self._get_encoding(self.csvFileName)
  11.         self.csvfile = codecs.open(self.csvFileName, "r+", encoding="utf-8")
  12.         self.data = self.mapfile(self.csvfile)
  13.         self.lookup = self._get_row_lookup(self.data)
  14.  
  15.     def __getitem__(self, key):
  16.         """return an item from a memory-mapped csv file"""
  17.         try:
  18.             if key == 0:
  19.                 start, end = 0, self.lookup[key]
  20.             else:
  21.                 start, end = self.lookup[key - 1], self.lookup[key]
  22.         except KeyError:
  23.             raise IndexError("index out of range")
  24.         return next(csv.reader(cStringIO.StringIO(self.data[start: end])))
  25.  
  26.     def mapfile(self, fileObj):
  27.         size = os.path.getsize(fileObj.name)
  28.         return mmap.mmap(fileObj.fileno(), size)
  29.  
  30.     def _get_row_lookup(self, data):
  31.         lino, record_start, lookup = 0, 0, {}
  32.         while True:
  33.             line = data.readline()
  34.             record_start += len(line)
  35.             lookup[lino] = record_start
  36.             lino += 1
  37.             if not line:
  38.                 break
  39.         return lookup
  40.  
  41.     # source: https://docs.python.org/2/library/csv.html (bottom of page)
  42.     def unicode_csv_reader(self, unicode_csv_data, dialect, encoding, **kwargs):
  43.         csv_reader = csv.reader(self.utf_8_encoder(unicode_csv_data, encoding),
  44.                                 dialect=dialect, **kwargs)
  45.         for row in csv_reader:
  46.             yield [unicode(cell, encoding) for cell in row]
  47.  
  48.     def utf_8_encoder(self, unicode_csv_data, encoding):
  49.         for line in unicode_csv_data:
  50.             yield line.encode(encoding)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement