Advertisement
Guest User

Untitled

a guest
Apr 21st, 2019
169
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Nim 4.68 KB | None | 0 0
  1. import parsecsv, matrix, strutils
  2. export matrix
  3.  
  4. type
  5.    DType = enum f32, f64, i32, i64, str
  6.    DFCol = object
  7.       name: string
  8.       dtypeCounts: array[DType, int]
  9.       case dtype: DType
  10.       of f32: f32s: seq[float32]
  11.       of f64: f64s: seq[float64]
  12.       of i32: i32s: seq[int32]
  13.       of i64: i64s: seq[int64]
  14.       of str: strs: seq[string]
  15.    DF = object
  16.       cols: seq[DFCol]
  17.    DFError = object of Exception
  18.    CsvParams = object
  19.       sep*, quote*, escape*: char
  20.       skipInitialSpace*, hasHeader*: bool
  21.    # CsvInfo = object
  22.    #    rows*, columns*: int
  23.    #    header*: seq[string]
  24.    # Csv = object
  25.    #    params: CsvParams
  26.    #    info: CsvInfo
  27.    #    rows: seq[seq[string]]
  28.  
  29. template dfAssert(cond, msg) =
  30.    if not cond:
  31.       {.line: instantiationInfo().}:
  32.          raise newException(DFError, msg)
  33.  
  34. const
  35.    axisCol* = 0
  36.    axisRow* = 1
  37.  
  38. proc dtypeOf*(T: typedesc): DType =
  39.    when T is float32: f32
  40.    elif T is float64: f64
  41.    elif T is int32: i32
  42.    elif T is int64: i64
  43.    elif T is string: str
  44.    else: static: doAssert false
  45.  
  46. proc len*(col: DFCol): int =
  47.    case col.dtype
  48.    of f32: col.f32s.len
  49.    of f64: col.f64s.len
  50.    of i32: col.i32s.len
  51.    of i64: col.i64s.len
  52.    of str: col.strs.len
  53.  
  54. proc initCsvParams*(
  55.       sep = ',',
  56.       quote = '\"',
  57.      escape = '\x00',
  58.      skipInitialSpace = false,
  59.      hasHeader = false,
  60.      ): CsvParams =
  61.   CsvParams(
  62.      sep: sep,
  63.      quote: quote,
  64.      escape: escape,
  65.      skipInitialSpace: skipInitialSpace,
  66.      hasHeader: hasHeader)
  67.  
  68. proc initDFCol*(dtype: static DType, size = 0, name = ""): DFCol =
  69.   when dtype == f32:
  70.      DFCol(dtype: dtype, name: name, f32s: newSeq[float32](size))
  71.   elif dtype == f64:
  72.      DFCol(dtype: dtype, name: name, f64s: newSeq[float64](size))
  73.   elif dtype == i32:
  74.      DFCol(dtype: dtype, name: name, i32s: newSeq[int32](size))
  75.   elif dtype == i64:
  76.      DFCol(dtype: dtype, name: name, i64s: newSeq[int64](size))
  77.   elif dtype == str:
  78.      DFCol(dtype: dtype, name: name, strs: newSeq[string](size))
  79.  
  80. proc parseDType(s: string): DType =
  81.   try:
  82.      let i = parseBiggestInt(s)
  83.      result = if i >= low(int32) and i <= high(int32): i32 else: i64
  84.   except ValueError:
  85.      try:
  86.         let f = parseFloat(s)
  87.         result = if f >= low(float32) and f <= high(float32): f32 else: f64
  88.      except ValueError:
  89.         result = str
  90.  
  91. proc readCsv*(filename: string, params: CsvParams): DF =
  92.   var parser: CsvParser
  93.   parser.open(
  94.      filename,
  95.      params.sep,
  96.      params.quote,
  97.      params.escape,
  98.      params.skipInitialSpace)
  99.   defer: parser.close()
  100.   if params.hasHeader:
  101.      dfAssert(parser.readRow(), "Failed to read header row.")
  102.      for column in parser.row:
  103.         result.cols.add(initDFCol(str, name = column))
  104.   while parser.readRow(result.cols.len):
  105.      if unlikely(result.cols.len == 0):
  106.         for column in parser.row:
  107.            result.cols.add(initDFCol(str))
  108.      for i, s in parser.row:
  109.         result.cols[i].dtypeCounts[parseDType(s)] += 1
  110.         result.cols[i].strs.add(s)
  111.  
  112. proc names*(df: DF): seq[string] =
  113.   for col in df.cols:
  114.      result.add(col.name)
  115.  
  116. proc shape*(df: DF): (int, int) =
  117.   if df.cols.len == 0: (0, 0) else: (df.cols[0].len, df.cols.len)
  118.  
  119. proc dtypeCounts*(df: DF): seq[array[DType, int]] =
  120.   for col in df.cols:
  121.      result.add(col.dtypeCounts)
  122. # proc filter(row: seq[string], ignoreCols: openarray[int]): seq[string] =
  123. #    for i in 0..<row.len:
  124. #       if i notin ignoreCols:
  125. #          result.add(row[i])
  126.  
  127. # proc readCsvInfo*(
  128. #       filename: string,
  129. #       params: CsvParams,
  130. #       ignoreCols: openarray[int] = []
  131. #       ): CsvInfo =
  132. #    initCsv(filename, params, false, ignoreCols).info
  133.  
  134. # proc parseAs(str: string, T: typedesc[SomeFloat]): T = parseFloat(str)
  135. # proc parseAs(str: string, T: typedesc[SomeInteger]): T = parseInt(str)
  136. # proc parseAs(str: string, T: typedesc[string]): T = str
  137.  
  138. # proc readCsv*[T](
  139. #       filename: string,
  140. #       params: CsvParams,
  141. #       ignoreCols: openarray[int] = []
  142. #       ): SimpleMatrix[T] =
  143. #    let csv = initCsv(filename, params, true, ignoreCols)
  144. #    result = newSimpleMatrix[T](csv.info.rows, csv.info.columns)
  145. #    for row in 0..<csv.info.rows:
  146. #       for col in 0..<csv.info.columns:
  147. #          result[row, col] = csv.rows[row][col].parseAs(T)
  148.  
  149. # proc readCsvCol*[T](
  150. #       filename: string,
  151. #       params: CsvParams,
  152. #       col: int
  153. #       ): seq[T] =
  154. #    var parser = openScopedParser(filename, params)
  155. #    if params.hasHeader: doAssert(parser.readRow(), "Csv header read failed.")
  156. #    while parser.readRow(): result.add(parser.row[col].parseAs(T))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement