datasetAcquisition.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. import numpy
  2. import os
  3. import sys
  4. import csv
  5. import scipy.io
  6. import pickle
  7. import time
  8. import datetime
  9. import glob
  10. import helperFunctions
  11. ###
  def readData(configFile):
      """Load the dataset named in the config file and sanity-check it.

      The dataset name is fetched with helperFunctions.getConfig (section
      'data', key 'dataset'). Only 'uci' is handled, which delegates to
      readUCIcsv.

      Returns:
          The (x, y) pair produced by the reader.

      Raises:
          Exception: if the dataset name is not recognised, if x and y do not
              have the same number of rows, or if either contains a
              non-finite value.
      """
      dataset = helperFunctions.getConfig(configFile, 'data', 'dataset', None, 'str', True)

      # Guard clause: 'uci' is the only reader wired up here.
      if dataset != 'uci':
          raise Exception('Unknown dataset %s!'%dataset)
      x, y = readUCIcsv(configFile)

      # One label row is expected per data row.
      numData = x.shape[0]
      numLabels = y.shape[0]
      if numData != numLabels:
          raise Exception('#data = {} != #labels = {}'.format(numData, numLabels))

      # Reject NaN / +-inf before the data reaches any consumer.
      featuresFinite = numpy.all(numpy.isfinite(x))
      if not featuresFinite:
          raise Exception('not numpy.all(numpy.isfinite(x))')
      labelsFinite = numpy.all(numpy.isfinite(y))
      if not labelsFinite:
          raise Exception('not numpy.all(numpy.isfinite(y))')

      return x, y
  25. ###
  def readUCIcsv(configFile):
      """Read a delimited text file into a feature matrix and a label column.

      Every setting is fetched with helperFunctions.getConfig from the 'data'
      section of configFile:

          dataFileName        path of the file to read
          labelCol            0-based column index holding the label
          forbiddenCols       column indices to ignore (may be None)
          delimiter           field separator; an empty string means a blank
          quoteChar           quote character handed to csv.reader
          firstDataRowNumber  number of leading rows to skip
          normalizeFeatures   'uci', or a value whose int() is > 0

      Every column that is neither forbidden nor the label is parsed with
      float() and becomes a feature. A forbidden column is skipped even if it
      is also labelCol. Blank lines among the data rows are ignored.

      Returns:
          (x, y): numpy matrices of dtype float with shapes (n, d) and (n, 1),
          or (None, None) when the file holds no data rows. x is passed
          through helperFunctions.normalizeUCI / normalizeLP first when
          normalizeFeatures asks for it.

      Raises:
          ValueError: if a cell cannot be parsed as float, if a data row has
              no (non-forbidden) label column, or if the data rows do not all
              have the same number of features.
      """
      dataFileName = helperFunctions.getConfig(configFile, 'data', 'dataFileName', None, 'str', True)
      labelCol = helperFunctions.getConfig(configFile, 'data', 'labelCol', None, 'int', True)
      forbiddenCols = helperFunctions.getConfig(configFile, 'data', 'forbiddenCols', None, 'intList', True)
      delimiter = helperFunctions.getConfig(configFile, 'data', 'delimiter', ',', 'str', True)
      quoteChar = helperFunctions.getConfig(configFile, 'data', 'quoteChar', '|', 'str', True)
      firstDataRowNumber = helperFunctions.getConfig(configFile, 'data', 'firstDataRowNumber', 1, 'int', True)
      normalizeFeatures = helperFunctions.getConfig(configFile, 'data', 'normalizeFeatures', -1, 'str', True)

      if delimiter == '':
          # csv.reader needs a one-character delimiter; treat "empty" as a blank.
          delimiter = ' '

      # Build the lookup once instead of scanning the list for every cell.
      # NOTE(review): assumes forbiddenCols is an iterable of ints or None -- confirm in getConfig.
      skipCols = frozenset(forbiddenCols) if forbiddenCols is not None else frozenset()

      # The csv module wants a text stream opened with newline='' on Python 3
      # but a binary stream on Python 2.
      if sys.version_info[0] >= 3:
          csvFile = open(dataFileName, 'r', newline='')
      else:
          csvFile = open(dataFileName, 'rb')

      featureRows = []
      labels = []
      # 'with' guarantees the file is closed even if a cell fails to parse.
      with csvFile:
          csvReader = csv.reader(csvFile, delimiter=delimiter, quotechar=quoteChar)
          for rowIdx, row in enumerate(csvReader):
              if rowIdx < firstDataRowNumber:
                  continue  # header / preamble rows
              if not row:
                  continue  # blank line: carries no data
              xRow = []
              yRow = None
              for colIdx, cell in enumerate(row):
                  if colIdx in skipCols:
                      continue
                  if colIdx == labelCol:
                      yRow = float(cell)
                  else:
                      xRow.append(float(cell))
              if yRow is None:
                  # Without this check the previous row's label would be reused silently.
                  raise ValueError('row {} of {} has no label column {}'.format(rowIdx, dataFileName, labelCol))
              featureRows.append(xRow)
              labels.append(yRow)

      # Convert once at the end; growing a matrix row by row copies it every time.
      x = numpy.asmatrix(featureRows, dtype=float) if featureRows else None
      # A flat list becomes a 1 x n matrix, so transpose to get one label per row.
      y = numpy.asmatrix(labels, dtype=float).T if labels else None

      if normalizeFeatures == 'uci':
          x = helperFunctions.normalizeUCI(x)
      elif int(normalizeFeatures) > 0:
          x = helperFunctions.normalizeLP(x, int(normalizeFeatures))
      return x, y