# run=0: Choose not to run h2oaiglm and just pass through all other cells
# run=1: Run h2oaiglm (the anim flag below controls the animation)
# Requirements: python3.5
run=1

# anim=0: Don't show animation
# anim=1: Do show animation
# pip install pandas psutil matplotlib --user
# pip install -e git+https://github.com/fbcotter/py3nvml#egg=py3nvml --user
anim=1

PWD = !pwd
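# Note: `PWD = !pwd` is IPython shell-capture syntax, so this listing is meant to be
# run as Jupyter notebook cells rather than as a plain Python script; PWD holds the
# working directory as a (one-element) list of strings.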


import sys
import os.path
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')


import pymapd

con = pymapd.connect(user="mapd", password="HyperInteractive", host="localhost", dbname="mapd")
con
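# pymapd.connect opens a connection to the local MapD server over its Thrift interface
# using the default credentials above; evaluating `con` simply echoes the connection
# details as a sanity check.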

columns = """
INCEARN,RECTYPE,IPUMS_YEAR,DATANUM,SERIAL,NUMPREC,SUBSAMP,HHWT,HHTYPE,REPWT,ADJUST,CPI99,REGION,STATEICP,STATEFIP,COUNTY,COUNTYFIPS,METRO,METAREA,METAREAD,MET2013,MET2013ERR,CITY,CITYERR,CITYPOP,PUMA,PUMARES2MIG,STRATA,PUMASUPR,CONSPUMA,CPUMA0010,APPAL,APPALD,HOMELAND,MET2003,CNTRY,GQ,GQTYPE,GQTYPED,FARM,OWNERSHP,OWNERSHPD,MORTGAGE,MORTGAG2,COMMUSE,FARMPROD,ACREHOUS,MORTAMT1,MORTAMT2,TAXINCL,INSINCL,PROPINSR,PROPTX99,OWNCOST,RENT,RENTGRS,RENTMEAL,CONDOFEE,MOBLHOME,MOBLHOM2,MOBLOAN,SECRES,SECRESMO,SECRESRE,COSTELEC,COSTGAS,COSTWATR,COSTFUEL,PUBHOUS,RENTSUB,HEATSUB,LUNCHSUB,FOODSTMP,FDSTPAMT,VALUEH,LINGISOL,VACANCY,KITCHEN,KITCHENORIG,FRIDGE,FRIDGEORIG,SINK,STOVE,ROOMS,ROOMSORIG,PLUMBING,HOTWATER,SHOWER,TOILET,BUILTYR,BUILTYR2,UNITSSTR,BEDROOMS,BEDROOMSORIG,PHONE,PHONEORIG,CILAPTOP,CIHAND,CIOTHCOMP,CINETHH,CIMODEM,CISAT,CIDSL,CIFIBER,CIBRDBND,CIDIAL,CIOTHSVC,FUELHEAT,VEHICLES,SSMC,NFAMS,NSUBFAM,NCOUPLES,NMOTHERS,NFATHERS,MULTGEN,MULTGEND,CBNSUBFAM,REPWT1,REPWT2,REPWT3,REPWT4,REPWT5,REPWT6,REPWT7,REPWT8,REPWT9,REPWT10,REPWT11,REPWT12,REPWT13,REPWT14,REPWT15,REPWT16,REPWT17,REPWT18,REPWT19,REPWT20,REPWT21,REPWT22,REPWT23,REPWT24,REPWT25,REPWT26,REPWT27,REPWT28,REPWT29,REPWT30,REPWT31,REPWT32,REPWT33,REPWT34,REPWT35,REPWT36,REPWT37,REPWT38,REPWT39,REPWT40,REPWT41,REPWT42,REPWT43,REPWT44,REPWT45,REPWT46,REPWT47,REPWT48,REPWT49,REPWT50,REPWT51,REPWT52,REPWT53,REPWT54,REPWT55,REPWT56,REPWT57,REPWT58,REPWT59,REPWT60,REPWT61,REPWT62,REPWT63,REPWT64,REPWT65,REPWT66,REPWT67,REPWT68,REPWT69,REPWT70,REPWT71,REPWT72,REPWT73,REPWT74,REPWT75,REPWT76,REPWT77,REPWT78,REPWT79,REPWT80,RESPMODE,PERNUM,PERWT,SLWT,REPWTP,FAMSIZE,NCHILD,NCHLT5,FAMUNIT,ELDCH,YNGCH,NSIBS,MOMLOC,STEPMOM,MOMRULE,POPLOC,STEPPOP,POPRULE,SPLOC,SPRULE,SUBFAM,SFTYPE,SFRELATE,CBSUBFAM,CBSFTYPE,CBSFRELATE,RELATE,RELATED,SEX,AGE,AGEORIG,BIRTHQTR,MARST,BIRTHYR,MARRNO,MARRINYR,YRMARR,DIVINYR,WIDINYR,FERTYR,RACE,RACED,HISPAN,HISPAND,BPL,BPLD,ANCESTR1,ANCESTR1D,ANCESTR2,ANCESTR2D,CITIZEN,YRNATUR,YRIMMIG,YRSUSA1,YRSUSA2,SPOKEN_LANGUAGE,LANGUAGED,SPEAKENG,TRIBE,TRIBED,RACESING,RACESINGD,RACAMIND,RACASIAN,RACBLK,RACPACIS,RACWHT,RACOTHER,RACNUM,SCHOOL,EDUC,EDUCD,GRADEATT,GRADEATTD,SCHLTYPE,DEGFIELD,DEGFIELDD,DEGFIELD2,DEGFIELD2D,EMPSTAT,EMPSTATD,LABFORCE,OCC,OCC1950,OCC1990,OCC2010,IND,IND1950,IND1990,CLASSWKR,CLASSWKRD,OCCSOC,INDNAICS,WKSWORK1,WKSWORK2,UHRSWORK,WRKLSTWK,ABSENT,LOOKING,AVAILBLE,WRKRECAL,WORKEDYR,POVERTY,OCCSCORE,SEI,HWSEI,PRESGL,PRENT,ERSCOR50,ERSCOR90,EDSCOR50,EDSCOR90,NPBOSS50,NPBOSS90,MIGRATE1,MIGRATE1D,MIGPLAC1,MIGMET1,MIGTYPE1,MIGCITY1,MIGPUMS1,MIGPUMA1,MOVEDIN,MOVEDINORIG,DISABWRK,VETDISAB,DIFFREM,DIFFPHYS,DIFFMOB,DIFFCARE,DIFFSENS,DIFFEYE,DIFFHEAR,VETSTAT,VETSTATD,VET01LTR,VET95X00,VET90X01,VET90X95,VET75X90,VET80X90,VET75X80,VETVIETN,VET55X64,VETKOREA,VET47X50,VETWWII,VETOTHER,VETYRS,PWSTATE2,PWMETRO,PWCITY,PWTYPE,PWPUMA00,PWPUMAS,TRANWORK,CARPOOL,RIDERS,TRANTIME,DEPARTS,ARRIVES,GCHOUSE,GCMONTHS,GCRESPON,PROBAI,PROBAPI,PROBBLK,PROBOTH,PROBWHT,REPWTP1,REPWTP2,REPWTP3,REPWTP4,REPWTP5,REPWTP6,REPWTP7,REPWTP8,REPWTP9,REPWTP10,REPWTP11,REPWTP12,REPWTP13,REPWTP14,REPWTP15,REPWTP16,REPWTP17,REPWTP18,REPWTP19,REPWTP20,REPWTP21,REPWTP22,REPWTP23,REPWTP24,REPWTP25,REPWTP26,REPWTP27,REPWTP28,REPWTP29,REPWTP30,REPWTP31,REPWTP32,REPWTP33,REPWTP34,REPWTP35,REPWTP36,REPWTP37,REPWTP38,REPWTP39,REPWTP40,REPWTP41,REPWTP42,REPWTP43,REPWTP44,REPWTP45,REPWTP46,REPWTP47,REPWTP48,REPWTP49,REPWTP50,REPWTP51,REPWTP52,REPWTP53,REPWTP54,REPWTP55,REPWTP56,REPWTP57,REPWTP58,REPWTP59,REPWTP60,REPWTP61,REPWTP62,REPWTP63,REPWTP64,REPWTP65,REPWTP66,REPWTP67,REPWTP68,REPWTP69,REPWTP70,REPWTP71,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80
""".strip()
print(len(columns.split(',')))
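# Sanity check: this prints the number of comma-separated column names requested above
# (several hundred IPUMS variables, with INCEARN, the earnings response, listed first).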


#query = "SELECT {} FROM ipums_easy WHERE INCEARN > 100;".format(columns)
# ensure sql query is deterministic
query = "SELECT {} FROM ipums_easy WHERE INCEARN > 100 order by SERIAL;".format(columns)
print('Query is : ' + query)

# always use True for is columnar
df = con.select_ipc_gpu(query)

type(df)


df.head().to_pandas()
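# select_ipc_gpu executes the query and ships the result set straight into GPU memory,
# returning a pygdf GPU DataFrame (pygdf is the predecessor of today's cuDF);
# .to_pandas() copies the first few rows back to the host just for display.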


import numpy as np

df['INCEARN'].mean()

num_cols = set()
cat_cols = set()
response_set = set(['INCEARN'])
feature_names = set(df.columns) - response_set


uniques = {}
for k in feature_names:
    try:
        uniquevals = df[k].unique_k(k=1000)
        uniques[k] = uniquevals
    except ValueError:
        # more than 1000 unique values
        num_cols.add(k)
    else:
        # within 1000 unique values
        nunique = len(uniquevals)
        if nunique < 2:
            del df[k]  # drop constant column
        elif 1 < nunique < 1000:
            cat_cols.add(k)  # as cat column
        else:
            num_cols.add(k)  # as num column

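# unique_k(k=1000) returns up to 1000 distinct values per column and raises ValueError
# when there are more, so high-cardinality columns fall through to the numeric set,
# low-cardinality columns are treated as categoricals, and constant columns are dropped.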


for k in (num_cols - response_set):
    df[k] = df[k].fillna(df[k].mean())
    assert df[k].null_count == 0
    std = df[k].std()
    # drop near constant columns
    if not np.isfinite(std) or std < 1e-4:
        del df[k]
        print('drop near constant', k)
    else:
        df[k] = df[k].scale()

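# Numeric features are mean-imputed and then rescaled on the GPU; pygdf's scale()
# performs min-max scaling to the [0, 1] range. Columns whose standard deviation is
# effectively zero are dropped instead, since they carry no signal and would break the scaling.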


for k in cat_cols:
    cats = uniques[k][1:]  # drop first
    df = df.one_hot_encoding(k, prefix=k, cats=cats)
    del df[k]

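# one_hot_encoding() adds one 0/1 indicator column per listed category, after which the
# original categorical column is deleted; the first category is skipped ("drop first")
# so the dummy columns are not collinear with the intercept column added below.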


nrows = len(df)
print(nrows)
df['intercept'] = np.ones(nrows, dtype=np.float64)



df['INCEARN'] = df['INCEARN'].astype(np.float64)



df.dtypes
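# The response and the explicit intercept column are stored as float64; the set of
# dtypes shown by df.dtypes is what the precision check further below uses to decide
# between single- and double-precision solves.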


# Fraction for train (test is 1-FRACTION)
FRACTION=0.8
validFraction=1.0-FRACTION
n_train = int(len(df) * FRACTION)
print('{:.0%} of {} is {}'.format(FRACTION, len(df), n_train))
train_df = df.loc[:n_train]
if FRACTION<1.0:
    test_df = df.loc[n_train:]
    print('train_df has {} rows | test_df has {} rows'.format(len(train_df), len(test_df)))
else:
    print('train_df has {} rows | test_df has {} rows'.format(len(train_df), 0))
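# Because the SQL query is ordered by SERIAL, this positional split is deterministic:
# the same rows end up in the train and test partitions on every run.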



train_data_mat = train_df.as_gpu_matrix(columns=df.columns[1:])
train_result_mat = train_df.as_gpu_matrix(columns=[df.columns[0]])
if FRACTION<1.0:
    test_data_mat = test_df.as_gpu_matrix(columns=df.columns[1:])
    test_result_mat = test_df.as_gpu_matrix(columns=[df.columns[0]])

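# as_gpu_matrix() materializes the selected columns as a dense matrix that stays in GPU
# memory. Column 0 is INCEARN (the response), so columns[1:] form the feature matrix
# and column 0 alone forms the target vector.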


print(train_df['INCEARN'].mean())
if FRACTION<1.0:
    print(test_df['INCEARN'].mean())



print(train_data_mat.shape)
print(train_result_mat.shape)
if FRACTION<1.0:
    print(test_data_mat.shape)
    print(test_result_mat.shape)


train_data_mat_ptr = train_data_mat.device_ctypes_pointer
train_result_mat_ptr = train_result_mat.device_ctypes_pointer
print('train_data_mat_ptr address', hex(train_data_mat_ptr.value))
print('train_result_mat_ptr address', hex(train_result_mat_ptr.value))
if FRACTION<1.0:
    test_data_mat_ptr = test_data_mat.device_ctypes_pointer
    test_result_mat_ptr = test_result_mat.device_ctypes_pointer
    print('test_data_mat_ptr address', hex(test_data_mat_ptr.value))
    print('test_result_mat_ptr address', hex(test_result_mat_ptr.value))
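# device_ctypes_pointer exposes the raw CUDA device address of each matrix; these
# addresses are wrapped in c_void_p below and handed directly to the native H2O AI GLM
# solver, so the data never has to be copied back to host memory.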



import os
os.getcwd()


# Load H2OAIGLM
import h2oaiglm as h2oaiglm
from ctypes import *
import time
if anim==1:
    import pandas as pd

a=c_void_p(train_data_mat_ptr.value)
b=c_void_p(train_result_mat_ptr.value)
if FRACTION<1.0:
    c=c_void_p(test_data_mat_ptr.value)
    d=c_void_p(test_result_mat_ptr.value)
else:
    c=c_void_p(0)
    d=c_void_p(0)
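# a/b are the training features and response, c/d the validation features and response;
# when no validation split was made they are passed as null pointers.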


def new_alpha(row_fold):
    if row_fold == 0:
        return -0.025
    elif row_fold == 1:
        return -0.05
    elif row_fold == 3:
        return 0.025
    elif row_fold == 4:
        return 0.05
    else:
        return 0
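# new_alpha() gives each cross-validation fold a small vertical offset on the alpha
# axis so that models from different folds do not overplot each other in the scatter
# plot drawn by plot_glm_results() below; it does not affect the fitted models.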

def plot_cpu_perf(axis, cpu_labels, cpu_snapshot):
    axis.cla()
    axis.grid(False)
    axis.set_ylim([0,100])
    axis.set_ylabel('Percent', labelpad=2, fontsize = 14)
    axis.bar(cpu_labels, cpu_snapshot, color='dodgerblue')
    axis.set_title('CPU Utilization', fontsize = 16)

def plot_gpu_perf(axis, gpu_labels, gpu_snapshot):
    axis.cla()
    axis.grid(False)
    axis.set_ylim([0,100])
    axis.set_xticks(gpu_labels)
    axis.set_ylabel('Percent', labelpad=2, fontsize = 14)
    axis.bar(gpu_labels, gpu_snapshot, width=0.5, color='limegreen')
    axis.set_title('GPU Utilization', fontsize = 16)

def plot_glm_results(axis, results, best_rmse, cb):
    axis.cla()
    axis.set_xscale('log')
    axis.set_xlim([0.1, 1e9])
    axis.set_ylim([-0.12, 1.12])
    axis.set_yticks([x/7. for x in range(0,8)])
    axis.set_ylabel('Parameter 1: '+r'$\alpha$', fontsize = 16)
    axis.set_xlabel('Parameter 2: '+r'$\lambda$', fontsize = 16)
    num_models = min(4000, int(4000*results.shape[0]/2570))
    axis.set_title('Elastic Net Models Trained and Evaluated: ' + str(num_models), fontsize = 16)

    try:
        cm = ListedColormap(sns.color_palette("RdYlGn", 10).as_hex())
        cf = axis.scatter(results['lambda'], results['alpha_prime'], c=results['rel_acc'],
                          cmap=cm, vmin=0, vmax=1)
        axis.plot(best_rmse['lambda'], best_rmse['alpha_prime'], 'o',
                  ms=15, mec='k', mfc='none', mew=2)

        if not cb:
            cb = pl.colorbar(cf, ax=axis)
            cb.set_label('Relative Validation Accuracy', rotation=270,
                         labelpad=18, fontsize = 16)
            cb.update_normal(cf)
    except:
        #print("plot_glm_results exception -- no frame")
        pass

from py3nvml.py3nvml import *
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import seaborn as sns
sns.set_style("whitegrid")
import psutil
import numpy as np
import pylab as pl
from IPython import display
import matplotlib.gridspec as gridspec
from matplotlib.colors import ListedColormap
import sys
import subprocess
maxNGPUS = int(subprocess.check_output("nvidia-smi -L | wc -l", shell=True))
print("Maximum Number of GPUS:", maxNGPUS)

nvmlInit()
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nvmlDeviceGetHandleByIndex(i)
    #print("Device {}: {}".format(i, nvmlDeviceGetName(handle)))
    #print("Driver Version:", nvmlSystemGetDriverVersion())

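# py3nvml wraps NVIDIA's NVML library; the animation loop below polls
# nvmlDeviceGetUtilizationRates() to plot per-GPU utilization, while `nvidia-smi -L`
# is used only to count the GPUs available to the solver.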
import os
def RunAnimation(arg):
    deviceCount = arg
    file = os.getcwd() + "/rmse.txt"
    fig = pl.figure(figsize = (9,9))
    pl.rcParams['xtick.labelsize'] = 14
    pl.rcParams['ytick.labelsize'] = 14
    gs = gridspec.GridSpec(3, 2, wspace=0.3, hspace=0.4)
    ax1 = pl.subplot(gs[0,-2])
    ax2 = pl.subplot(gs[0,1])
    ax3 = pl.subplot(gs[1:,:])
    fig.suptitle('H2O.ai Machine Learning $-$ Generalized Linear Modeling', size=18)

    pl.gcf().subplots_adjust(bottom=0.2)

    cb = False
    os.system("mkdir -p images")
    i=0
    while(True):
        try:
            #cpu
            snapshot = psutil.cpu_percent(percpu=True)
            cpu_labels = range(1,len(snapshot)+1)
            plot_cpu_perf(ax1, cpu_labels, snapshot)

            #gpu
            gpu_snapshot = []
            gpu_labels = list(range(1,deviceCount+1))
            for j in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(j)
                util = nvmlDeviceGetUtilizationRates(handle)
                gpu_snapshot.append(util.gpu)
            plot_gpu_perf(ax2, gpu_labels, gpu_snapshot)

            res = pd.read_csv(file, sep=r"\s+", header=None,
                              names=['time','pass','fold','a','i','alpha','lambda','trainrmse','ivalidrmse','validrmse'])

            res['rel_acc'] = ((38000 - res['validrmse'])/(38000-28000))
            res['alpha_prime'] = res['alpha'] + res['fold'].apply(lambda x: new_alpha(x))

            best = res.ix[res['rel_acc']==np.max(res['rel_acc']),:]
            plot_glm_results(ax3, res, best.tail(1), cb)
            # flag for colorbar to avoid redrawing
            cb = True

            # Add footnotes
            footnote_text = "*U.S. Census dataset (predict Income): 45k rows, 10k cols\nParameters: 5-fold cross-validation, " + r'$\alpha = \{\frac{i}{7},i=0\ldots7\}$' + ", " \
                            'full $\lambda$-' + "search"
            #pl.figtext(.05, -.04, footnote_text, fontsize = 14,)
            pl.annotate(footnote_text, (0,0), (-30, -50), fontsize = 12,
                        xycoords='axes fraction', textcoords='offset points', va='top')

            #update the graphics
            display.display(pl.gcf())
            display.clear_output(wait=True)
            time.sleep(0.01)

            #save the images
            saveimage=0
            if saveimage:
                file_name = './images/glm_run_%04d.png' % (i,)
                pl.savefig(file_name, dpi=200)
                i=i+1

        except KeyboardInterrupt:
            break
        #except:
        #    #print("Could not Create Frame")
        #    pass

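# RunAnimation() is a polling loop: the GLM solver appends per-model results (including
# validation RMSE) to rmse.txt as it trains, and this loop keeps re-reading that file,
# converting RMSE to a relative-accuracy score, and redrawing the CPU/GPU utilization
# bars and the alpha/lambda scatter plot until it is interrupted.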



intercept = 1
standardize = 0
n = train_data_mat.shape[1]
mTrain = train_data_mat.shape[0]
if FRACTION<1.0:
    mValid = test_data_mat.shape[0]
else:
    mValid = 0
print("n=%d mTrain=%d mValid=%d" % (n, mTrain, mValid))
# Order of data
fortran = 1
print("fortran=%d" % (fortran))
result = {df[k].dtype for k in df.columns}
print(result)
print(fortran)
if result.pop() == np.dtype('float64'):
    print("double precision")
    precision=1
else:
    print("single precision")
    precision=0

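# Note: result is the set of column dtypes and pop() removes one arbitrary element, so
# this check only picks double precision reliably because every column is expected to
# share the same (float64) dtype after the preprocessing above.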
def RunH2Oaiglm(arg):
    intercept, standardize, lambda_min_ratio, nFolds, nAlphas, nLambdas, nGPUs = arg

    # set solver cpu/gpu according to input args
    if((nGPUs>0) and (h2oaiglm.ElasticNetSolverGPU is None)):
        print("\nGPU solver unavailable, using CPU solver\n")
        nGPUs=0

    sharedA = 0
    sourceme = 0
    sourceDev = 0
    nThreads = 1 if(nGPUs==0) else nGPUs  # one thread per GPU is optimal; on CPU two threads would be slightly better, but one works

    #print("Setting up Solver")
    os.system("rm -f rmse.txt ; touch rmse.txt ; rm -f varimp.txt ; touch varimp.txt")
    Solver = h2oaiglm.ElasticNetSolverGPU if(nGPUs>0) else h2oaiglm.ElasticNetSolverCPU


    # Solver = h2oaiglm.ElasticNetSolverCPU
    assert Solver is not None, "Couldn't instantiate ElasticNetSolver"
    enet = Solver(sharedA, nThreads, nGPUs, 'c' if fortran else 'r', intercept, standardize, lambda_min_ratio, nLambdas, nFolds, nAlphas)

    # Not using weights
    e = c_void_p(0)

    print("Solving")
    ## Solve
    t0 = time.time()
    print("vars: %d %d %d %d %d %d %d" % (sourceDev, mTrain, n, mValid, intercept, standardize, precision))
    enet.fit(sourceDev, mTrain, n, mValid, intercept, standardize, precision, a, b, c, d, e)
    t1 = time.time()
    print("Done Solving")
    print("Time to train H2O AI GLM: %r" % (t1-t0))



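# enet.fit() receives the raw device pointers prepared earlier (a/b for training, c/d
# for validation, e for the unused observation weights) along with the matrix dimensions
# and precision flag, then sweeps nAlphas x nLambdas elastic-net models across nFolds
# cross-validation folds on the requested GPUs.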
lambda_min_ratio=1E-9
nFolds=5
nAlphas=8
nLambdas=100
nGPUs=maxNGPUS  # choose all GPUs

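# 8 alphas x 100 lambdas x 5 folds gives up to 4000 elastic-net models per run, which
# matches the 4000-model ceiling used in the scatter-plot title above.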
if run==1 and anim==0:
    # Run Model
    arg = intercept, standardize, lambda_min_ratio, nFolds, nAlphas, nLambdas, nGPUs
    RunH2Oaiglm(arg)

if run==1 and anim==1:
    from threading import Thread

    # Run Model
    arg = intercept, standardize, lambda_min_ratio, nFolds, nAlphas, nLambdas, nGPUs
    background_thread = Thread(target=RunH2Oaiglm, args=(arg,))
    background_thread.start()



if run==1 and anim==1:
    # Show Animation
    arg = nGPUs
    RunAnimation(arg)
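# With animation enabled, training runs on a background thread while RunAnimation()
# redraws the dashboard in the foreground; interrupt the animation cell
# (KeyboardInterrupt) to stop the live plot once training has finished.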