Advertisement
Guest User

Untitled

a guest
Dec 9th, 2023
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.97 KB | None | 0 0
  1. ## Libraries ##
  2. # Importing/Prepping Data
  3. using HTTP, JSONTables, DataFrames
  4.  
  5. # Symbolic Regression
  6. using SymbolicRegression, MLJ
  7.  
  8.  
  9.  
  10. ## Load Data ##
  11. url = raw"https://data.transportation.gov/resource/yj5y-b2ir.json?$limit=600000";
  12. dat = HTTP.get(url);
  13.  
  14.  
  15.  
  16. ## Prep Data ##
  17. # Convert to DF
  18. df0 = DataFrame(jsontable(dat.body));
  19.  
  20. # Remove string/etc columns
  21. # df = df0[!, [2:5; 8:10; 12:13; 15:16]];
  22. df = df0[!, [2:5; 8:16]];
  23.  
  24.  
  25. # Remove any rows with missing values (~500/600k)
  26. df = df[completecases(df), :];
  27.  
  28.  
  29. # One-hot encode categorical features
  30. onehot(x) = transpose(unique(x) .== permutedims(x));
  31.  
  32. # Including one-hot city id lead to 700+ features
  33. # tmp = DataFrame(hcat(onehot(df.citymarketid_1), onehot(df.citymarketid_2),
  34. # onehot(df.carrier_lg), onehot(df.carrier_low)), :auto);
  35. # tmp_names = ["id1".*unique(df.citymarketid_1); "id2".*unique(df.citymarketid_2);
  36. # "lg".*unique(df.carrier_lg); "low".*unique(df.carrier_low)];
  37.  
  38. tmp = DataFrame(hcat(onehot(df.carrier_lg), onehot(df.carrier_low)), :auto);
  39. tmp_names = ["lg".*unique(df.carrier_lg); "low".*unique(df.carrier_low)];
  40. rename!(tmp, Symbol.(tmp_names));
  41.  
  42. # Remove original categorical features
  43. df = df[!, [1:2; 5:7; 9:10; 12:13]];
  44.  
  45. # Convert columns to Float32
  46. for i in 1:9
  47. df[!, i]= parse.(Float32, df[!, i]);
  48. end
  49.  
  50. # Add back one-hot categorical features
  51. df = hcat(df, Float32.(tmp))
  52.  
  53. # Extract Fare column to use as target
  54. y = df.fare;
  55. X = df[:, Not(:fare)];
  56.  
  57. # Remove high/low fare columns
  58. X = X[:, Not(:fare_lg, :fare_low)];
  59.  
  60.  
  61.  
  62. ## Define Model ##
  63. # Operator complexity defaults to 1
  64. # https://astroautomata.com/SymbolicRegression.jl/stable/api/#Options
  65. # https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/Operators.jl
  66. model = SRRegressor(
  67. niterations = 50,
  68. elementwise_loss = L2DistLoss(),
  69. binary_operators = [+, -, *, /, ^, greater, mod, logical_and, logical_or],
  70. unary_operators = [neg, square, cube, exp, abs, log, log10, log2, log1p, sqrt, sin,
  71. cos, tan, sinh, cosh, tanh, atan, asinh, acosh, atanh_clip, erf,
  72. erfc, relu, round, floor, ceil, sign],
  73. complexity_of_operators = [(+) => 1.0, (-) => 1.0, (*) => 1.0,
  74. (/) => 1.0, (^) => 1.0],
  75. complexity_of_constants = 1.0,
  76. complexity_of_variables = 1.0,
  77. populations = 100,
  78. alpha = 0.1,
  79. maxsize = 20,
  80. parsimony = 0.0032,
  81. adaptive_parsimony_scaling = 20.0,
  82. population_size = 33,
  83. ncycles_per_iteration = 550,
  84. fraction_replaced = 0.00036,
  85. fraction_replaced_hof = 0.035,
  86. crossover_probability = 0.066,
  87. batching = true,
  88. batch_size = 100,
  89. warmup_maxsize_by = 0.5,
  90. turbo = false,
  91. print_precision = 2,
  92. should_simplify = true
  93. )
  94.  
  95.  
  96.  
  97. ## Run Symbolic Regression ##
  98. mach = machine(model, X, y)
  99. fit!(mach)
  100.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement