# Advertisement
# Not a member of Pastebin yet?
# Sign Up, it unlocks many cool features!
## Libraries ##
# Importing/Prepping Data
using HTTP, JSONTables, DataFrames
# Symbolic Regression
using SymbolicRegression, MLJ
## Load Data ##
# JSON endpoint for the dataset. The `raw` string literal keeps `$limit` as literal
# text in the query string instead of treating it as string interpolation.
url = raw"https://data.transportation.gov/resource/yj5y-b2ir.json?$limit=600000";
# Download the response; its body is parsed in the prep step below.
dat = HTTP.get(url);
## Prep Data ##
# Parse the JSON response body into a DataFrame
df0 = DataFrame(jsontable(dat.body));
# Keep columns 2:5 and 8:16 (drops string/etc columns), then discard every row that
# contains a missing value (~500 of 600k rows).
# Narrower selection tried previously: [2:5; 8:10; 12:13; 15:16]
# `disallowmissing = false` leaves the column element types as they were.
df = dropmissing(df0[!, [2:5; 8:16]]; disallowmissing = false);
# One-hot encode categorical features
"""
    onehot(x)

Return a `length(x) × k` Boolean indicator matrix for the vector `x`, where `k` is the
number of distinct values in `x`. Column `j` marks the rows equal to `unique(x)[j]`, so
columns follow order of first appearance and every row contains exactly one `true`.

Values are compared with `isequal`, the same notion of equality `unique` uses, so
`missing` and `NaN` each get a proper column instead of `missing` entries or
all-`false` rows.
"""
onehot(x) = isequal.(x, permutedims(unique(x)));
# Only the two carrier columns are one-hot encoded. Also encoding the city market ids
# led to 700+ features; that variant is kept here for reference:
# tmp = DataFrame(hcat(onehot(df.citymarketid_1), onehot(df.citymarketid_2),
#                      onehot(df.carrier_lg), onehot(df.carrier_low)), :auto);
# tmp_names = ["id1".*unique(df.citymarketid_1); "id2".*unique(df.citymarketid_2);
#              "lg".*unique(df.carrier_lg); "low".*unique(df.carrier_low)];
# Column names: one per distinct carrier, prefixed by the column it came from, in the
# same order as the indicator columns produced by `onehot`.
tmp_names = vcat("lg" .* unique(df.carrier_lg), "low" .* unique(df.carrier_low));
# Indicator matrix for both carrier columns, named directly at construction
tmp = DataFrame(
    hcat(onehot(df.carrier_lg), onehot(df.carrier_low)),
    Symbol.(tmp_names),
);
# Remove original categorical features
df = df[!, [1:2; 5:7; 9:10; 12:13]];
# Convert every remaining column to Float32. Iterating over the column names (rather
# than a fixed 1:9) keeps the loop in step with the selection above if it changes.
# NOTE: `parse` needs string input, so this assumes all columns kept above hold text.
for col in names(df)
    df[!, col] = parse.(Float32, df[!, col]);
end
# Add back one-hot categorical features
df = hcat(df, Float32.(tmp))
# Target: the fare column
y = df[!, :fare];
# Features: everything except the target and the high/low fare columns
X = select(df, Not([:fare, :fare_lg, :fare_low]));
## Define Model ##
# Option reference:
#   https://astroautomata.com/SymbolicRegression.jl/stable/api/#Options
# Operator definitions:
#   https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/Operators.jl
model = SRRegressor(;
    # Operators the search may combine
    binary_operators = [+, -, *, /, ^, greater, mod, logical_and, logical_or],
    unary_operators = [
        neg, square, cube, exp, abs, log, log10, log2, log1p, sqrt,
        sin, cos, tan, sinh, cosh, tanh, atan, asinh, acosh, atanh_clip,
        erf, erfc, relu, round, floor, ceil, sign,
    ],
    # Complexity settings (operator complexity defaults to 1; spelled out here for the
    # arithmetic operators)
    complexity_of_operators = [
        (+) => 1.0, (-) => 1.0, (*) => 1.0, (/) => 1.0, (^) => 1.0,
    ],
    complexity_of_constants = 1.0,
    complexity_of_variables = 1.0,
    maxsize = 20,
    warmup_maxsize_by = 0.5,
    parsimony = 0.0032,
    adaptive_parsimony_scaling = 20.0,
    # Loss and batching
    elementwise_loss = L2DistLoss(),
    batching = true,
    batch_size = 100,
    # Search settings
    niterations = 50,
    populations = 100,
    population_size = 33,
    ncycles_per_iteration = 550,
    alpha = 0.1,
    fraction_replaced = 0.00036,
    fraction_replaced_hof = 0.035,
    crossover_probability = 0.066,
    # Miscellaneous
    turbo = false,
    print_precision = 2,
    should_simplify = true,
)
## Run Symbolic Regression ##
# Bind the model to the features/target and train it; `fit!` returns the machine.
mach = fit!(machine(model, X, y))
# Advertisement
# Add Comment
# Please sign in to add a comment
# Advertisement