Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- namespace HousePrices {
- using System;
- using System.Collections.Generic;
- using System.Data;
- using System.Diagnostics;
- using System.IO;
- using System.Linq;
- using CsvHelper;
- using numpy;
- using SharPy.Runtime;
- using tensorflow;
- using tensorflow.keras;
- using tensorflow.keras.callbacks;
- using tensorflow.keras.layers;
- using tensorflow.train;
- static class HousePricesProgram {
/// <summary>
/// Trains a small dense network on <c>train.csv</c> to predict <c>SalePrice</c> and writes
/// its predictions for <c>test.csv</c> to <c>submit.csv</c> (header <c>Id,SalePrice</c>).
/// </summary>
static void Main() {
    const string TrainFile = "train.csv";
    DataTable trainData = LoadData(TrainFile);
    // "Id" only identifies a row; drop it so it is not used as a feature.
    trainData.Columns.Remove("Id");
    var columns = trainData.Columns.Cast<DataColumn>();
    var rows = trainData.Rows.Cast<DataRow>();
    // With primaryChance: 1 every row lands in trainRows and testRows stays empty;
    // the held-out evaluation further down is commented out accordingly.
    var (trainRows, testRows) = rows.RandomSplit(primaryChance: 1);

    // Per-column statistics plus the encoder that turns a raw cell into network inputs.
    // Ordering by (distinct count, name) fixes the feature order used for every row.
    var columnTypes = columns.Select(column => {
        var values = rows.Select(row => (string)row[column]);
        var trainValues = trainRows.Select(row => (string)row[column]);
        var testValues = testRows.Select(row => (string)row[column]);
        double floats = values.Percentage(v => double.TryParse(v, out _));
        double ints = values.Percentage(v => int.TryParse(v, out _));
        int distincts = values.Distinct().Count();
        var normalizer = ValueNormalizer(floats, values);
        return new { column, values, distincts, ints, floats, normalizer, trainValues, testValues };
    }).OrderBy(c => c.distincts).ThenBy(c => c.column.ColumnName)
      .ToArray();

    const string predict = "SalePrice";
    // The set of feature columns never changes, so select it once instead of once per row.
    var featureColumns = columnTypes.Where(c => c.column.ColumnName != predict).ToArray();

    // Encodes every row as the concatenation of its per-column encodings.
    // A column that is absent from the row's table is encoded from the placeholder "-1".
    ndarray GetInputs(IEnumerable<DataRow> rowSeq) {
        return np.array(rowSeq.Select(row => np.array(
                featureColumns
                    .SelectMany(column => column.normalizer(
                        row.Table.Columns.Contains(column.column.ColumnName)
                            ? (string)row[column.column.ColumnName]
                            : "-1"))
                    .ToArray()))
            .ToArray());
    }

    var predictColumn = columnTypes.Single(c => c.column.ColumnName == predict);
    // Unparsable targets are replaced by -1.
    ndarray trainOutputs = np.array(predictColumn.trainValues.AsDouble().Select(v => v ?? -1).ToArray());
    ndarray trainInputs = GetInputs(trainRows);
    //ndarray testOutputs = np.array(predictColumn.testValues.AsDouble().Select(v => v ?? -1).ToArray());
    //ndarray testInputs = GetInputs(testRows);
    //Debug.Assert(testOutputs.Length == testInputs.Length);
    //Debug.Assert(testOutputs.Length > 20);
    Debug.Assert(trainOutputs.Length == trainInputs.Length);
    Debug.Assert(trainOutputs.Length > 20);
    //Debug.Assert(trainOutputs.Length != testOutputs.Length);

    // NOTE(review): the output layer also uses relu, so predictions cannot be negative — confirm intended.
    var model = new Sequential(new Layer[] {
        new Dense(units: 16, activation: tf.nn.relu_fn),
        new Dropout(rate: 0.1),
        new Dense(units: 10, activation: tf.nn.relu_fn),
        new Dense(units: 1, activation: tf.nn.relu_fn),
    });
    model.compile(optimizer: new AdamOptimizer(), loss: "mean_squared_error");
    //var tensorboard = new TensorBoard(log_dir: $"./logs/{DateTime.Now.ToString("s").Replace(':','-')}");
    model.fit(trainInputs, trainOutputs, epochs: 20000, validation_split: 0.075, verbose: 2);

    const string SubmissionInputFile = "test.csv";
    DataTable submissionData = LoadData(SubmissionInputFile);
    var submissionRows = submissionData.Rows.Cast<DataRow>();
    ndarray submissionInputs = GetInputs(submissionRows);
    ndarray submissionOutputs = model.predict(submissionInputs);

    Console.WriteLine("guesses:");
    var random = new Random();
    // Disposing the writer flushes it, so no explicit Flush() is needed.
    using (var writer = new StreamWriter("submit.csv")) {
        writer.WriteLine("Id,SalePrice");
        foreach (var (id, prediction) in submissionRows.Select(row => int.Parse((string)row["Id"]))
                                                       .Pair(submissionOutputs.Cast<ndarray>())) {
            string guess = $"{id},{prediction[0]}";
            writer.WriteLine(guess);
            // Echo roughly 1% of the guesses. Random.Next(100) yields 0..99, so the former
            // "> 99" test could never succeed and nothing was ever echoed.
            if (random.Next(100) == 0)
                Console.WriteLine(guess);
        }
    }

    // Disabled diagnostics, kept for reference.
    //float64 trainLoss = model.evaluate(trainInputs, trainOutputs);
    //float64 testLoss = model.evaluate(testInputs, testOutputs);
    //Console.WriteLine($"Test loss: {(int)Math.Sqrt(testLoss)}; Train loss: {(int)Math.Sqrt(trainLoss)}");
    //Console.WriteLine();
    //foreach (var column in columnTypes)
    // Console.WriteLine($"{column.column.ColumnName}: {column.distincts} values, ints: {column.ints:P2}, floats: {column.floats:P2}");
    //Console.WriteLine();
    //Console.WriteLine("Many value columns:");
    //foreach (var column in columnTypes.Where(ct => ct.distincts > 10 && ct.floats < 0.01)) {
    // Console.Write(column.column.ColumnName + ": ");
    // Console.WriteLine(string.Join(", ", column.values.Distinct().OrderBy(n => n)));
    //}
    //Console.WriteLine();
    //Console.WriteLine("non-parsable floats");
    //foreach (var column in columnTypes.Where(ct => ct.floats > 0 && ct.floats < 1)) {
    // Console.Write(column.column.ColumnName + ": ");
    // Console.WriteLine(string.Join(", ", column.values.Where(v => !double.TryParse(v, out _)).Distinct().OrderBy(n => n)));
    //}
    //Console.WriteLine();
    //Console.WriteLine("float ranges:");
    //foreach (var column in columnTypes.Where(ct => ct.floats > 0.01)) {
    // Console.Write(column.column.ColumnName + ": ");
    // var validValues = column.values.AsDouble().Where(v => v != null).Select(v => v.Value);
    // Console.WriteLine($"{validValues.Min()}...{validValues.Max()}");
    //}
    //Console.WriteLine();
}
/// <summary>
/// Walks two sequences in lockstep and yields their elements as tuples.
/// </summary>
/// <param name="seq1">Sequence supplying the first tuple element.</param>
/// <param name="seq2">Sequence supplying the second tuple element.</param>
/// <returns>The tuples produced by <see cref="Enumerable.Zip{TFirst,TSecond,TResult}"/>.</returns>
static IEnumerable<(T1, T2)> Pair<T1, T2>(this IEnumerable<T1> seq1, IEnumerable<T2> seq2) {
    return Enumerable.Zip(seq1, seq2, (left, right) => (left, right));
}
/// <summary>
/// Distributes the items of a sequence over two lists by an independent random draw per item.
/// </summary>
/// <param name="seq">Items to distribute; relative order is preserved inside each list.</param>
/// <param name="primaryChance">
/// Probability that an item goes to the first list. A value of 1 or more sends everything
/// to the first list; 0 or less sends everything to the second.
/// </param>
/// <param name="random">
/// Optional random source. Pass a seeded instance to get a reproducible split;
/// when omitted a fresh <see cref="Random"/> is created.
/// </param>
/// <returns>The (primary, secondary) lists.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is null.</exception>
static (List<T>, List<T>) RandomSplit<T>(this IEnumerable<T> seq, double primaryChance, Random random = null) {
    if (seq == null)
        throw new ArgumentNullException(nameof(seq));

    random = random ?? new Random();
    var primary = new List<T>();
    var secondary = new List<T>();
    foreach (T item in seq) {
        // NextDouble() is in [0, 1), so the comparison is true with probability primaryChance.
        List<T> target = random.NextDouble() < primaryChance ? primary : secondary;
        target.Add(item);
    }
    return (primary, secondary);
}
/// <summary>
/// Reads a CSV file into a <see cref="DataTable"/> through CsvHelper's data reader.
/// </summary>
/// <param name="csvFilePath">Path of the CSV file to read.</param>
/// <returns>A table filled by <see cref="DataTable.Load(IDataReader)"/>.</returns>
static DataTable LoadData(string csvFilePath) {
    var result = new DataTable();
    // Each layer gets its own using so the file handle is released even when a later
    // constructor in the chain throws; disposing an already-disposed layer is harmless.
    using (var file = new StreamReader(csvFilePath))
    using (var csv = new CsvReader(file))
    using (var reader = new CsvDataReader(csv)) {
        result.Load(reader);
    }
    return result;
}
/// <summary>
/// Lazily converts each string to a double, yielding null where
/// <see cref="double.TryParse(string, out double)"/> rejects the text.
/// </summary>
/// <param name="seq">Texts to convert.</param>
/// <returns>One nullable double per input element, in input order.</returns>
static IEnumerable<double?> AsDouble(this IEnumerable<string> seq) {
    foreach (string text in seq) {
        if (double.TryParse(text, out double parsed))
            yield return parsed;
        else
            yield return null;
    }
}
/// <summary>
/// Builds the encoder that turns one raw cell of a column into network input values.
/// </summary>
/// <param name="floats">Fraction of the column's values that parse as a double.</param>
/// <param name="values">All raw values of the column; used to derive the scale or the category list.</param>
/// <returns>
/// When <paramref name="floats"/> exceeds 0.01: a function returning a one-element array holding the
/// parsed value divided by the column maximum, or -1 for unparsable text.
/// Otherwise: a one-hot encoder over the sorted distinct values, whose slot 0 marks a value
/// that does not occur in <paramref name="values"/>.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The numeric branch was selected but no value in <paramref name="values"/> parses as a double.
/// </exception>
static Func<string, double[]> ValueNormalizer(double floats, IEnumerable<string> values) {
    if (floats > 0.01) {
        double? largest = values
            .Select(text => double.TryParse(text, out double parsed) ? parsed : (double?)null)
            .Max();
        if (largest == null)
            throw new InvalidOperationException("Column has no numeric values to derive a scale from.");

        // A maximum of 0 would turn v / max into NaN or an infinity; leave such columns unscaled.
        double scale = largest.Value == 0 ? 1 : largest.Value;
        return s => new[] { double.TryParse(s, out double v) ? v / scale : -1 };
    } else {
        string[] domain = values.Distinct().OrderBy(v => v).ToArray();
        return s => {
            // IndexOf returns -1 for an unseen value, which lands in slot 0.
            var encoded = new double[domain.Length + 1];
            encoded[Array.IndexOf(domain, s) + 1] = 1;
            return encoded;
        };
    }
}
/// <summary>
/// Computes the fraction (0..1) of elements that satisfy a predicate.
/// </summary>
/// <param name="seq">Elements to examine; enumerated exactly once.</param>
/// <param name="predicate">Condition counted as a match.</param>
/// <returns>
/// Matching count divided by total count, or 0 for an empty sequence
/// (the plain division would yield NaN there).
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="seq"/> or <paramref name="predicate"/> is null.
/// </exception>
static double Percentage<T>(this IEnumerable<T> seq, Func<T, bool> predicate) {
    if (seq == null)
        throw new ArgumentNullException(nameof(seq));
    if (predicate == null)
        throw new ArgumentNullException(nameof(predicate));

    int total = 0;
    int matching = 0;
    foreach (T item in seq) {
        total++;
        if (predicate(item))
            matching++;
    }
    return total == 0 ? 0 : (double)matching / total;
}
/// <summary>
/// Stores a value in an array slot and hands the same array back, so the call can be chained.
/// </summary>
/// <param name="array">Array to modify in place.</param>
/// <param name="index">Slot to overwrite.</param>
/// <param name="value">Value to store.</param>
/// <returns>The <paramref name="array"/> instance that was passed in.</returns>
static T[] Set<T>(this T[] array, int index, T value) {
    array[index] = value;

    return array;
}
/// <summary>
/// Writes a dictionary as <c>{ key: value, key: value }</c>. Dictionaries with more than
/// 10 entries are written with one indented entry per line.
/// </summary>
/// <param name="writer">Destination of the text.</param>
/// <param name="dict">Entries to print, in the dictionary's enumeration order.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="writer"/> or <paramref name="dict"/> is null.
/// </exception>
static void PrettyPrint<TKey, TValue>(TextWriter writer, IReadOnlyDictionary<TKey, TValue> dict) {
    if (writer == null)
        throw new ArgumentNullException(nameof(writer));
    if (dict == null)
        throw new ArgumentNullException(nameof(dict));

    bool multiline = dict.Count > 10;
    // Both branches of the original prefix were a single space; a deeper indent for the
    // one-entry-per-line layout is assumed to be what was meant.
    string prefix = multiline ? "    " : " ";

    writer.Write('{');
    bool first = true;
    foreach (var entry in dict) {
        // Separators go between entries only, so the last entry is not followed by a comma.
        if (!first)
            writer.Write(',');
        if (multiline)
            writer.WriteLine();
        writer.Write(prefix);
        writer.Write(entry.Key);
        writer.Write(": ");
        writer.Write(entry.Value);
        first = false;
    }
    if (multiline)
        writer.WriteLine();
    else if (!first)
        writer.Write(' ');
    writer.Write('}');
}
/// <summary>
/// Tallies how many times each distinct element occurs in a sequence.
/// </summary>
/// <param name="seq">Elements to tally.</param>
/// <returns>A dictionary mapping each distinct element to its number of occurrences.</returns>
static IReadOnlyDictionary<T, int> Counts<T>(this IEnumerable<T> seq) {
    var tally = new Dictionary<T, int>();
    foreach (T element in seq) {
        // A first sighting starts at 1; later sightings add to the stored count.
        tally[element] = tally.TryGetValue(element, out int seen) ? seen + 1 : 1;
    }
    return tally;
}
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement