Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <vector>
- #include <iostream>
- #include <unordered_map>
- #include <cmath>
- #include <string.h>
- /* Copyright 2015. GPLv3. */
- /*
- * compile like:
- * g++ -std=c++11 safe.cpp -o a.out
- *
- * fetch data and run like:
- * wget --quiet http://www.retrosheet.org/gamelogs/gl2014.zip
- * unzip -p gl2014.zip | cut -d, -f10,11,12,20,21 | tr -d '"' | sed 's/x$/0/' | grep -E '[[:digit:]]+,[[:digit:]]+$' | ./a.out -p 50 | sort -n > 2014.csv
- * open the csv in your favorite spreadsheet, add an erf(z) column if
- * you want, and graph the results
- */
- /*
- * this program deals with half-innings, because that's easy to get
- * from box score data; we'll just try to get a formula that matches
- * the curve, so we can interpolate safety values from that formula
- * for mid-inning safety.
- * That is why all 'outs' numbers in here are multiples of 3.
- */
- /*
- * detailed instructions to parse input:
- *
- * get away score, home score, outs, away box, home box:
- * - download game logs from retrosheet.org
- * - http://www.retrosheet.org/gamelogs/index.html
- * - replace final-half-inning 'x' with '0'
- * - remove error lines (quoted commas) (2014 had 5 such lines)
- * cut -d, -f10,11,12,20,21 GL2014.TXT | tr -d '"' | sed 's/x$/0/' | grep -E '[[:digit:]]+,[[:digit:]]+$' > box.txt
- *
- * wc box.txt
- * - 2430 games in a season, so this lets you figure out how many invalid lines you lost
- *
- * gnumeric <(cat box.txt | ./a.out | sort -n)
- * - create an e column that is erf(z), so e.g. e2 is:
- * =erf(a2)
- * - graph columns a (Z value) and d (percentage of times that Z was a
- *   safe lead) and, if you created it, e
- * - the actual safety percentages should be somewhat similar to erf(z)
- */
- /*
- * the algorithms:
- *
- * z = L/sqrt(4*D*t)
- * - L = lead (run differential)
- * - D
- * - derived from games I've witnessed = 0.08835978835978836
- * - but I should use this program to get D for an entire MLB year
- * - D = 1/(2*(outs/ppg))
- * - 2014: 0.07552
- * - 2000-2014: 0.0853419
- * - t = outs left
- *
- * so what I want to do is, for every non-0 L value from every half-inning
- * - get z
- * - record whether or not that z was safe (outs made so far >= safe_outs)
- * then I can graph erf(z) and see if I get a useful curve
- *
- */
// Game holds all the data from one game:
// - an Inning vector, holding the away and home runs scored in each inning
// - the final score for each team
// - the number of outs in the game (usually 51 (8.5 innings) or 54 (9 full
//   innings))
// - safe_outs: the number of outs that had been made when the identity of the
//   leading team last changed (a multiple of 3, since we don't analyze
//   anything finer than half-innings)
// - a 'valid' bool saying whether or not the data is internally consistent
// There are a bunch of functions to
// - parse()       - accept an input line built from retrosheet.org's
//                   10,11,12,20,21 columns
// - scanGame()    - write to a hash table the z value of every half-inning
//                   lead and whether that half-inning came at or after
//                   safe_outs
// - setSafety()   - figure out when the last lead change happened
// - assertInput() - ensure internal consistency of data
// - print()       - echo the box score back out (debugging aid)
class Game {
private:
    // Runs scored by each side in one full inning.
    class Inning {
    public:
        Inning() : away(0), home(0) {}
        Inning(int a, int h) : away(a), home(h) {}
        int away;
        int home;
    };

    enum {
        OUTS_PER_HALF = 3,     // outs in one half-inning
        REGULATION_OUTS = 54   // 9 innings x 2 halves x 3 outs
    };

public:
    Game() : awayF(0), homeF(0), outs(0), safe_outs(0), valid(false) {}

    // Add one inning to the end of the game.
    void append(Inning i) { innings.push_back(i); }
    void append(int a, int h) { append(Inning(a, h)); }
    // Convenience overload for single ASCII digits ('0'..'9').
    void append(char a, char h) { append(a - '0', h - '0'); }

    // Walk the game half-inning by half-inning and, for every half-inning that
    // ends with one team ahead, bump the counters stored under that lead's z
    // value in 'mymap': .first when the half-inning ended at or after
    // safe_outs, .second otherwise.
    //
    // mymap - z value -> (safe count, unsafe count); updated in place
    // D     - the diffusion coefficient used in z = L/sqrt(4*D*t)
    void scanGame(std::unordered_map<float, std::pair<int, int>>& mymap, float D) {
        int a = 0, h = 0, o = 0;
        for (const Inning& inning : innings) {
            a += inning.away;
            o += OUTS_PER_HALF;
            // In extra innings there is no fixed end of the game, so the top
            // half is scored as if exactly one half-inning (3 outs) remained.
            tally(mymap, D, a - h, o, o <= REGULATION_OUTS ? REGULATION_OUTS - o : OUTS_PER_HALF);

            h += inning.home;
            o += OUTS_PER_HALF;
            // From the bottom of the 9th on, nothing is recorded for the home
            // half: the regulation clock has run out (it would give t <= 0).
            if (o >= REGULATION_OUTS) {
                continue;
            }
            tally(mymap, D, a - h, o, REGULATION_OUTS - o);
        }
    }

    // Set safe_outs to the number of outs made when the leading team last
    // changed. A tie does not reset the current leader, so a team that is
    // caught and then goes back ahead keeps its original safe_outs. Does
    // nothing when the game is not valid; yields 0 if nobody ever led.
    void setSafety() {
        if (!valid) return;
        enum leader {
            NONE,
            AWAY,
            HOME
        };
        safe_outs = 0;
        int curA = 0;
        int curH = 0;
        int curO = 0; // outs made so far
        leader curL = NONE;
        for (const Inning& inning : innings) {
            curO += OUTS_PER_HALF;
            curA += inning.away;
            if ((curA > curH) && (curL != AWAY)) {
                curL = AWAY;
                safe_outs = curO;
            }
            curO += OUTS_PER_HALF;
            curH += inning.home;
            if ((curH > curA) && (curL != HOME)) {
                curL = HOME;
                safe_outs = curO;
            }
        }
    }

    // make sure we have at least 9 innings, and the final score
    // matches the sum of the scores through the innings.
    // Returns false (after writing a diagnostic to stderr) otherwise.
    bool assertInput() {
        if (innings.size() < 9) {
            std::cerr << "too few innings" << std::endl;
            return false;
        }
        int af = awayF, hf = homeF;
        for (const Inning& inning : innings) {
            af -= inning.away;
            hf -= inning.home;
        }
        if (0 != af || 0 != hf) {
            // af/hf are what is left of each final score after subtracting
            // every inning; print them separated so they can be read.
            std::cerr << "final score did not match: away " << af
                      << ", home " << hf << std::endl;
            return false;
        }
        return true;
    }

    // parse a line that looks like:
    // away final, home final, number of outs in the game, away box score, home box score
    // where a box score looks like '0123456789' which means 0 runs in
    // the first inning, 1 in the second, 2 in the third, 3 in the
    // fourth.... An inning of 10 or more runs is written in parentheses,
    // e.g. '010000(10)00'.
    // Any previous contents of this Game are discarded. 's' is consumed.
    // If we don't have enough fields, a box score is unreadable, there are
    // fewer than 9 innings or the totals disagree, we'll return false.
    bool parse(std::string& s) {
        innings.clear();
        safe_outs = 0;

        std::string ia, ih;
        valid = getNext(s, &awayF) && getNext(s, &homeF) && getNext(s, &outs)
             && getNext(s, ia) && getNext(s, ih);

        if (valid) {
            std::vector<int> awayRuns, homeRuns;
            valid = parseLineScore(ia, awayRuns) && parseLineScore(ih, homeRuns);
            if (!valid) {
                std::cerr << "unreadable line score" << std::endl;
            } else {
                // If the two sides list a different number of innings, only
                // the common prefix is kept; assertInput() then rejects the
                // game if any runs were lost that way.
                const size_t count = awayRuns.size() < homeRuns.size()
                                         ? awayRuns.size() : homeRuns.size();
                for (size_t i = 0; i < count; ++i) {
                    append(awayRuns[i], homeRuns[i]);
                }
            }
        }
        if (valid) {
            valid = assertInput();
        }
        setSafety(); // no-op unless valid
        return valid;
    }

    // lame. but works.
    // Prints the away line, the home line (the winner's line is followed by
    // safe_outs in parentheses) and a separator. Prints nothing if !valid.
    void print() {
        if (!valid) return;
        for (const Inning& inning : innings) {
            printRuns(inning.away);
        }
        std::cout << " " << awayF;
        if (awayF > homeF) {
            std::cout << " (" << safe_outs << ")";
        }
        std::cout << '\n';
        for (const Inning& inning : innings) {
            printRuns(inning.home);
        }
        std::cout << " " << homeF;
        if (homeF > awayF) {
            std::cout << " (" << safe_outs << ")";
        }
        std::cout << '\n';
        std::cout << "=============" << std::endl;
    }

    std::vector<Inning> innings;
    int awayF;     // final score, visitors
    int homeF;     // final score, home team
    int outs;      // number of outs in the game
    int safe_outs; // game is considered safe after this number of outs
    bool valid;    // data is internally consistent

private:
    // Record one half-inning result. 'lead' is away minus home (0 = tied, not
    // recorded), 'outsMade' the outs completed so far, 'outsLeft' the value
    // used for t in z = L/sqrt(4*D*t).
    void tally(std::unordered_map<float, std::pair<int, int>>& mymap, float D,
               int lead, int outsMade, int outsLeft) const {
        if (lead == 0) return;
        const float z = abs(lead) / (sqrt(4 * D * outsLeft));
        // operator[] creates a (0, 0) entry the first time a z value is seen.
        std::pair<int, int>& counts = mymap[z];
        if (outsMade >= safe_outs) {
            ++counts.first;
        } else {
            ++counts.second;
        }
    }

    static bool isDigit(char c) { return c >= '0' && c <= '9'; }

    // Turn a box score such as "0102(12)000" into per-inning run counts.
    // Trailing whitespace (e.g. a '\r' from a CRLF file) is ignored. Returns
    // false on any other character, an empty or unterminated "(...)" group,
    // or a group of more than three digits.
    static bool parseLineScore(const std::string& box, std::vector<int>& runs) {
        runs.clear();
        size_t end = box.length();
        while (end > 0 && (box[end - 1] == '\r' || box[end - 1] == '\n' ||
                           box[end - 1] == ' ' || box[end - 1] == '\t')) {
            --end;
        }
        size_t i = 0;
        while (i < end) {
            if (isDigit(box[i])) {
                runs.push_back(box[i] - '0');
                ++i;
                continue;
            }
            if (box[i] != '(') {
                return false;
            }
            ++i; // skip '('
            int value = 0;
            int digits = 0;
            while (i < end && isDigit(box[i]) && digits < 3) {
                value = value * 10 + (box[i] - '0');
                ++i;
                ++digits;
            }
            if (digits == 0 || i >= end || box[i] != ')') {
                return false;
            }
            ++i; // skip ')'
            runs.push_back(value);
        }
        return true;
    }

    // Print one inning's runs; values of 10 or more are parenthesised so the
    // line stays unambiguous.
    static void printRuns(int runs) {
        if (runs > 9) {
            std::cout << "(" << runs << ")";
        } else {
            std::cout << runs;
        }
    }

    // 's' should look like: <int>[,...]
    // we'll put the int into 'i' and remove the '<int>[,]' from 's'.
    // Conversion is atoi(), so non-numeric text becomes 0.
    bool getNext(std::string& s, int* i) {
        std::string r;
        const bool ret = getNext(s, r);
        if (ret) {
            *i = atoi(r.c_str());
        }
        return ret;
    }

    // 's' should look like '<string>[,...]'
    // we'll put the <string> into 'r' and remove the '<string>[,]' from 's'.
    // Returns false only when 's' is already empty.
    bool getNext(std::string& s, std::string& r) {
        const size_t pos = s.find(DELIM);
        if (std::string::npos != pos) {
            r.assign(s, 0, pos);
            s.erase(0, pos + 1);
            return true;
        }
        if (!s.empty()) {
            // last field: no delimiter follows it
            r.assign(s);
            s.clear();
            return true;
        }
        return false;
    }

    static constexpr char DELIM = ','; // field separator of the input line
}; // class Game
- // D is the "diffusion coefficient" which for baseball is runs per
- // game divided by 2 times the number of outs per game:
- // D = 1/(2*(outs per game/runs per game))
- // actually D = (p/(1-p))*((s^2)/(2*(T/N)))
- // - but 'p' is the probability that the team who just scored will
- // score the next run. Let's assume it's always 50% (no preference
- // for either team) and then p/(1-p) goes to 1.
- // - but 's' is the number of points scored per scoring event, i.e. 1.
- // - T is duration of the game. It's the number of seconds for most
- // games, or 54 outs for us, but outs-per-game gives a better result
- // than 54.
- // - N is the mean number of scoring events (i.e. runs) per game.
- float mycalcD(const std::vector<Game> games_vector) {
- int outs = 0, runs = 0, games = 0;
- for (auto it : games_vector) {
- if (it.valid) {
- outs += it.outs;
- runs += it.awayF + it.homeF;
- games++;
- }
- }
- float rpg = (float)runs / (float)games;
- float opg = (float)outs / (float)games;
- float ret = 1 / (2 * (opg / rpg));
- std::cerr << runs << "-" << games << "-" << rpg << "-" << ret << std::endl;
- return ret;
- };
- // command-line options:
- // '-v' call print() and show additional data for each game, to make sure parsing is happening correctly
- // '-p N' don't output games unless the number of safe or unsafe games for that z value are at *least* 'N' (this helps weed out statistically-insignificant z values)
- int main(int argc, char** argv) {
- std::string foo;
- std::vector<Game> games;
- // read in the data from stdin (so you can 'cat foo.txt | ./a.out')
- while (getline(std::cin, foo)) {
- if (foo.empty()) {
- break;
- }
- Game* g = new Game();
- if (g->parse(foo)) {
- // the input line was a "good" game
- games.push_back(*g);
- } else {
- // something went wrong (probably not enough innings).
- std::cerr << "failed." << std::endl;
- }
- };
- // calculate the D value
- float D = mycalcD(games);
- std::cerr << "D = " << D << std::endl;
- // make a hash table of Z values; for each Z entry save the number
- // of times the lead was safe and the number of times that lead
- // wasn't safe.
- std::unordered_map<float, std::pair<int, int>> z_table;
- for (auto it : games) {
- it.scanGame(z_table, D);
- }
- // call this program with '-v' to print box scores back out, with
- // the number of outs that had been made when each game went safe
- if ((argc > 1) && (0 == strcmp(argv[1], "-v"))) {
- for (auto it : games) {
- it.print();
- }
- }
- // print out CSV from our hash table: Z,safe-Z,unsafe-Z,safe-Z %
- // - Z: as defined above, we print every Z value we found
- // - safe-Z: number of half-innings where we reached Z and the lead was never relinquished
- // - unsafe-Z: number of half-innings we reached Z and after that there was a lead change
- // - safe-Z%: safe-Z/(safe-Z+unsafe-Z)
- // Just pass the output through 'sort -n' to get a sorted list.
- std::cout << "z,#safe,#unsafe,pct-safe" << std::endl;
- int limit = 0;
- for (auto it : z_table) {
- if ((argc > 2) && (0 == strcmp(argv[1], "-p"))) {
- limit = atoi(argv[2]);
- }
- if ((it.second.first > limit) || (it.second.second > limit)) {
- std::cout << it.first << "," << it.second.first << "," << it.second.second << "," << (float)((float)it.second.first / ((float)it.second.first + (float)it.second.second)) << std::endl;
- }
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement