Advertisement
Guest User

baseball safe lead data gathering

a guest
Nov 25th, 2015
151
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 13.31 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <vector>
  3. #include <iostream>
  4. #include <unordered_map>
  5. #include <cmath>
  6. #include <string.h>
  7.  
  8. /* Copyright 2015. GPLv3. */
  9.  
  10. /*
  11.  * compile like:
  12.  * g++ -std=c++11 safe.cpp -o a.out
  13.  *
  14.  * fetch data and run like:
  15.  * wget --quiet http://www.retrosheet.org/gamelogs/gl2014.zip
  16.  * unzip -p gl2014.zip | cut -d, -f10,11,12,20,21 | tr -d '"' | sed 's/x$/0/' | grep -E '[[:digit:]]+,[[:digit:]]+$' | ./a.out -p 50 | sort -n > 2014.csv
  17.  * open the csv in your favorite spreadsheet, add an erf(z) column if
  18.  * you want, and graph the results
  19.  */
  20.  
  21. /*
  22.  * this program deals with half-innings, because that's easy to get
  23.  * from box score data; we'll just try to get a formula that matches
  24.  * the curve, so we can interpolate safety values from that formula
  25.  * for mid-inning safety.
  26.  * That is why all 'outs' numbers in here are multiples of 3.
  27.  */
  28.  
  29. /*
  30.  * detailed instructions to parse input:
  31.  *
  32.  * get away score, home score, outs, away box, home box:
  33.  * - download game logs from retrosheet.org
  34.  *   - http://www.retrosheet.org/gamelogs/index.html
  35.  * - replace final-half-inning 'x' with '0'
  36.  * - remove error lines (quoted commas) (2014 had 5 such lines)
  37.  * cut -d, -f10,11,12,20,21 GL2014.TXT | tr -d '"' | sed 's/x$/0/' | grep -E '[[:digit:]]+,[[:digit:]]+$' > box.txt
  38.  *
  39.  * wc box.txt
  40.  * - 2430 games in a season, so this lets you figure out how many invalid lines you lost
  41.  *
  42.  * gnumeric <(cat box.txt | ./a.out | sort -n)
  43.  * - create an e column that is erf(z), so e.g. e2 is:
  44.  *   =erf(a2)
  45.  * - graph columns a (Z value) and d (percentage of times that Z was a
  46.  *   safe lead) and, if you created it, e
  47.  *   - the actual safety percentages should be somewhat similar to erf(z)
  48.  */
  49.  
  50.  
  51. /*
  52.  * the algorithms:
  53.  *
  54.  * z = L/sqrt(4*D*t)
  55.  * - L = lead (run differential)
  56.  * - D
  57.  *   - derived from games I've witnessed = 0.08835978835978836
  58.  *   - but I should use this program to get D for an entire MLB year
  59.  *     - D = 1/(2*(outs/ppg))
  60.  *     - 2014: 0.07552
  61.  *     - 2000-2014: 0.0853419
  62.  * - t = outs left
  63.  *
  64.  * so what I want to do is, for every non-0 L value from every half-inning
  65.  * - get z
  66.  * - record whether or not that z was safe (outs made so far >= safe_outs)
  67.  * then I can graph erf(z) and see if I get a useful curve
  68.  *
  69.  */
  70.  
  71.  
  72. // Game holds all the data from one game:
  73. // - an Inning vector, to hold away and home scores for each inning
  74. // - final scores for each team
  75. // - number of outs in the game (usually 51 (8.5 innings) or 54 (9 full innings)
  76. // - the out at which the last lead change happened (a multiple of 3, usually, since we don't analyze anything finer than half-innings)
  77. // - a 'valid' bool whether or not the data is internally consistent
  78. // There are a bunch of functions to
  79. // - parse() - accept an input line from retrosheet.org's 10,11,12,20,21 columns
  80. // - scanGame() - write to a hash table all the z values from every half-inning and whether or not the lead changed after that
  81. // - setSafety() - figure out when the last lead change happened
  82. // - assertInput() - ensure internal consistency of data
  83. // -
  84. class Game {
  85. private:
  86.    
  87.     class Inning {
  88.     public:
  89.         Inning() {};
  90.         ~Inning() {};
  91.         Inning(int a, int h) {away=a; home=h;};
  92.         int away;
  93.         int home;
  94.     };
  95.    
  96.    
  97. public:
  98.     Game() { innings.clear(); valid = false; };
  99.     ~Game() { };
  100.    
  101.     void append(Inning i) { innings.emplace_back(i); };
  102.     void append(int a, int h) { Inning i(a, h); append(i); };
  103.     void append(char a, char h) { append(a - 0x30, h - 0x30); };
  104.  
  105.  
  106.     void scanGame(std::unordered_map<float, std::pair<int, int>>& mymap, float D) {
  107.         //const float D = 0.08835978835978836;
  108.         int a = 0, h = 0, o = 0, oL = 0;
  109.         float z;
  110.         for (auto it : innings) {
  111.             a += it.away;
  112.             o += 3;
  113.             oL = o <= 54 ? 54 - o : 3;
  114.             if (a != h) {
  115.                 z = abs(a - h) / (sqrt(4 * D * oL));
  116.                 auto it = mymap.find(z);
  117.                 if (it != mymap.end()) {
  118.                     if (o >= safe_outs) {
  119.                         it->second.first++;
  120.                     } else {
  121.                         it->second.second++;
  122.                     }
  123.                 } else {
  124.                     if (o >= safe_outs) {
  125.                         std::pair<int, int> foo(1, 0);
  126.                         mymap.insert({z, foo});
  127.                     } else {
  128.                         std::pair<int, int> foo(0, 1);
  129.                         mymap.insert({z, foo});
  130.                     }
  131.                 }
  132.             }
  133.  
  134.             h += it.home;
  135.             o += 3;
  136.             if (o >= 54) {
  137.                 continue;
  138.             }
  139.             oL = 54 - o;
  140.             if (a != h) {
  141.                 z = abs(a - h) / (sqrt(4 * D * oL));
  142.                 auto it = mymap.find(z);
  143.                 if (it != mymap.end()) {
  144.                     if (o >= safe_outs) {
  145.                         it->second.first++;
  146.                     } else {
  147.                         it->second.second++;
  148.                     }
  149.                 } else {
  150.                     if (o >= safe_outs) {
  151.                         std::pair<int, int> foo(1, 0);
  152.                         mymap.insert({z, foo});
  153.                     } else {
  154.                         std::pair<int, int> foo(0, 1);
  155.                         mymap.insert({z, foo});
  156.                     }
  157.                 }
  158.             }
  159.            
  160.  
  161.            
  162.            
  163.         }
  164.     }
  165.    
  166.     void setSafety() {
  167.         if (!valid) return;
  168.         enum leader {
  169.             NONE,
  170.             AWAY,
  171.             HOME
  172.         };
  173.         int curA = 0;
  174.         int curH = 0;
  175.         int curO = 0; // outs
  176.         leader curL = NONE;
  177.         for (auto it : innings) {
  178.             curO += 3;
  179.             curA += it.away;
  180.             if ((curA > curH) && (curL != AWAY)) {
  181.                 curL = AWAY;
  182.                 safe_outs = curO;
  183.             }
  184.  
  185.             curO += 3;
  186.             curH += it.home;
  187.             if ((curH > curA) && (curL != HOME)) {
  188.                 curL = HOME;
  189.                 safe_outs = curO;
  190.             }
  191.            
  192.         }
  193.     };
  194.  
  195.     // make sure we have at least 9 innings, and the final score
  196.     // matches the sum of the scores through the innings.
  197.     bool assertInput() {
  198.         int af = awayF, hf = homeF;
  199.         if (innings.size() < 9) {
  200.             std::cerr << "too few innings" << std::endl;
  201.             return false;
  202.         }
  203.         for (auto it : innings) {
  204.             af -= it.away;
  205.             hf -= it.home;
  206.         }
  207.         if (0 != af || 0 != hf) {
  208.             std::cerr << "final score did not match" << af << hf << std::endl;
  209.             return false;
  210.         }
  211.         return true;
  212.            
  213.     }
  214.     // parse a line that looks like:
  215.     // away final, home final, number of outs in the game, away box score, home box score
  216.     // where a box score looks like '0123456789' which means 0 runs in
  217.     // the first inning, 1 in the second, 2 in the third, 3 in the
  218.     // fourth.... Probably fails badly for 10+ scored in an inning.
  219.     // if we don't have enough fields or at least 9 innings, we'll return false.
  220.     bool parse(std::string& s) {
  221.         size_t pos = 0;
  222.         bool okay = true;
  223.         std::string ia, ih;
  224.         if (okay) okay = getNext(s, &awayF);
  225.         if (okay) okay = getNext(s, &homeF);
  226.         if (okay) okay = getNext(s, &outs);
  227.         if (okay) okay = getNext(s, ia);
  228.         if (okay) okay = getNext(s, ih);
  229.         valid = okay;
  230.         if (valid) {
  231.             //std::cerr << ia << "-" << ih << std::endl;
  232.             for (int i = 0; i < ia.length() && i < ih.length(); i++) {
  233.                 append(ia.at(i), ih.at(i));
  234.             }
  235.         }
  236.         if (valid) {
  237.             valid = assertInput();
  238.         }
  239.         if (valid) {
  240.             setSafety();
  241.         }
  242.         return valid;
  243.     };
  244.     // lame. but works.
  245.     void print() {
  246.         if (!valid) return;
  247.  
  248.         for (auto it : innings) {
  249.             std::cout << it.away;
  250.         }
  251.         std::cout << " " << awayF;
  252.         if (awayF > homeF) {
  253.             std::cout << " (" << safe_outs << ")";
  254.         }
  255.         std::cout << std::endl;
  256.        
  257.         for (auto it : innings) {
  258.             std::cout << it.home;
  259.         }
  260.         std::cout << " " << homeF;
  261.         if (homeF > awayF) {
  262.             std::cout << " (" << safe_outs << ")";
  263.         }
  264.         std::cout << std::endl;
  265.        
  266.         std::cout << "=============" << std::endl;
  267.     };
  268.    
  269.    
  270.     std::vector<Inning> innings;
  271.     int awayF;     // final score, visitors
  272.     int homeF;     // final score, home team
  273.     int outs;      // number of outs in the game
  274.     int safe_outs; // game is considered safe after this number of outs
  275.     bool valid;    // data is internally consistent
  276.  
  277.  
  278.  
  279. private:
  280.     // 's' should look like: <int>[,...]
  281.     // we'll put the int into 'i' and remove the '<int>[,]' from 's'.
  282.     bool getNext(std::string& s, int* i) {
  283.         std::string r;
  284.         bool ret = getNext(s, r);
  285.         if (true == ret) {
  286.             *i = atoi(r.c_str());
  287.         }
  288.         return ret;
  289.     }
  290.     // 's' should look like '<string>[,...]'
  291.     // we'll put the <string> into 'r' and remove the '<string>[,]' from 's'
  292.     bool getNext(std::string& s, std::string& r) {
  293.         size_t pos = s.find(DELIM);
  294.         if (std::string::npos != pos) {
  295.             r.assign(s.substr(0, pos));
  296.             s.erase(0, pos + DELIM.length());
  297.             return true;
  298.         } else if (!s.empty()) {
  299.             r.assign(s);
  300.             s.clear();
  301.             return true;
  302.         }
  303.         return false;
  304.     }
  305.  
  306.    
  307.     const std::string DELIM = ",";
  308. }; // class Game
  309.  
  310. // D is the "diffusion coefficient" which for baseball is runs per
  311. // game divided by 2 times the number of outs per game:
  312. // D = 1/(2*(outs per game/runs per game))
  313.  
  314. // actually D = (p/(1-p))*((s^2)/(2*(T/N)))
  315. // - but 'p' is the probability that the team who just scored will
  316. //   score the next run. Let's assume it's always 50% (no preference
  317. //   for either team) and then p/(1-p) goes to 1.
  318. // - but 's' is the number of points scored per scoring event, i.e. 1.
  319. // - T is duration of the game. It's the number of seconds for most
  320. //   games, or 54 outs for us, but outs-per-game gives a better result
  321. //   than 54.
  322. // - N is the mean number of scoring events (i.e. runs) per game.
  323. float mycalcD(const std::vector<Game> games_vector) {
  324.     int outs = 0, runs = 0, games = 0;
  325.     for (auto it : games_vector) {
  326.         if (it.valid) {
  327.             outs += it.outs;
  328.             runs += it.awayF + it.homeF;
  329.             games++;
  330.         }
  331.     }
  332.     float rpg = (float)runs / (float)games;
  333.     float opg = (float)outs / (float)games;
  334.     float ret = 1 / (2 * (opg / rpg));
  335.     std::cerr << runs << "-" << games << "-" << rpg << "-" << ret << std::endl;
  336.     return ret;
  337. };
  338.  
  339. // command-line options:
  340. // '-v' call print() and show additional data for each game, to make sure parsing is happening correctly
  341. // '-p N' don't output games unless the number of safe or unsafe games for that z value are at *least* 'N' (this helps weed out statistically-insignificant z values)
  342. int main(int argc, char** argv) {
  343.    
  344.     std::string foo;
  345.     std::vector<Game> games;
  346.    
  347.     // read in the data from stdin (so you can 'cat foo.txt | ./a.out')
  348.     while (getline(std::cin, foo)) {
  349.         if (foo.empty()) {
  350.             break;
  351.         }
  352.         Game* g = new Game();
  353.         if (g->parse(foo)) {
  354.             // the input line was a "good" game
  355.             games.push_back(*g);
  356.         } else {
  357.             // something went wrong (probably not enough innings).
  358.             std::cerr << "failed." << std::endl;
  359.         }
  360.     };
  361.  
  362.  
  363.     // calculate the D value
  364.     float D = mycalcD(games);
  365.     std::cerr << "D = " << D << std::endl;
  366.  
  367.     // make a hash table of Z values; for each Z entry save the number
  368.     // of times the lead was safe and the number of times that lead
  369.     // wasn't safe.
  370.     std::unordered_map<float, std::pair<int, int>> z_table;
  371.     for (auto it : games) {
  372.         it.scanGame(z_table, D);
  373.     }
  374.  
  375.     // call this program with '-v' to print box scores back out, with
  376.     // the number of outs that had been made when each game went safe
  377.     if ((argc > 1) && (0 == strcmp(argv[1], "-v"))) {
  378.         for (auto it : games) {
  379.             it.print();
  380.         }
  381.     }
  382.  
  383.     // print out CSV from our hash table: Z,safe-Z,unsafe-Z,safe-Z %
  384.     // - Z: as defined above, we print every Z value we found
  385.     // - safe-Z: number of half-innings where we reached Z and the lead was never relinquished
  386.     // - unsafe-Z: number of half-innings we reached Z and after that there was a lead change
  387.     // - safe-Z%: safe-Z/(safe-Z+unsafe-Z)
  388.     // Just pass the output through 'sort -n' to get a sorted list.
  389.     std::cout << "z,#safe,#unsafe,pct-safe" << std::endl;
  390.     int limit = 0;
  391.     for (auto it : z_table) {
  392.       if ((argc > 2) && (0 == strcmp(argv[1], "-p"))) {
  393.         limit = atoi(argv[2]);
  394.       }
  395.       if ((it.second.first > limit) || (it.second.second > limit)) {
  396.         std::cout << it.first << "," << it.second.first << "," << it.second.second << "," << (float)((float)it.second.first / ((float)it.second.first + (float)it.second.second)) << std::endl;
  397.       }
  398.     }
  399.    
  400.     return 0;
  401. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement