Predict horse racing with machine learning and aim for a recovery rate of 100%.
Last time, I built a machine learning model with LightGBM that predicts which horses will finish in the top 3. This time, I would like to add each horse's past performance as a feature, but the scraping and data processing turn out to be quite involved when you actually try to do it. So here I will summarize what code to write and how to implement it.

First, we scrape the past results of every horse that ran in 2019 from netkeiba.com. On netkeiba.com each horse is assigned a horse_id, and the page with its past results has the URL 「https://db.netkeiba.com/horse/(horse_id)」. So we first collect the horse_ids we need (together with the jockey_ids) by extending the scrape_race_results function created in the previous article.
import time
from tqdm.notebook import tqdm
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
def scrape_race_results(race_id_list, pre_race_results={}):
    race_results = pre_race_results
    for race_id in tqdm(race_id_list):
        if race_id in race_results.keys():
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            df = pd.read_html(url)[0]
            # scrape horse_id and jockey_id
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")
            # horse_id
            horse_id_list = []
            horse_a_list = soup.find("table", attrs={"summary": "Race result"}).find_all(
                "a", attrs={"href": re.compile("^/horse")}
            )
            for a in horse_a_list:
                horse_id = re.findall(r"\d+", a["href"])
                # extract the numeric horse_id from the link href
                horse_id_list.append(horse_id[0])
            # jockey_id
            jockey_id_list = []
            jockey_a_list = soup.find("table", attrs={"summary": "Race result"}).find_all(
                "a", attrs={"href": re.compile("^/jockey")}
            )
            for a in jockey_a_list:
                jockey_id = re.findall(r"\d+", a["href"])
                jockey_id_list.append(jockey_id[0])
            df["horse_id"] = horse_id_list
            df["jockey_id"] = jockey_id_list
            race_results[race_id] = df
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_results
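The race_id_list passed to this function is the list of all 2019 race IDs built in the previous article. As a reminder, here is a minimal sketch of how it can be generated; the 12-digit ID layout (year, racecourse, meeting, day, race number) and the loop ranges are assumptions based on that article.
# Build the list of 2019 race IDs: year + racecourse(01-10) + meeting + day + race number
race_id_list = []
for place in range(1, 11):
    for kai in range(1, 6):
        for day in range(1, 9):
            for r in range(1, 13):
                race_id = (
                    "2019"
                    + str(place).zfill(2)
                    + str(kai).zfill(2)
                    + str(day).zfill(2)
                    + str(r).zfill(2)
                )
                race_id_list.append(race_id)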
Convert the results to a single DataFrame as in the previous article. This gives us the list of horse_ids we need.
results = scrape_race_results(race_id_list)
results = pd.concat([results[key] for key in results])
horse_id_list = results['horse_id'].unique()
Using this list, the following function scrapes each horse's past performance data.
def scrape_horse_results(horse_id_list, pre_horse_id=[]):
    horse_results = {}
    for horse_id in tqdm(horse_id_list):
        if horse_id in pre_horse_id:
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            df = pd.read_html(url)[3]
            if df.columns[0]=='Award history':
                df = pd.read_html(url)[4]
            horse_results[horse_id] = df
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            # print the full traceback so unexpected errors are easy to diagnose
            import traceback
            traceback.print_exc()
            break
    return horse_results
It takes a long time, but once the scraping is done, combine the results into a single DataFrame again and save it as a pickle file.
horse_results = scrape_horse_results(horse_id_list)
#Index each horse's results by its horse_id so they can be looked up later
for key in horse_results:
    horse_results[key].index = [key] * len(horse_results[key])
df = pd.concat([horse_results[key] for key in horse_results])
df.to_pickle('horse_results.pickle')
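Scraping everything takes quite a while, so in a later session the saved data can simply be loaded back from the pickle file instead of scraping again (a minimal example using the file saved above):
horse_results = pd.read_pickle('horse_results.pickle')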
Next, create a class called HorseResults and implement methods that merge the averages of 'Order of arrival' and 'Prize money' into the race results.
class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['date', 'Order of arrival', 'Prize money']]
        self.preprocessing()
        
    def preprocessing(self):
        df = self.horse_results.copy()
        #Remove items that contain non-numeric character strings in the order of arrival
        df['Order of arrival'] = pd.to_numeric(df['Order of arrival'], errors='coerce')
        df.dropna(subset=['Order of arrival'], inplace=True)
        df['Order of arrival'] = df['Order of arrival'].astype(int)
        df["date"] = pd.to_datetime(df["date"])
        df.drop(['date'], axis=1, inplace=True)
        
        #Fill the prize NaN with 0
        df['Prize money'].fillna(0, inplace=True)
    
        self.horse_results = df
        
    def average(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.loc[horse_id_list]
        
        #Specify how many runs in the past
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
            
        average = filtered_df.groupby(level=0)[['Order of arrival', 'Prize money']].mean()
        return average.rename(columns={'Order of arrival':'Order of arrival_{}R'.format(n_samples), 'Prize money':'Prize money_{}R'.format(n_samples)})
    
    def merge(self, results, date, n_samples='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        merged_df = df.merge(self.average(horse_id_list, date, n_samples), left_on='horse_id',
                             right_index=True, how='left')
        return merged_df
    
    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df
With this, if you want to add, for example, the results of each horse's last 5 races as features, you can do it as follows.
#horse_results is the concatenated DataFrame saved above (e.g. loaded from horse_results.pickle)
hr = HorseResults(horse_results)
#results_p is the preprocessed race results DataFrame from the previous article (its 'date' column must be datetime)
results_5R = hr.merge_all(results_p, n_samples=5)
You can see that the two rightmost columns now contain the average finish order and average prize money over each horse's last five races.
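For a quick check, the added columns can be inspected like this (the column names follow the rename applied in the average method):
results_5R[['Order of arrival_5R', 'Prize money_5R']].head()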

Details are explained in the video ↓
Data analysis / machine learning starting with horse racing prediction
