Le Tour de France dataset webscraping
LeTour data set
This file downloads raw data about every rider of every Tour de France (from 1903 up to 2020). This data will then be postprocessed and stored in CSV format. Executing this notebook might take some minutes.
1) Retrieve urls for data extract
First we generate the urls that we need to download the raw HTML pages from the letour.fr
website to work offline from here on. The dataframe dflink will stores the respective url for each year.
Take a look at view-source:https://www.letour.fr/en/history
(at around line 1143-1890).
import os
import subprocess
import collections
import re
from pathlib import Path
import datetime
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
PREFIX = 'http://www.letour.fr'
HISTORYPAGE = 'https://www.letour.fr/en/history'
headers = {'Accept': 'text/html', 'User-Agent': 'python-requests/1.2.0','Accept-Charset': 'utf-8','accept-encoding': 'deflate, br'}
resulthistpage = requests.get(HISTORYPAGE, allow_redirects=True, headers=headers)
souphistory = BeautifulSoup(resulthistpage.text, 'html.parser')
# Find select tag for histo links
select_tag_histo = souphistory.find_all("button",{"class" :"dateTabs__link"})
LH = [x["data-tabs-ajax"] for x in select_tag_histo]
dflink = pd.DataFrame({'TDFHistorylink':LH})
dflink
TDFHistorylink | |
---|---|
0 | /en/block/history/10707/d0ab6a216569236433268b... |
1 | /en/block/history/10708/0b76b8f809ad5d8bcf3579... |
2 | /en/block/history/10709/c5f53ced72a23f333cc186... |
3 | /en/block/history/10710/ead1d1704b1600c795619e... |
4 | /en/block/history/10711/f558d3bc819c8ee5dc627d... |
... | ... |
102 | /en/block/history/10809/7be3a459d846b4672915c5... |
103 | /en/block/history/10810/7035a5dc53631209d3581f... |
104 | /en/block/history/11818/f34c3404d95a697dcf77d4... |
105 | /en/block/history/11819/96c0eb3fa403ebf222f28b... |
106 | /en/block/history/11820/17fa8e795e69e9f326ea26... |
107 rows × 1 columns
2) Get data from HTML pages and convert results to dataframes
2.1) Create function that convert HHh mm’ ss’’ to seconds
def calcTotalSeconds(row,mode):
val = sum(x * int(t) for x, t in zip([3600, 60, 1], row.replace("h",":").replace("'",":").replace('"',':').replace(" ","").replace("+","").replace("-","0").split(":")))
if ((mode=='Gap') and val > 180000) :
val=0
return val
2.2) Create a function that will retrieve elements from a source HTML page located on an input url
def getstagesNrank(i_url):
resultfull = requests.get(i_url, allow_redirects=True)
result = resultfull.text
resultstatus = resultfull.status_code
print(i_url + ' ==> HTTP STATUS = ' + str(resultstatus))
soup = BeautifulSoup(result, 'html.parser')
h=soup.find('h3')
year=int(h.text[-4:])
# Find select tag
select_tag = soup.find("select")
# find all option tag inside select tag
options = select_tag.find_all("option")
cols = ['Year','TotalTDFDistance','Stage']
lst = []
#search for stages
distance=soup.select("[class~=statsInfos__number]")[1].contents
#search for distance of the TDF edition
for option in options:
lst.append([year,int(distance[0].replace(" ","")),option.text])
dfstages = pd.DataFrame(lst, columns=cols)
# Find select tag for ranking racers
rankingTable = soup.find("table")
dfrank = pd.read_html(str(rankingTable))[0]
dfrank['Year']=year
dfrank['Distance (km)']=int(distance[0].replace(" ",""))
dfrank['Number of stages']=len(dfstages)
dfrank['TotalSeconds']=dfrank['Times'].apply(lambda x:calcTotalSeconds(x,'Total'))
dfrank['GapSeconds']=dfrank['Gap'].apply(lambda x:calcTotalSeconds(x,'Gap'))
return dfstages, dfrank
2.3) Loop on the dflink dataframe to get data from each url source
dfstagesres=[]
dfstagesrestmp=[]
dfrankres=[]
dfrankrestmp=[]
dfrankoutput=[]
for index, row in dflink.iterrows():
url = PREFIX+row['TDFHistorylink']
try :
if index >= 12 : # limit from 1919 (data need to be cleaned a little bit more before that)
dfstagesres, dfrankres=getstagesNrank(url)
dfstagesrestmp= dfstagesres.append(dfstagesrestmp, ignore_index=True)
dfrankrestmp=dfrankres.append(dfrankrestmp, ignore_index=True)
except:
raise
dfstageoutput=dfstagesrestmp
dfrankoutput=dfrankrestmp
http://www.letour.fr/en/block/history/10719/f15cf81b7599ea113d2b1d614f73bf83 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10720/53ac40095834ae1aae5d197aa730799a ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10721/c663ba39ac60d58c55a1a787a23025fc ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10722/cc5d0ba48a5d9bd9ee5e222160452b6f ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10723/9212137527c3f19c0cfccd1d07d80649 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10724/61621b1b7012e711ae7457a04e123aab ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10725/326ccc005b547ad2ad33e2fc03d27159 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10726/417b583aab876fdb00496da5c2be1a7d ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10727/0ab2dd4f425fcdcb9cc0a1511f49c494 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10728/58abb1c5f80d5d1b088ead3c32a5a815 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10729/323ab10f97b3b3b6c865b63dcecf7e88 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10730/9047f2218cebb70010ff9fc50585e204 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10731/71efd6183471464d6be619275e187eee ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10732/994e5d74cdaede876fec2b451051c8e1 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10733/be923012b4124a9bd0991212e9977d42 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10734/aed1c90b119f634ebc5d1e7e6a671faf ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10735/05bd6d64c780c558f7ffd7bd5879c776 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10736/53cfbe0f209af0e4d92fff714bf2f18d ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10737/c40634701bbc32655fc85bb3ddde49f5 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10738/c2d7990fb2a094d495e11e8098c59c51 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10739/e8866f6cc4d0163829c95efc4a729463 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10740/c5548896e2ae8715122b3989d3ed76b1 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10741/a722302e475af7214500f16e424e25d0 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10742/dfaf06d272727e8b57b63f72fcdbafa8 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10743/5a9ece659139988007c4e0f6f9fc5c20 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10744/367ba379f6d341a7bdc545a122d3c78d ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10745/7381958972518a9845edee052c8edbfc ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10746/45ef03cc5c64c264bf96bf08d4384d3d ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10747/98d1ca2953df385a9bc1c7b1b4749b2b ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10748/1f6f715544c4aec948adcb4085900c4c ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10749/a896b855e40db82c92217ccdd5532cdb ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10750/e072737429413b7d608e99db5313b13e ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10751/bb0f8df099b6e1f53064105d9e789f1f ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10752/cb437a1a3262575de6df27be96d2cd55 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10753/e890ae42c19946e34da655a909f98bfe ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10754/1e38a3d734b196c4f8ac6ba7063b218b ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10755/ff596257302baac39a22e1f2e4faf016 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10756/89a04a27336b19dfe4d6a4ea2ee46582 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10757/a7983332f5a930e28087ae63a7874475 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10758/56a4797d820f12ec56f06298fab868b9 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10759/07dc37874db6b049e640b253672cf299 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10760/3b171e28d17eb88dbb829f02a9fb3709 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10761/c3e23b93c3d11a653857785d1136ffb6 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10762/e5cba06222169e8ea482b253c93fd5b9 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10763/adf45f6a131be9ab505e4d0f100418f7 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10764/9be91c73090072a334e97ec5d9b692b2 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10765/1602abcaab318c332c1479ecd27154e1 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10766/5f1300708ea73ac042645afd5da10e29 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10767/62b2104a378309ef9ac5acc908e600df ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10768/12626d9920ed70ad18cdc7869e493ee0 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10769/9184e05910d640ad1af5cccc9321adbb ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10770/0b714daa83b004421e81d0ef62c8c0f3 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10771/395c856712f93e5289971151df458a61 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10772/3108137894d31c631408c1e2ac5510bb ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10773/ad0e4cf5a083a40d11bb15d1404559ed ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10774/5a2702b95bc1f297980241915b0c0eee ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10775/6365159573723beb801f343a6efb4763 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10776/1d486875bf13f11f37a63ee3a773ccc9 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10777/f1773d9b0704ed89afad4012905ec744 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10778/8845310ec3c8145620ac465f3e112cca ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10779/db843b1682239241b56010705153d657 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10780/fce0b5e49551473911dadd219da7cb6c ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10781/39d1113fcd9febaf891760301f41a383 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10782/c5cd539d3aed4897b7a087c06fff4e5e ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10783/0d7fef284362baf0738e2569898bcc5c ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10784/840372b1c5ef49ced963d12709d2bfe1 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10785/88ea8997b2d00f171d74fdd18200c4ac ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10786/97ba8b33489f910f7217ee3f6bdd68c7 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10787/1a04d3c53ae89dcdf73df8027654c1db ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10788/d98288396c5dbd8446699ad88413632b ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10789/0073016ba550a84a666f752c64bafef8 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10790/ba6803b7f72483da9dcef054d3564bef ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10791/70c111aba67f7f32344852fc369f445d ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10792/f892bbc452142340a7dfe883719d7637 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10793/532062cd8f6afe8ec3796beeee9fb60d ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10794/f456958e5bf979def96740c09a39f0d0 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10795/0ed86366456d46e1defebebcedef4404 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10796/a0c4bbd9f74d2fb237f3fcfd985dcdfe ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10797/4c6e179ada14b5ea99feaa75cfc5cc27 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10798/dc1ea3d295e447e632d63c31d313e22c ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10799/4df09e2d455b3d4bfea3d78091f0b0b1 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10800/14026271ed58df742b5e2182adc964cb ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10801/7c59f5199df3e8a97888b3ab3deafadb ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10802/814bff61212e58cf7c2beaddb1c083b3 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10803/757e8e81b7183fde7329fd8ab9b69783 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10804/89b36b01ffe439e016ec1d59c57b63d1 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10805/1ec24b4375d16b922c9c9c489d25465c ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10806/196e36cfe7aff4b5fc2d1055f7dcf864 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10807/d538a7fbdfc0d657fbf064561840fbb1 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10808/3f166bedb535ee9bff2fb7ead0a7812c ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10809/7be3a459d846b4672915c576cb7ed6b9 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/10810/7035a5dc53631209d3581f64a433b10d ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/11818/f34c3404d95a697dcf77d4cd8e8278fa ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/11819/96c0eb3fa403ebf222f28b6e45115c56 ==> HTTP STATUS = 200
http://www.letour.fr/en/block/history/11820/17fa8e795e69e9f326ea26cf9912e571 ==> HTTP STATUS = 200
dfrankoutput
Rank | Rider | Rider No. | Team | Times | Gap | B | P | Year | Distance (km) | Number of stages | TotalSeconds | GapSeconds | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | TADEJ POGAČAR | 131 | UAE TEAM EMIRATES | 87h 20' 05'' | - | 32' | NaN | 2020 | 3483 | 21 | 314405 | 0 |
1 | 2 | PRIMOŽ ROGLIČ | 11 | TEAM JUMBO - VISMA | 87h 21' 04'' | + 00h 00' 59'' | 33' | NaN | 2020 | 3483 | 21 | 314464 | 59 |
2 | 3 | RICHIE PORTE | 101 | TREK - SEGAFREDO | 87h 23' 35'' | + 00h 03' 30'' | 04' | NaN | 2020 | 3483 | 21 | 314615 | 210 |
3 | 4 | MIKEL LANDA MEANA | 61 | BAHRAIN - MCLAREN | 87h 26' 03'' | + 00h 05' 58'' | NaN | NaN | 2020 | 3483 | 21 | 314763 | 358 |
4 | 5 | ENRIC MAS NICOLAU | 94 | MOVISTAR TEAM | 87h 26' 12'' | + 00h 06' 07'' | NaN | NaN | 2020 | 3483 | 21 | 314772 | 367 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9062 | 6 | JACQUES COOMANS | 63 | TDF 1919 *** | 246h 28' 49'' | + 15h 21' 34'' | NaN | NaN | 1919 | 5560 | 15 | 887329 | 55294 |
9063 | 7 | LUIGI LUCOTTI | 19 | TDF 1919 *** | 247h 08' 27'' | + 16h 01' 12'' | NaN | NaN | 1919 | 5560 | 15 | 889707 | 57672 |
9064 | 8 | JOSEPH VAN DAELE | 55 | TDF 1919 *** | 249h 30' 17'' | + 18h 23' 02'' | NaN | NaN | 1919 | 5560 | 15 | 898217 | 66182 |
9065 | 9 | ALFRED STEUX | 56 | TDF 1919 *** | 251h 36' 16'' | + 20h 29' 01'' | NaN | NaN | 1919 | 5560 | 15 | 905776 | 73741 |
9066 | 10 | JULES NEMPON | 151 | TDF 1919 *** | 252h 51' 27'' | + 21h 44' 12'' | NaN | NaN | 1919 | 5560 | 15 | 910287 | 78252 |
9067 rows × 13 columns
3) Clean up the data
# Fix result types
dfrankoutput["ResultType"] = "time"
dfrankoutput.loc[dfrankoutput["Year"].isin([1905,1906,1908]),"ResultType"] = "null"
dfrankoutput.loc[dfrankoutput["Year"].isin([1907,1909,1910,1911,1912]),"ResultType"] = "points"
# Fix this weird bug for e.g. year 2006 and 1997
for year in np.unique(dfrankoutput["Year"]):
tmp = dfrankoutput[dfrankoutput["Year"]==year].reset_index()
if tmp.loc[0]["TotalSeconds"] > tmp.loc[2]["TotalSeconds"]:
print(year)
# Okay seems to be only for 2006 and 1997
1997
2006
tmp = dfrankoutput[dfrankoutput["Year"]==2006].reset_index()
ts = np.array(tmp["TotalSeconds"])
gs = np.array(tmp["GapSeconds"])
ts[1:] = ts[0]+gs[1:]
dfrankoutput.loc[dfrankoutput["Year"]==2006,"TotalSeconds"] = ts
tmp = dfrankoutput[dfrankoutput["Year"]==1997].reset_index()
ts = np.array(tmp["TotalSeconds"])
gs = np.array(tmp["GapSeconds"])
ts[1:] = ts[0]+gs[1:]
dfrankoutput.loc[dfrankoutput["Year"]==1997,"TotalSeconds"] = ts
4) Write Data to CSV
dfrankoutput.to_csv("../data/TDF_Riders_History.csv")
dfstageoutput.to_csv("../data/TDF_Stages_History.csv")