In [1]:
import os, json
import numpy as np
import pandas as pd
import calmap
import calendar
from datetime import date
import matplotlib.pyplot as plt

The data set you can download from Endomondo contains one JSON file per workout. I start by building a list of all the workout files so that I can extract the data from them one by one.

In [2]:
# Directory that holds the workout export files.
path_to_json = 'workouts/'
# Keep only the directory entries whose name ends in '.json'.
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(path_to_json)))

Now I can read the files one by one and extract the data I need: workout date and distance.

In [3]:
# Parallel lists: one distance and one start time per running workout.
km = []
datetime = []

for file_name in json_files:
    # Build the path from path_to_json so the directory is configured in one
    # place, and pin the encoding so parsing does not depend on the platform
    # default (assumes the export is UTF-8, the JSON default -- TODO confirm).
    with open(os.path.join(path_to_json, file_name), encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # The parsed file is iterated as a sequence of small mappings; merge them
    # into a single flat dict so fields can be looked up by name.
    workout = {}
    for entry in json_data:
        workout.update(entry)

    # .get() so a file without a 'sport' field is skipped instead of raising
    # KeyError. A run that lacks distance or start time still fails loudly.
    if workout.get('sport') == 'RUNNING':
        km.append(workout['distance_km'])
        datetime.append(workout['start_time'])

Next, I create a data frame from the data I collected from the json files.

In [4]:
# One row per run, built in a single step from the collected lists.
df = pd.DataFrame({'datetime': datetime, 'km': km})
# The start times carry a time-of-day part (visible in the output below), so a
# date-only format string does not describe them and is rejected by pandas
# versions that match `format` strictly. Let pandas infer the layout instead.
df['datetime'] = pd.to_datetime(df['datetime'].str.strip())
df.head()
Out[4]:
datetime km
0 2019-09-30 14:00:00 7.500
1 2019-06-13 15:06:23 8.449
2 2020-02-12 15:43:59 7.351
3 2020-02-01 08:34:04 9.160
4 2019-01-01 16:25:18 7.394

Since I'm only interested in 2019 data, I filter out the rest.

In [5]:
# Year under analysis; the plotting cell further down reads this as well.
year = 2019
# Select rows by calendar year through the .dt accessor. This keeps exactly the
# rows with Jan 1 <= start time < Jan 1 of the next year, and needs no
# `datetime` module import, so the name `datetime` is not rebound mid-notebook
# and the earlier cells stay re-runnable.
df_year = df.loc[df['datetime'].dt.year == year]
df_year.head()
Out[5]:
datetime km
0 2019-09-30 14:00:00 7.500
1 2019-06-13 15:06:23 8.449
4 2019-01-01 16:25:18 7.394
9 2019-02-10 15:11:30 7.477
10 2019-03-04 15:51:16 7.460

Now I'm ready to visualize the running data in a single calendar. There is one box per day: the darker the box, the longer the distance I ran that day.

In [6]:
# Number of days in `year` and the matching daily date range.
# NOTE(review): `all_days` is not consumed by the plotting code in this cell.
periods = 366 if calendar.isleap(year) else 365
all_days = pd.date_range(date(year, 1, 1), periods=periods, freq='D')

# Distance of each run, indexed by its start timestamp.
days = df_year['datetime'].values
runs = pd.Series(df_year['km'].values, index=days)

fig, ax = plt.subplots(1, 1, figsize=(16, 15))

calmap.yearplot(runs, year=year, ax=ax)

# Derive the file name from `year` so the saved image always matches the year
# that was plotted (identical to the previous name for 2019).
fig.savefig(f'calendar_{year}.png', bbox_inches='tight', dpi=200)

plt.show()

Let's also have a look at some descriptive statistics for the runs.

In [7]:
# Summary statistics for the distance column only, kept as a one-column frame.
df_year.loc[:, ['km']].describe()
Out[7]:
km
count 139.000000
mean 8.962129
std 2.412178
min 4.063000
25% 7.382000
50% 7.490000
75% 11.140500
max 15.509000
In [8]:
# Total distance over the selected year, rounded to whole kilometres.
total_km = round(df_year['km'].sum())
# Report the year from `year` so the sentence follows the filter above.
print(f'I ran a total of {total_km}km in {year}.')
I ran a total of 1246.0km in 2019.
In [9]:
# A "day with a run" is a distinct calendar date, so count unique dates rather
# than rows: two runs on the same day are still one day.
days_ran = df_year['datetime'].dt.normalize().nunique()
# `periods` is the number of days in `year` (365 or 366), computed in the
# plotting cell, so the share is also right for leap years.
days_ran_pct = round(days_ran / periods * 100, 1)
print(f'I was out running on {days_ran_pct}% of days in {year}.')
I was out running on 38.1% of days in 2019.
In [10]:
# Runs per week = runs per day * 7. Divide by the real length of `year`
# (`periods`, 365 or 366, from the plotting cell) instead of a fixed 365.
times_per_week = round(df_year['km'].count() / periods * 7, 1)
print(f'That is {times_per_week} times per week.')
That is 2.7 times per week.