import os, json
import numpy as np
import pandas as pd
import calmap
import calendar
from datetime import date
import matplotlib.pyplot as plt

In the data set that you can download from Endomondo, there is one JSON file per workout. So I start by creating a list of all the workout files, which I can then read one by one to extract the data.

# Directory that holds the exported workouts.
path_to_json = 'workouts/'
# Names (not full paths) of the directory entries that end in '.json'.
json_files = [entry for entry in os.listdir(path_to_json)
              if entry.endswith('.json')]

Now I can read the files one by one and extract the data I need: workout date and distance.

# Distance and start time of every running workout, kept as two parallel
# lists: km[i] belongs to the workout whose start time is datetime[i].
# NOTE: the list is called `datetime`; the `import datetime` further down
# rebinds that name, so this cell cannot be re-run after that import.
km = []
datetime = []

for f in json_files:
    # Build the path from path_to_json so the directory is configured in one
    # place only, and read the JSON as UTF-8 rather than relying on the
    # platform's default encoding.
    with open(os.path.join(path_to_json, f), encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # The parsed file is iterated as a sequence of items that dict() accepts;
    # merge them all into one flat dict for this workout.
    d = {}
    for j in json_data:
        d.update(dict(j))

    # Only running workouts are kept.
    if d['sport'] == 'RUNNING':
        km.append(d['distance_km'])
        datetime.append(d['start_time'])

Next, I create a data frame from the data I collected from the json files.

# Two-column frame: the raw start-time strings and the distance in km.
df = pd.DataFrame({'datetime':datetime, 'km':km})
# Let pandas work out the timestamp layout itself. The parsed values carry a
# time of day, which the date-only format '%Y/%m/%d' does not describe;
# recent pandas versions raise on such a mismatch instead of silently falling
# back to lenient parsing.
df['datetime'] = pd.to_datetime(df['datetime'].str.strip())
df.head()

Since I'm only interested in 2019 data, I filter out the rest.

# From here on `datetime` names the standard-library module (it previously
# named the list of start times collected from the JSON files).
import datetime

# Year to analyse.
year = 2019
# Keep the rows in the half-open interval [Jan 1 of `year`, Jan 1 of the
# following year).
df_year = df[
    df['datetime'].ge(datetime.datetime(year, 1, 1))
    & df['datetime'].lt(datetime.datetime(year + 1, 1, 1))
]
df_year.head()

Now I'm ready to visualize the running data in a single calendar. There is one box per day; the darker the color for that day, the longer the distance was.

# Number of days in `year` and the range of all its calendar days.
# NOTE: `all_days` is not consumed by the plot below.
periods = 366 if calendar.isleap(year) else 365
all_days = pd.date_range(date(year, 1, 1), periods=periods, freq='D')

# Distances indexed by workout start time; this is the series handed to calmap.
days = df_year['datetime'].values
runs = pd.Series(df_year['km'].values, index=days)

fig,ax = plt.subplots(1, 1, figsize = (16, 15))

calmap.yearplot(runs, year=year, ax=ax)

# Name the image after the plotted year instead of a hard-coded 2019, so a
# different `year` does not overwrite or mislabel the file.
plt.savefig(f'calendar_{year}.png', bbox_inches='tight', dpi=200)

plt.show()

Let's also have a look at some descriptive data on the runs.

# Summary statistics (count, mean, std, quartiles, min, max) of the distances.
df_year.loc[:, ['km']].describe()

# Total distance over the selected year, rounded to whole kilometres.
total_km = round(df_year['km'].sum())
# Report the year that was actually filtered on rather than a fixed 2019.
print(f'I ran a total of {total_km}km in {year}.')

I ran a total of 1246.0km in 2019.

# Share of the year's days on which at least one run took place.
# Leap years have 366 days, so derive the length from `year`.
days_in_year = 366 if calendar.isleap(year) else 365
# Count distinct calendar days, not workouts: two runs on the same day must
# not count as two days.
days_with_run = df_year['datetime'].dt.normalize().nunique()
days_ran_pct = round(days_with_run/days_in_year*100, 1)
print(f'I was out running on {days_ran_pct}% of days in {year}.')

I was out running on 38.1% of days in 2019.

# Average number of runs per week over the selected year.
# Leap years have 366 days, so derive the length from `year`.
days_in_year = 366 if calendar.isleap(year) else 365
times_per_week = round(df_year['km'].count()/days_in_year*7,1)
print(f'That is {times_per_week} times per week.')

That is 2.7 times per week.

	datetime	km
0	2019-09-30 14:00:00	7.500
1	2019-06-13 15:06:23	8.449
2	2020-02-12 15:43:59	7.351
3	2020-02-01 08:34:04	9.160
4	2019-01-01 16:25:18	7.394

	datetime	km
0	2019-09-30 14:00:00	7.500
1	2019-06-13 15:06:23	8.449
4	2019-01-01 16:25:18	7.394
9	2019-02-10 15:11:30	7.477
10	2019-03-04 15:51:16	7.460

	km
count	139.000000
mean	8.962129
std	2.412178
min	4.063000
25%	7.382000
50%	7.490000
75%	11.140500
max	15.509000