A Git commit temporal analysis

In this Python notebook, we analyze git commit timestamps across multiple repositories to identify temporal patterns in a git user's coding activity (mine, actually).

Outline

Imports and package versions
Repository Discovery and Data Extraction
- Data Collection
- Data Preprocessing
Visualizations

Imports and Package Versions

BASE_DIR is the root folder containing all the git repositories. The USER_FILTERS list contains substrings that are matched against git author names, in order to keep only the commits of one specific user who commits under several names (e.g. GitHub and GitLab accounts from various organizations). You can adapt these two variables to your own directory and git user names.

import os
import subprocess
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tol_colors as tc

BASE_DIR = "/home/francois/Workspace"
USER_FILTERS = ["pacull", "djfrancesco"]

We are using Python 3.13.3 on a Linux OS:

pandas    : 2.2.3
numpy     : 2.2.6
matplotlib: 3.10.3
tol_colors: 2.0.0

Repository discovery and data extraction

Here we introduce functions to recursively scan directories for git repositories, extract commit metadata using git log in a Python subprocess, specifically commit timestamps and author names, and filter commits by author name using case-insensitive substring matching.

def is_git_repo(path):
    """Return True when *path* directly contains a ``.git`` directory.

    Only a real directory counts: a ``.git`` regular file does not make
    *path* a repository for the purpose of this scan.
    """
    git_dir = os.path.join(path, ".git")
    return os.path.isdir(git_dir)


def get_all_git_repos(base_dir):
    """Collect the repository roots found under *base_dir*.

    The tree is walked top-down with ``os.walk``. Every directory accepted by
    ``is_git_repo`` is recorded, and the walk does not descend below it.

    Returns:
        A list of directory paths, in the order ``os.walk`` visits them.
    """
    found = []
    for current, subdirs, _files in os.walk(base_dir):
        if not is_git_repo(current):
            continue
        found.append(current)
        # Emptying the list in place tells os.walk to skip everything below
        # this repository.
        subdirs[:] = []
    return found


def get_commits(repo_path):
    """Return the ``git log`` lines of *repo_path* written by the tracked user.

    Each line has the form ``<author name>|<author date, strict ISO 8601>``.
    A line is kept when the text before its first ``|`` contains, ignoring
    case, at least one of the substrings listed in ``USER_FILTERS``.

    Returns:
        The matching lines, in ``git log`` order, or an empty list when the
        git command exits with a non-zero status.
    """
    command = ["git", "-C", repo_path, "log", "--pretty=format:%an|%aI"]
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,  # git's error output is discarded
            check=True,
            text=True,
        )
    except subprocess.CalledProcessError:
        return []

    return [
        entry
        for entry in completed.stdout.strip().split("\n")
        if entry
        and any(
            name.lower() in entry.partition("|")[0].lower()
            for name in USER_FILTERS
        )
    ]


def parse_commit_times(commit_lines):
    """Extract the hour of day and the weekday name from commit log lines.

    Args:
        commit_lines: iterable of ``"<author name>|<ISO 8601 date>"`` strings.

    Returns:
        A ``(hours, weekdays)`` tuple of two lists aligned with the input:
        the hour (0-23) exactly as written in each timestamp, and the English
        weekday name ("Monday" ... "Sunday").

    Raises:
        ValueError: if a line has no ``|`` separator or its date part is not
            a valid ISO 8601 string.
    """
    # Indexed by datetime.weekday() (Monday == 0). Fixed English names are used
    # instead of strftime("%A"), whose output depends on the active locale.
    weekday_names = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    hours = []
    weekdays = []
    for line in commit_lines:
        # An author name may itself contain "|", while the ISO date never
        # does, so only the last separator is meaningful.
        _author, iso_date = line.rsplit("|", 1)
        dt = datetime.fromisoformat(iso_date)
        hours.append(dt.hour)
        weekdays.append(weekday_names[dt.weekday()])
    return hours, weekdays

Data collection

So let’s use these previous functions to iterate through the repositories, extract commit timestamps and parse them into hour-of-day and weekday components.

# Scan BASE_DIR once, then accumulate the hour-of-day and weekday of every
# matching commit across all discovered repositories.
repos = get_all_git_repos(BASE_DIR)

all_hours, all_weekdays = [], []
for repo in repos:
    commits = get_commits(repo)
    hours, weekdays = parse_commit_times(commits)
    all_hours += hours
    all_weekdays += weekdays

print(f"Total commits found: {len(all_hours)}")

Total commits found: 7605

Data preprocessing

Now we convert the extracted data and create frequency dataframes for each hour of the day or day of the week.

# Tally commits per hour, keeping all 24 hours even when one has no commit.
hour_counts = Counter(all_hours)
hour_df = pd.DataFrame(
    {"commit_count": [hour_counts[h] for h in range(24)]},
    index=pd.Index(list(range(24)), name="hour"),
)
# Share of the total number of commits that falls in each hour.
hour_df["distrib"] = hour_df["commit_count"].div(hour_df["commit_count"].sum())

# Calendar order of the weekdays, used as the row order of the weekday tables.
days_order = "Monday Tuesday Wednesday Thursday Friday Saturday Sunday".split()

# Tally commits per weekday, keeping all seven days even when one has no commit.
weekday_counts = Counter(all_weekdays)
weekday_df = pd.DataFrame(
    {"commit_count": [weekday_counts[day] for day in days_order]},
    index=pd.Index(days_order, name="weekday"),
)
# Share of the total number of commits that falls on each weekday.
weekday_df["distrib"] = weekday_df["commit_count"].div(
    weekday_df["commit_count"].sum()
)

hour_df.head(3)

	commit_count	distrib
hour
0	12	0.001578
1	3	0.000394
2	0	0.000000

# The shares are floating-point ratios, so their sum is compared to 1.0 with a
# tolerance rather than with exact equality.
assert np.isclose(hour_df["distrib"].sum(), 1.0)

weekday_df.head(3)

	commit_count	distrib
weekday
Monday	1527	0.200789
Tuesday	1264	0.166206
Wednesday	1291	0.169757

# The shares are floating-point ratios, so their sum is compared to 1.0 with a
# tolerance rather than with exact equality.
assert np.isclose(weekday_df["distrib"].sum(), 1.0)

Visualizations

Weekly distribution

Here we create a bar chart showing normalized commit frequency across days of the week.

# Bar chart of the share of commits per weekday, drawn through the Axes API.
fig, ax = plt.subplots(figsize=(10, 6))

# A single colour sampled at the midpoint of the colormap; the hourly bar
# charts further down reuse it.
colormap = tc.YlOrBr
color = colormap(0.5)

ax.bar(
    weekday_df.index,
    weekday_df["distrib"],
    color=color,
    alpha=0.7,
    edgecolor="black",
    linewidth=0.5,
)
ax.set_title("Distribution of commits by day of week", fontsize=18)
ax.set_xlabel("Weekday", fontsize=16)
ax.set_ylabel("Relative frequency", fontsize=16)
ax.tick_params(axis="both", labelsize=14)
ax.grid(axis="y", linestyle="--", alpha=0.3)
fig.tight_layout()

weekday

Hourly Distribution (Linear)

The next figure displays commit frequency across a 24-hour period.

# Bar chart of the share of commits per hour of day, one bar per hour 0-23.
fig, ax = plt.subplots(figsize=(14, 6))
bar_style = {
    "color": color,
    "alpha": 0.7,
    "edgecolor": "black",
    "linewidth": 0.5,
}
ax.bar(hour_df.index, hour_df["distrib"], **bar_style)

ax.set_title("Commit distribution by hour of day (linear)", fontsize=18, pad=20)
ax.set_xlabel("Hour of day", fontsize=16)
ax.set_ylabel("Relative frequency", fontsize=16)

# Half a unit of margin on each side so the first and last bars are not clipped.
ax.set_xlim(-0.5, 23.5)
ax.grid(axis="y", linestyle="--", alpha=0.3)
# One tick per hour, labelled with two digits ("00" ... "23").
ax.set_xticks(range(24))
ax.set_xticklabels([f"{h:02d}" for h in range(24)])
ax.tick_params(axis="both", labelsize=14)
fig.tight_layout()

hourly linear

Hourly Distribution (polar)

This is the same data as above, but plotted in polar coordinates.

# Same hourly shares drawn as a 24-hour clock face.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={"projection": "polar"})

# One angular position per hour, and one bar radius per hourly share.
theta = np.linspace(0, 2 * np.pi, 24, endpoint=False)
radii = hour_df["distrib"].values
ax.bar(
    theta,
    radii,
    width=2 * np.pi / 24,  # each bar spans exactly one 24th of the circle
    color=color,
    alpha=0.7,
    edgecolor="black",
    linewidth=0.5,
)

# Put angle zero (hour 00) at the top and make the angle grow clockwise.
ax.set_theta_zero_location("N")
ax.set_theta_direction(-1)
# A grid line every 15 degrees, i.e. one per hour, labelled "00" ... "23".
ax.set_thetagrids(range(0, 360, 15), [f"{h:02d}" for h in range(24)])

# Leave 10 % of headroom above the tallest bar and hide the radial tick labels.
ax.set_ylim(0, max(radii) * 1.1)
ax.set_yticklabels([])
ax.tick_params(axis="both", labelsize=14)
ax.set_title("Commit distribution by hour of day (polar)", fontsize=18, pad=20)
ax.grid(True, alpha=0.3)
fig.tight_layout()

hourly polar

Temporal heatmap

The figure here is a two-dimensional heatmap correlating weekday and hour-of-day commit patterns. Data is normalized to show percentage of total commits.

# One record per commit, so that commits can be counted per (weekday, hour).
commit_data = [
    {"hour": hour, "weekday": weekday}
    for hour, weekday in zip(all_hours, all_weekdays)
]

commit_df = pd.DataFrame(commit_data)
heatmap_data = commit_df.groupby(["weekday", "hour"]).size().unstack(fill_value=0)

# Force the full 7 x 24 grid in calendar order. fill_value=0 applies to both
# axes, so a weekday (or an hour) without any commit becomes zeros instead of
# NaN and the counts stay integers.
all_hours_cols = list(range(24))
heatmap_data = heatmap_data.reindex(
    index=days_order, columns=all_hours_cols, fill_value=0
)

# Express every cell as a percentage of the total number of commits.
heatmap_normalized = heatmap_data / heatmap_data.sum().sum() * 100

heatmap_normalized.head(3)

hour	0	1	...	22	23
weekday
Monday	0.000000	0.0	...	0.499671	0.092045
Tuesday	0.039448	0.0	...	0.197239	0.078895
Wednesday	0.078895	0.0	...	0.262985	0.026298

3 rows × 24 columns

# All cells together must add up to 100 %, within floating-point tolerance.
total_percentage = heatmap_normalized.sum().sum()
assert np.isclose(total_percentage, 100.0, rtol=1e-10, atol=1e-10)

# Heatmap of commit percentages: one row per weekday, one column per hour.
fig, ax = plt.subplots(figsize=(16, 8))
im = ax.imshow(
    heatmap_normalized,
    aspect="auto",
    cmap="tol.YlOrBr",
    interpolation="nearest",
    vmin=0,
)

# Titles and axis labels.
ax.set_title("Commit activity heatmap by hour and weekday", fontsize=18, pad=20)
ax.set_xlabel("Hour of day", fontsize=16)
ax.set_ylabel("Day of week", fontsize=16)

# Major ticks sit on the cell centres: hours along x, weekday names along y.
hour_positions = np.arange(24)
day_positions = np.arange(len(days_order))
ax.set_xticks(hour_positions)
ax.set_yticks(day_positions)
ax.set_xticklabels([f"{h:02d}" for h in range(24)])
ax.set_yticklabels(days_order)
plt.setp(ax.get_xticklabels(), rotation=0, ha="center")

# Colour scale legend.
cbar = fig.colorbar(im, ax=ax)
cbar.set_label(
    "Percentage of total commits (%)", rotation=270, labelpad=20, fontsize=16
)
cbar.ax.tick_params(labelsize=14)

# Minor ticks sit on the cell borders; the grid is drawn on them only, so the
# lines separate the cells instead of crossing them.
ax.set_xticks(hour_positions - 0.5, minor=True)
ax.set_yticks(day_positions - 0.5, minor=True)
ax.tick_params(axis="both", labelsize=14)
ax.grid(which="minor", color="gray", linestyle="-", linewidth=0.5, alpha=0.3)
fig.tight_layout()

heatmap

So it seems that Friday at 4 pm is my most productive hour of the week!