Analyzing Athlete Injury Prediction Data with Python

To explore the relationship between athlete attributes and injury likelihood, we first examine age, weight, and height using data aggregation and visualization.

Analyzing by Age Groups

Method 1: Pivot Table

# Count athletes per (age, injury-flag) cell; 'Recovery_Time' is only the
# column being counted, its values are not used.
age_df = pd.pivot_table(
    df,
    values='Recovery_Time',
    index='Player_Age',
    columns='Likelihood_of_Injury',
    aggfunc='count',
)

# Map the 0/1 column labels to descriptive names.
injury_dict = {0: 'NoLikelihood_Num', 1: 'Likelihood_Num'}
age_df.columns = age_df.columns.map(injury_dict)

# Turn the raw counts into row-wise percentages, rounded to 2 decimals.
total = age_df['NoLikelihood_Num'] + age_df['Likelihood_Num']
age_df['NoLikelihood_Per'] = (age_df['NoLikelihood_Num'] / total * 100).round(2)
age_df['Likelihood_Per'] = (age_df['Likelihood_Num'] / total * 100).round(2)

Method 2: GroupBy with Apply

def age_per(x):
    """Compute injury vs. no-injury percentages for one age-group slice.

    x: DataFrame subframe carrying a 0/1 'Likelihood_of_Injury' column.
    Returns a Series with 'Likelihood_Per' and 'NoLikelihood_Per',
    each rounded to 2 decimal places.
    """
    n = x.shape[0]
    injured = (x['Likelihood_of_Injury'] == 1).sum()
    uninjured = (x['Likelihood_of_Injury'] == 0).sum()
    return pd.Series({
        'Likelihood_Per': round(injured / n * 100, 2),
        'NoLikelihood_Per': round(uninjured / n * 100, 2),
    })

# Fix: group the row-level frame, not the pivot-table result. After the
# pivot/rename above, `age_df` contains only count/percentage columns —
# neither 'Age_Q' nor 'Likelihood_of_Injury' — so grouping it would raise a
# KeyError inside age_per. The weight (Weight_Q) and height (Height_Q)
# sections group the binned source frame the same way.
# NOTE(review): assumes an 'Age_Q' bin column was added to df earlier — verify.
age_df = df.groupby(by=['Age_Q'], as_index=False).apply(age_per)
display(age_df)

Visualization

# --- Overlaid bar chart of per-age-group injury percentages ---
sns.set_palette(my_palette)
plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font for labels
plt.figure(figsize=(8, 5))

# Wide bars: injured share; narrow bars drawn on top of them: uninjured share.
ax = sns.barplot(x='Age_Q', y='Likelihood_Per', data=age_df, label='Likelihood', width=0.2)
sns.barplot(x='Age_Q', y='NoLikelihood_Per', data=age_df, label='NoLikelihood', ax=ax, width=0.06)

ax.set_title('Percentage of Athletes by Age Group and Injury Likelihood', pad=10, fontsize=14)
ax.set_xlabel('Age_Q')
ax.set_ylabel('per(%)')

# Annotate every bar with its percentage value just above the bar top.
for _, rec in age_df.iterrows():
    for col in ('Likelihood_Per', 'NoLikelihood_Per'):
        ax.text(rec.name, rec[col], str(rec[col]) + '%', ha='center', va='bottom', fontsize=10)

plt.legend(loc=4)
plt.show()

Insight: Young adulthood is often considered a prime period in sports; training intensity may be higher, leading to a greater chance of injury.

Analyzing by Weight Groups

weight_df = df.copy()

# Method 1: quantile-based bins (equal-frequency; edges may lack practical meaning).
# Fix: pd.cut leaves the left edge open by default, so when the lowest bin
# edge equals the minimum weight the lightest athlete would get NaN instead
# of a label; include_lowest=True closes the first interval.
weight_bins = np.quantile(weight_df['Player_Weight'], np.linspace(0, 1, 5))
weight_df['Weight_Q'] = pd.cut(weight_df['Player_Weight'], bins=weight_bins,
                               labels=['40-68kg', '68-75kg', '75-81kg', '81-105kg'],
                               include_lowest=True)

# Method 2: manual quartile boundaries (equivalent to Method 1). Same fix:
# the first edge is the column minimum, so the first interval must be closed.
weight_max = weight_df['Player_Weight'].max()
weight_min = weight_df['Player_Weight'].min()
weight_q2 = weight_df['Player_Weight'].quantile(0.25)
weight_q3 = weight_df['Player_Weight'].quantile(0.5)
weight_q4 = weight_df['Player_Weight'].quantile(0.75)
weight_bins = [weight_min, weight_q2, weight_q3, weight_q4, weight_max]
weight_df['Weight_Q'] = pd.cut(weight_df['Player_Weight'], bins=weight_bins,
                               labels=['40-68kg', '68-75kg', '75-81kg', '81-105kg'],
                               include_lowest=True)

# Method 3: custom bins — this is the version actually kept (it overwrites the
# previous two). Its first edge (0) is below any real weight, so no fix needed.
weight_df['Weight_Q'] = pd.cut(weight_df['Player_Weight'], bins=[0, 50, 75, 90, 110], labels=['<50kg', '50-75kg', '75-90kg', '>90kg'])

def weight_per(x):
    """Percentage of injured (1) vs. uninjured (0) athletes in a weight-group slice.

    Returns a Series keyed 'Likelihood_Per' / 'NoLikelihood_Per', rounded to 2 dp.
    """
    flags = x['Likelihood_of_Injury']
    total = x.shape[0]

    def pct(count):
        # Share of the group, as a percentage rounded to 2 decimals.
        return round(count / total * 100, 2)

    return pd.Series({
        'Likelihood_Per': pct((flags == 1).sum()),
        'NoLikelihood_Per': pct((flags == 0).sum()),
    })

# Collapse each weight bracket to its injury/no-injury percentage pair.
weight_df = (
    weight_df
    .groupby('Weight_Q', as_index=False)
    .apply(weight_per)
)
display(weight_df)

# --- Overlaid bar chart of per-weight-group injury percentages ---
sns.set_palette(my_palette)
plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font
plt.figure(figsize=(8, 5))

# Wide bars: injured share; narrow bars on top: uninjured share.
ax = sns.barplot(x='Weight_Q', y='Likelihood_Per', data=weight_df, width=0.2, label='Likelihood_Per')
sns.barplot(x='Weight_Q', y='NoLikelihood_Per', data=weight_df, ax=ax, width=0.06, label='NoLikelihood_Per')

# Annotate bars; the color-name prefix ties each label to its series.
for _, rec in weight_df.iterrows():
    plt.text(rec.name, rec['Likelihood_Per'], 'Green: ' + str(rec['Likelihood_Per']) + '%', ha='center', va='bottom')
    plt.text(rec.name, rec['NoLikelihood_Per'], 'Red: ' + str(rec['NoLikelihood_Per']) + '%', ha='center', va='top')

plt.title('Percentage of Athletes by Weight Group and Injury Likelihood', pad=10, fontsize=14)
plt.legend(loc=4)
plt.show()

Insight: Athletes under 50kg show an 83.3% injury rate in this sample, suggesting that lighter athletes may be more prone to injuries — though this group is likely small, so the estimate should be treated with caution.

Analyzing by Height Groups

height_df = df.copy()

# Quantile-based bins (equal-frequency; edges may lack practical meaning).
# Fix: pd.cut's first interval is open on the left by default, so the shortest
# athlete (whose height equals the first bin edge) would get NaN instead of a
# label; include_lowest=True closes the first interval.
height_bins = np.quantile(height_df['Player_Height'], np.linspace(0, 1, 5))
height_df['Height_Q'] = pd.cut(height_df['Player_Height'], bins=height_bins,
                               labels=['145-173cm', '173-180cm', '180-187cm', '187-207cm'],
                               include_lowest=True)

# Custom bins — this is the version actually kept (it overwrites the one
# above). Its first edge (0) is below any real height, so no fix is needed.
height_df['Height_Q'] = pd.cut(height_df['Player_Height'], bins=[0, 165, 175, 185, 210], labels=['<165cm', '165-175cm', '175-185cm', '>185cm'])

def height_per(x):
    """Injury/no-injury percentage breakdown for one height-group slice.

    x must carry a 0/1 'Likelihood_of_Injury' column; returns a Series with
    'Likelihood_Per' and 'NoLikelihood_Per', each rounded to 2 decimals.
    """
    total = x.shape[0]
    # Count rows per flag value (1 = injured, 0 = uninjured).
    counts = {flag: (x['Likelihood_of_Injury'] == flag).sum() for flag in (1, 0)}
    return pd.Series({
        'Likelihood_Per': round(counts[1] / total * 100, 2),
        'NoLikelihood_Per': round(counts[0] / total * 100, 2),
    })

# Collapse each height bracket to its injury/no-injury percentage pair.
height_df = (
    height_df
    .groupby('Height_Q', as_index=False)
    .apply(height_per)
)
display(height_df)

# --- Overlaid bar chart of per-height-group injury percentages ---
sns.set_palette(my_palette)
plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font
plt.figure(figsize=(8, 5))

# Wide bars: injured share; narrow bars drawn on top: uninjured share.
ax = sns.barplot(x='Height_Q', y='Likelihood_Per', data=height_df, width=0.2, label='Likelihood_Per')
sns.barplot(x='Height_Q', y='NoLikelihood_Per', data=height_df, ax=ax, width=0.06, label='NoLikelihood_Per')

# Annotate each bar with its percentage value.
for _, rec in height_df.iterrows():
    for col in ('Likelihood_Per', 'NoLikelihood_Per'):
        plt.text(rec.name, rec[col], str(rec[col]) + '%', ha='center', va='bottom', fontsize=10)

plt.title('Percentage of Athletes by Height Group and Injury Likelihood', pad=10, fontsize=14)
plt.legend(loc=4)
plt.show()

Insight: Athletes in the 175-185cm range appear more prone to injury compared to other height groups.

Step 2: Feature Correlation Analysis

We use Pearson correlation coefficient.

# Pairwise Pearson correlation matrix across the frame's columns.
# NOTE(review): recent pandas versions raise on non-numeric columns unless
# numeric_only=True is passed — confirm df is all-numeric at this point.
correlation_matrix = df.corr(method='pearson')
display(correlation_matrix)

plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font for labels
plt.figure(figsize=(6, 5))

# annot=True prints each coefficient inside its cell.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidth=0.5)
plt.title('Correlation Heatmap of Athlete Attributes and Injury Likelihood', pad=20)
plt.show()

Summary: The correlation between injury likelihood and training intensity is 0.089, indicating a weak positive relationship.

Step 3: Feature Importance Analysis

We use a Random Forest classifier to evaluate feature importance.

# --- Random-forest accuracy check + feature importances ---
# Prepare data: features are every column but the last, target is the injury
# flag. NOTE(review): assumes 'Likelihood_of_Injury' is the last column — verify.
X, y = df.iloc[:, :-1], df['Likelihood_of_Injury']

# Hold out 20% of the rows for an out-of-sample accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model.
# NOTE(review): no random_state on the forest, so accuracy/importances vary
# between runs; pass random_state=... if reproducibility matters.
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict and evaluate.
predictions = model.predict(X_test)
# Fix: round AFTER scaling to percent. The old round-then-multiply order
# (round(acc, 2) * 100) could print FP artifacts like 56.99999999999999.
print(f"Model prediction accuracy: {round(accuracy_score(y_test, predictions) * 100, 2)}%\n")

# Retrain on the full dataset so the importances reflect all rows (the
# accuracy above was computed before this refit, so it is not contaminated).
model.fit(X, y)
feature_importances = (model.feature_importances_ * 100).round(2)

# One row per feature, sorted by descending importance (in percent).
fi_df = pd.DataFrame(data=feature_importances, index=df.columns[:-1],
                     columns=['Importance']).sort_values('Importance', ascending=False)

# --- Horizontal bar chart of feature importances ---
sns.set_palette(my_palette)
plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font
plt.figure(figsize=(8, 5))
plt.title('Random Forest Feature Importances', pad=20)

sns.barplot(x='Importance', y=fi_df.index, data=fi_df, width=0.5)

# Print each importance just inside the right end of its bar.
for _, rec in fi_df.iterrows():
    plt.text(rec['Importance'], rec.name, str(rec['Importance']) + '%', ha='right', va='center')

plt.show()

The model achieves only 53% accuracy — barely above chance for a binary target — suggesting other models or additional features might be more suitable. The feature importance plot shows that training intensity, weight, and height are relatively important, which aligns with earlier analyses.

Tags: python Data Analysis Injury Prediction Sports Analytics Random Forest

Posted on Fri, 15 May 2026 01:59:52 +0000 by cneumann