Correlation Analysis with Pandas and Seaborn

DodaTech 3 min read

In this tutorial, you'll learn about Correlation Analysis with Pandas and Seaborn. We cover key concepts, practical examples, and best practices to help you understand and apply this topic effectively.

What You'll Learn

Perform correlation analysis with pandas and Seaborn — calculate correlation coefficients, visualize correlation matrices, interpret results, and avoid common pitfalls.

Why It Matters

Correlation helps you understand relationships between variables, identify redundant features, and discover which features predict your target.

Real-World Use

Finding which features most strongly predict house prices, detecting multicollinearity before regression, or discovering that ice cream sales correlate with crime (spoiler: both increase in summer).

Types of Correlation

Type	Range	Detects	Best For
Pearson (r)	[-1, 1]	Linear relationships	Normally distributed data
Spearman (ρ)	[-1, 1]	Monotonic relationships	Ordinal data, non-linear
Kendall (τ)	[-1, 1]	Ordinal associations	Small samples, ties

Pearson, Spearman, and Kendall Correlation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the housing dataset into a DataFrame
df = pd.read_csv("housing.csv")

# The two columns whose relationship is measured below
living_area = df["sqft_living"]
sale_price = df["price"]

# Pearson (linear) — used by Series.corr when no method is given
pearson_corr = living_area.corr(sale_price)
print(f"Pearson r = {pearson_corr:.3f}")
# r ≈ 0.70 — strong positive correlation

# Spearman (rank-based, detects monotonic relationships)
spearman_corr = living_area.corr(sale_price, method="spearman")
print(f"Spearman ρ = {spearman_corr:.3f}")

# Kendall (rank-based, ordinal association)
kendall_corr = living_area.corr(sale_price, method="kendall")
print(f"Kendall τ = {kendall_corr:.3f}")

Correlation Matrix

# Numeric columns only — non-numeric columns cannot be correlated
numeric_df = df.select_dtypes(include=[np.number])

# Pearson correlation matrix (the default method of DataFrame.corr)
corr_matrix = numeric_df.corr()

# Rank every numeric column by its correlation with the target.
# target_corr deliberately keeps the "price" row (r = 1.0 with itself).
target_corr = corr_matrix["price"].sort_values(ascending=False)

# Print correlations with target, skipping the target's own trivial
# r = 1.0 row so that the five printed rows are five actual features.
print("Top 5 correlations with price:")
print(target_corr.drop("price").head(5))
# sqft_living    0.702
# grade          0.667
# sqft_above     0.606
# bathrooms      0.525
# bedrooms       0.308

Visualizing Correlation

Heatmap

# Heatmap styling: a fixed [-1, 1] colour scale centred on 0, with each
# cell annotated by its coefficient rounded to two decimals.
heatmap_options = {
    "annot": True,
    "fmt": ".2f",
    "cmap": "coolwarm",
    "vmin": -1,
    "vmax": 1,
    "center": 0,
    "square": True,
    "linewidths": 0.5,
}

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix of Housing Features")
plt.tight_layout()
plt.show()

Pairplot

# Names of the five highest-ranked entries of target_corr
top_features = list(target_corr.index[:5])

# Scatter plot for every pair of those columns, KDE curves on the diagonal
pairplot_options = {
    "diag_kind": "kde",
    "plot_kws": {"alpha": 0.5},
}
sns.pairplot(df[top_features], **pairplot_options)
plt.suptitle("Pairplot of Top Features", y=1.02)
plt.show()

Correlation Bar Chart

# Correlation of every feature with price, excluding price itself
feature_corr = target_corr.drop("price")

plt.figure(figsize=(10, 6))
feature_corr.plot.bar()
# Thin reference line at zero separates positive from negative correlations
plt.axhline(0, color="gray", linestyle="-", linewidth=0.5)
plt.title("Feature Correlations with Price")
plt.ylabel("Pearson Correlation")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Interpreting Correlation

r = +1.0: Perfect positive correlation
r = +0.7: Strong positive
r = +0.5: Moderate positive
r = +0.3: Weak positive
r =  0.0: No linear correlation
r = -0.3: Weak negative
r = -0.5: Moderate negative
r = -0.7: Strong negative
r = -1.0: Perfect negative correlation

Correlation ≠ Causation

# Classic example: ice cream sales vs. drowning
# One value per calendar month, January through December.
monthly_ice_cream = [10, 12, 15, 20, 30, 40, 45, 42, 35, 25, 15, 10]
monthly_drownings = [2, 2, 3, 5, 8, 12, 14, 13, 9, 6, 3, 2]

data = pd.DataFrame({
    "month": range(1, 13),
    "ice_cream_sales": monthly_ice_cream,
    "drowning_incidents": monthly_drownings,
})

ice_cream_vs_drowning = data["ice_cream_sales"].corr(data["drowning_incidents"])
print(ice_cream_vs_drowning)
# r ≈ 0.99 — very high correlation

# But ice cream doesn't cause drowning!
# The confounder is temperature (summer → both increase)

Detecting Multicollinearity

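High correlation between features (multicollinearity) hurts linear models: the coefficient estimates become unstable and hard to interpret, because the model cannot tell which of the correlated features is responsible for the effect.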

# Find highly correlated feature pairs
# Work on a separate signed matrix so that corr_matrix is not overwritten
# and the sign of each coefficient is still available when printing.
signed_corr = numeric_df.corr()

# Keep only the strict upper triangle (k=1 excludes the diagonal): every
# pair then appears exactly once and self-correlations are masked to NaN.
upper_tri = signed_corr.where(
    np.triu(np.ones(signed_corr.shape), k=1).astype(bool)
)

# Find pairs with |r| > 0.8
# Masked cells are NaN, and a comparison with NaN is False, so they are
# skipped without an explicit col1 != col2 check.
high_corr = [
    (col1, col2, upper_tri.loc[col1, col2])
    for col1 in upper_tri.index
    for col2 in upper_tri.columns
    if abs(upper_tri.loc[col1, col2]) > 0.8
]

print("Highly correlated feature pairs:")
# Strongest relationship first, regardless of direction
for col1, col2, r in sorted(high_corr, key=lambda x: -abs(x[2])):
    print(f"  {col1} × {col2}: r = {r:.2f}")

Partial Correlation

Partial correlation measures the relationship between two variables after removing the linear effect of a third (control) variable from both of them:

from scipy import stats
from sklearn.linear_model import LinearRegression

def partial_corr(x, y, z):
    """Compute partial correlation between x and y controlling for z.

    Both x and y are regressed on z with ordinary least squares, and the
    Pearson correlation of the two residual series is returned.

    Args:
        x: First variable, n samples; a 1-D sequence or an (n, 1) column.
        y: Second variable, n samples; a 1-D sequence or an (n, 1) column.
        z: Control variable(s); 1-D of length n, or 2-D of shape (n, k).

    Returns:
        Pearson r between the residuals of x and the residuals of y.
    """
    # pearsonr needs two 1-D samples. Fitting on a column-vector target makes
    # predict() return an (n, 1) array, so flatten both targets up front.
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()

    # The regressors must form a 2-D matrix; promote a single 1-D control.
    z = np.asarray(z)
    if z.ndim == 1:
        z = z.reshape(-1, 1)

    # Regress x on z and keep what z cannot explain
    residuals_x = x - LinearRegression().fit(z, x).predict(z)

    # Regress y on z and keep what z cannot explain
    residuals_y = y - LinearRegression().fit(z, y).predict(z)

    # Correlation of residuals
    return stats.pearsonr(residuals_x, residuals_y)[0]

# Example: correlation between sqft_living and price, controlling for bedrooms
# x and y stay 1-D so that the residuals handed to pearsonr are 1-D too.
# Only z, the regressor matrix, has to be 2-D; the double brackets
# select a one-column DataFrame.
x = df["sqft_living"].values
y = df["price"].values
z = df[["bedrooms"]]

print(f"Simple correlation: {df['sqft_living'].corr(df['price']):.3f}")
print(f"Partial correlation: {partial_corr(x, y, z):.3f}")

Complete Analysis Function

def correlation_report(df, target_col=None, threshold=0.5):
    """Print a correlation summary for the numeric columns of df.

    Args:
        df: DataFrame to analyse; non-numeric columns are ignored.
        target_col: Optional column name. When it is one of the numeric
            columns, its ten highest correlations with the other columns
            are printed (sorted descending by signed value).
        threshold: Column pairs whose |r| is strictly greater than this
            value are listed, strongest first.

    Returns:
        The Pearson correlation matrix of the numeric columns.
    """
    numeric_only = df.select_dtypes(include=[np.number])
    corr = numeric_only.corr()

    banner = "=" * 60
    print(banner)
    print("CORRELATION ANALYSIS REPORT")
    print(banner)

    if target_col and target_col in corr.columns:
        ranked = corr[target_col].drop(target_col).sort_values(ascending=False)
        print(f"\n📊 Top correlations with '{target_col}':")
        print(ranked.head(10))

    print(f"\n🔗 Highly correlated pairs (>|{threshold}|):")

    # Mask everything except the strict upper triangle so each pair is
    # considered once; masked cells become NaN and never pass the filter.
    keep = np.triu(np.ones(corr.shape), k=1).astype(bool)
    upper = corr.where(keep)

    strong_pairs = []
    for row_name in upper.columns:
        for col_name in upper.columns:
            r = upper.loc[row_name, col_name]
            if abs(r) > threshold:
                strong_pairs.append((row_name, col_name, r))

    # list.sort is stable, so ties keep their matrix order
    strong_pairs.sort(key=lambda pair: -abs(pair[2]))
    for row_name, col_name, r in strong_pairs:
        print(f"  {row_name} × {col_name}: {r:.2f}")

    return corr

← Previous Data Normalization and Standardization Techniques Next → Data Visualization Best Practices

Built by the developers of DodaTech

Doda Browser, DodaZIP & Durga Antivirus Pro

Home Browse Data Science