Skip to content

Feature Engineering for Machine Learning

DodaTech 3 min read

In this tutorial, you'll learn about Feature Engineering for Machine Learning. We cover key concepts, practical examples, and best practices to help you understand and apply this topic effectively.

What You'll Learn

Engineer features for Machine Learning models — create numeric features from raw data, encode categorical variables, handle dates, create interactions, and select features.

Why It Matters

Better features beat better models. Feature engineering is where domain knowledge meets data science — it's often the difference between a good model and a great one.

Real-World Use

Creating day-of-week features from timestamps, encoding product categories for a recommendation system, or generating price-to-income ratio from two raw columns.

Numeric Features

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the raw data. The steps below read the columns "price", "area",
# "bedrooms", "bathrooms" and "house_age".
df = pd.read_csv("house_prices.csv")

# Log transform (for skewed distributions).
# log1p(x) == log(1 + x), so a value of 0 maps to 0 instead of -inf.
df["price_log"] = np.log1p(df["price"])

# Square and polynomial features
df["area_sq"] = df["area"] ** 2
df["area_cubed"] = df["area"] ** 3

# Ratios (domain-specific).
# Dividing by 0 would produce +/-inf, which most estimators reject, so zero
# denominators are turned into NaN (missing) before dividing.
df["price_per_sqft"] = df["price"] / df["area"].replace(0, np.nan)
df["bedrooms_per_bath"] = df["bedrooms"] / df["bathrooms"].replace(0, np.nan)

# Binning.
# pd.cut builds right-closed intervals, i.e. (0, 5], (5, 20], (20, 50],
# (50, 100]. include_lowest=True closes the first interval on the left so a
# house_age of exactly 0 is labelled "New" instead of becoming NaN.
# Ages above 100 (or below 0) still fall outside every bin and map to NaN.
df["age_group"] = pd.cut(
    df["house_age"],
    bins=[0, 5, 20, 50, 100],
    labels=["New", "Recent", "Old", "Historic"],
    include_lowest=True,
)

# Scaling (for models like SVM, neural networks): zero mean, unit variance.
# fit_transform returns a 2-D (n_samples, 1) array; ravel() flattens it so a
# plain 1-D column is assigned.
scaler = StandardScaler()
df["price_scaled"] = scaler.fit_transform(df[["price"]]).ravel()

# Min-Max scaling (to [0, 1]).
# NOTE: this rebinds `scaler`; the StandardScaler fitted above is no longer
# reachable through that name afterwards.
scaler = MinMaxScaler()
df["area_norm"] = scaler.fit_transform(df[["area"]]).ravel()

Categorical Features

# One-hot encoding with pandas: one indicator column per distinct
# neighborhood value, named "hood_<value>", appended to df.
encoded = pd.get_dummies(df["neighborhood"], prefix="hood")
df = pd.concat([df, encoded], axis=1)

# sklearn's OneHotEncoder (better for production): the fitted encoder can be
# reused on new data, and handle_unknown="ignore" encodes categories unseen
# during fit as an all-zero row instead of raising.
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded = encoder.fit_transform(df[["neighborhood"]])
feature_names = encoder.get_feature_names_out(["neighborhood"])
# Reuse df's index so encoded_df lines up row-for-row with df when the two
# are joined or concatenated later.
encoded_df = pd.DataFrame(encoded, columns=feature_names, index=df.index)

# Label encoding: maps each distinct value to an integer code.
# LabelEncoder assigns codes in sorted order of the values, so the integers
# do not express any domain ordering. scikit-learn documents it for encoding
# targets; for genuinely ordinal features prefer OrdinalEncoder with an
# explicit `categories` order.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["city_encoded"] = le.fit_transform(df["city"])

# Target encoding (mean of target per category).
# NOTE(review): the means are computed over every row, including the row
# being encoded, so the feature leaks the target. Compute the means on the
# training split only (or out-of-fold) before using this for modelling.
target_mean = df.groupby("neighborhood")["price"].mean()
df["neighborhood_target_encoded"] = df["neighborhood"].map(target_mean)

# Frequency encoding: share of all rows that carry each neighborhood value.
freq = df["neighborhood"].value_counts() / len(df)
df["neighborhood_freq"] = df["neighborhood"].map(freq)

Date Features

# Convert to datetime so the .dt accessor is available below.
df["date"] = pd.to_datetime(df["date"])

# Time-based features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["dayofweek"] = df["date"].dt.dayofweek  # Monday=0 ... Sunday=6
df["quarter"] = df["date"].dt.quarter
df["is_weekend"] = df["date"].dt.dayofweek >= 5  # Saturday or Sunday
df["is_month_start"] = df["date"].dt.is_month_start
df["is_month_end"] = df["date"].dt.is_month_end

# The cyclical encoding below reads an "hour" column. Derive it from the
# timestamp when the data does not already provide one, so the lookup cannot
# fail with a KeyError; an existing "hour" column is left untouched.
if "hour" not in df.columns:
    df["hour"] = df["date"].dt.hour

# Cyclical encoding (for models that don't handle circular features).
# Mapping a periodic value onto sin/cos places the last and first value of the
# cycle (December/January, 23h/0h) next to each other.
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

# Time since a reference date, in whole days (negative before the reference).
reference = pd.Timestamp("2020-01-01")
df["days_since_reference"] = (df["date"] - reference).dt.days

# Lag features.
# Rows must be in chronological order first, because shift() works on row
# position. The first 1 (resp. 7) rows have no predecessor and become NaN.
df = df.sort_values("date")
df["sales_lag_1"] = df["sales"].shift(1)
df["sales_lag_7"] = df["sales"].shift(7)

# Rolling features.
# Each window ends at, and includes, the current row; rows without a full
# window (the first 6 resp. 29) are NaN.
df["sales_rolling_7"] = df["sales"].rolling(7).mean()
df["sales_rolling_30"] = df["sales"].rolling(30).mean()

Interaction Features

from sklearn.preprocessing import PolynomialFeatures

# Manual interactions: element-wise products of two columns.
df["age_income_interaction"] = df["age"] * df["income"]
df["bedrooms_x_area"] = df["bedrooms"] * df["area"]

# PolynomialFeatures (automated).
# degree=2 with interaction_only=True yields the original columns plus every
# pairwise product (no squares); include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
feature_cols = ["age", "income", "rooms"]
poly_features = poly.fit_transform(df[feature_cols])

# Get feature names.
# fit_transform returns a bare array, so df's index is re-attached here. If df
# has been re-ordered or filtered, a default 0..n-1 index would pair the wrong
# rows when poly_df is joined back to df.
poly_names = poly.get_feature_names_out(feature_cols)
poly_df = pd.DataFrame(poly_features, columns=poly_names, index=df.index)

Text Features

# Length of text: characters, and whitespace-separated words.
df["description_length"] = df["description"].str.len()
df["word_count"] = df["description"].str.split().str.len()

# Number of capitalised runs: an uppercase letter followed by one or more
# lowercase letters. The pattern is not anchored to word boundaries, so
# "iPhone" contributes one match ("Phone"). str.count gives the same number as
# counting the findall() results without building the intermediate lists.
df["capital_words"] = df["description"].str.count(r"[A-Z][a-z]+")

# Count hashtags and @-mentions.
# NOTE(review): these read the "text" column while every other feature in this
# section reads "description" - confirm that both columns exist in the data.
df["hashtag_count"] = df["text"].str.count(r"#\w+")
df["mention_count"] = df["text"].str.count(r"@\w+")

# TF-IDF features (for text-heavy data)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=100)
# The vectorizer expects string documents; a missing description (NaN) would
# make it fail, so missing values are treated as empty documents.
tfidf_features = vectorizer.fit_transform(df["description"].fillna(""))
# Densify the sparse matrix and keep df's index so the rows stay aligned with
# df when the two frames are combined.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

Feature Selection

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Separate features and target: X is every numeric column except "target".
X = df.select_dtypes(include=[np.number]).drop("target", axis=1)
y = df["target"]

# Remove low-variance features (variance not above 0.01).
# The result is a plain array without column names.
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.01)
X_high_variance = selector.fit_transform(X)

# Select K best with F-test.
# k is capped at the number of available columns so the selector stays valid
# when X has fewer than 10 features.
# NOTE: `selector` is rebound here and the selection runs on the full X, not
# on X_high_variance.
selector = SelectKBest(score_func=f_regression, k=min(10, X.shape[1]))
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

# Mutual information.
# The estimator adds a little random noise to continuous features; fixing
# random_state makes the scores reproducible between runs.
mi_scores = mutual_info_regression(X, y, random_state=0)
mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

# Correlation with target: absolute correlation, strongest first.
correlations = X.corrwith(y).abs().sort_values(ascending=False)

Feature Engineering Pipeline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline


class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Turn datetime columns into numeric calendar features.

    For every input column the transformer emits, in this order: year, month,
    day of month, day of week (Monday=0) and quarter.
    """

    def fit(self, X, y=None):
        """Do nothing (the extraction is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a float array with five calendar features per input column.

        ``X`` is a DataFrame (or anything ``pd.DataFrame`` accepts) whose
        columns hold datetimes or values ``pd.to_datetime`` can parse.
        """
        frame = pd.DataFrame(X)
        if frame.shape[1] == 0:
            # Nothing to extract: keep the row count, emit zero columns.
            return np.empty((len(frame), 0))
        pieces = []
        for column in frame.columns:
            stamps = pd.to_datetime(frame[column])
            pieces.extend(
                [
                    stamps.dt.year,
                    stamps.dt.month,
                    stamps.dt.day,
                    stamps.dt.dayofweek,
                    stamps.dt.quarter,
                ]
            )
        return np.column_stack(pieces).astype(float)


# Columns routed to each branch of the preprocessor.
numeric_features = ["age", "income", "rooms"]
categorical_features = ["neighborhood", "city"]
date_features = ["date"]

# Numeric branch: standardise, then add pairwise interaction terms.
# include_bias=False omits the constant all-ones column, which carries no
# information for the model.
numeric_transformer = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
])

# Categorical branch: one-hot encode; categories unseen during fit are encoded
# as all zeros instead of raising.
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Date branch: expand each timestamp into calendar features.
date_transformer = Pipeline([
    ("extract", DateFeatureExtractor()),
])

# Apply each branch to its own columns and concatenate the outputs. Columns
# not listed in any branch are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
    ("date", date_transformer, date_features),
])

# Full pipeline: preprocessing followed by the regressor, so the same
# transformations are applied at fit and predict time.
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor()),
])

Quick Reference

| Technique | Type | When to Use |
| --- | --- | --- |
| Log transform | Numeric | Skewed distributions |
| One-hot encoding | Categorical | Nominal categories (< 20) |
| Label encoding | Categorical | Ordinal categories |
| Cyclical encoding | Temporal | Hour, month, day of week |
| Lag features | Temporal | Time series forecasting |
| Interaction features | Any | When features might have combined effects |
| TF-IDF | Text | Text classification, NLP |

Built by the developers of DodaTech

Doda Browser, DodaZIP & Durga Antivirus Pro