Skip to content

Working with CSV, Excel, and SQL in Pandas

DodaTech 3 min read

In this tutorial, you'll learn about Working with CSV, Excel, and SQL in Pandas. We cover key concepts, practical examples, and best practices to help you understand and apply this topic effectively.

What You'll Learn

Read and write data from multiple formats with pandas — CSV, Excel, SQL, JSON, Parquet, and best practices for handling large files.

Why It Matters

Data comes in many formats. Knowing how to read and write each one makes you productive with any dataset you encounter.

Real-World Use

Loading weekly CSV exports from a database, reading Excel reports from stakeholders, querying a PostgreSQL database directly, or saving results to Parquet for fast reloading.

CSV Files

import pandas as pd

# Basic read
df = pd.read_csv("data.csv")

# Common options
df = pd.read_csv(
    "data.csv",
    sep=",",                 # Delimiter (also: \t for TSV)
    header=0,                # Row 0 is column names
    index_col=0,             # First column as index
    usecols=["name", "age"],  # Only read specific columns
    nrows=1000,              # Only read first 1000 rows
    skiprows=[1, 3],         # Skip specific rows
    encoding="utf-8",        # File encoding
    parse_dates=["date"],   # Parse date columns
    na_values=["NA", "N/A", ""],  # Treat as missing
    dtype={"id": int, "price": float},  # Explicit types
)

# Write to CSV
df.to_csv("output.csv", index=False)
df.to_csv("output.csv.gz", index=False, compression="gzip")

Excel Files

# Read Excel
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
df = pd.read_excel("data.xlsx", sheet_name=0)  # First sheet

# Read all sheets
all_sheets = pd.read_excel("data.xlsx", sheet_name=None)
for sheet_name, df in all_sheets.items():
    print(f"{sheet_name}: {df.shape}")

# Write to Excel
with pd.ExcelWriter("report.xlsx") as writer:
    df_summary.to_excel(writer, sheet_name="Summary", index=False)
    df_details.to_excel(writer, sheet_name="Details", index=False)

# Format Excel output
with pd.ExcelWriter("formatted.xlsx", engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name="Sales", index=False)
    workbook = writer.book
    worksheet = writer.sheets["Sales"]

    # Add formatting
    header_format = workbook.add_format({
        "bold": True,
        "bg_color": "#3498db",
        "font_color": "white",
    })
    for col_num, col_name in enumerate(df.columns):
        worksheet.write(0, col_num, col_name, header_format)

SQL Databases

from sqlalchemy import create_engine

# PostgreSQL
engine = create_engine("postgresql://user:pass@localhost:5432/mydb")

# MySQL
engine = create_engine("mysql+pymysql://user:pass@localhost/mydb")

# SQLite
engine = create_engine("sqlite:///data.db")

# Read full table
df = pd.read_sql_table("customers", engine)

# Read with query
df = pd.read_sql_query(
    "SELECT * FROM orders WHERE amount > 100 ORDER BY date DESC",
    engine,
)

# Write to database
df.to_sql("processed_data", engine, if_exists="replace", index=False)

# Chunked read for large tables
for chunk in pd.read_sql_query("SELECT * FROM large_table", engine, chunksize=10000):
    process_chunk(chunk)

JSON

# Read JSON
df = pd.read_json("data.json")

# Read JSON lines (one JSON object per line)
df = pd.read_json("data.jsonl", lines=True)

# From a JSON string
import json
json_str = '[{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}]'
df = pd.read_json(json_str)

# Write to JSON
df.to_json("output.json", orient="records", lines=True)

Parquet (Fast, Columnar)

# Parquet is much faster and smaller than CSV
df = pd.read_parquet("data.parquet")
df.to_parquet("output.parquet", compression="snappy")

# Read specific columns only
df = pd.read_parquet("data.parquet", columns=["id", "name"])

Working with Large Files

# Read in chunks
chunks = []
for chunk in pd.read_csv("large_file.csv", chunksize=50000):
    # Process each chunk
    chunk = chunk[chunk["amount"] > 0]
    chunks.append(chunk)

result = pd.concat(chunks)

# Read specific columns to reduce memory
df = pd.read_csv("large_file.csv", usecols=["id", "name", "amount"])

# Specify dtypes to save memory
df = pd.read_csv(
    "large_file.csv",
    dtype={
        "id": "int32",
        "category": "category",
        "amount": "float32",
    },
)

# Memory usage comparison
df.info(memory_usage="deep")

File Size Comparison

import numpy as np

# Create a large dataset
n = 1000000
df = pd.DataFrame({
    "id": range(n),
    "value": np.random.randn(n),
    "category": np.random.choice(["A", "B", "C", "D"], n),
})

# Save in different formats
df.to_csv("data.csv", index=False)           # ~35 MB
df.to_parquet("data.parquet")                # ~4 MB  (10× smaller!)
df.to_pickle("data.pkl")                     # ~16 MB

# Parquet + compression is usually the best choice

Quick Reference

Format Read Write Best For
CSV read_csv() to_csv() Universal, human-readable
Excel read_excel() to_excel() Business reports
SQL read_sql() to_sql() Database integration
JSON read_json() to_json() API data
Parquet read_parquet() to_parquet() Performance, storage
Pickle read_pickle() to_pickle() Python-specific, fast

Built by the developers of DodaTech

Doda Browser, DodaZIP & Durga Antivirus Pro