Table of Contents

  • Overview
  • Data Cleaning
  • Data Transformation
  • Data Aggregation
  • Performance Optimization
  • Data Validation
  • Best Practices

Overview

Data processing involves cleaning, transforming, and preparing data for analysis. This guide covers essential techniques using pandas, NumPy, scikit-learn, and pandera.

Data Cleaning

Handling Missing Values

import pandas as pd
import numpy as np

df = pd.read_csv('data.csv')

# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values
df_clean = df.dropna()

# Fill missing values with the column mean
df['column'] = df['column'].fillna(df['column'].mean())

# Alternatively, forward-fill from the last valid observation
df['column'] = df['column'].ffill()
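
Dropping every row that has any missing value is often too aggressive. dropna can target only the columns that matter, or require a minimum number of non-null fields; a short sketch (column names are illustrative):

# Drop only rows missing critical fields
df_clean = df.dropna(subset=['price', 'date'])

# Or keep rows that have at least 3 non-null values
df_clean = df.dropna(thresh=3)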

Removing Duplicates

# Check for duplicates
print(df.duplicated().sum())

# Remove duplicates
df_unique = df.drop_duplicates()

# Remove duplicates based on specific columns
df_unique = df.drop_duplicates(subset=['id'])
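
Before dropping anything, it can help to inspect the duplicate groups themselves; keep=False marks every member of a group, not just the later repeats:

# Show all rows involved in duplicate groups, sorted for side-by-side review
dupes = df[df.duplicated(subset=['id'], keep=False)].sort_values('id')
print(dupes)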

Data Type Conversion

# Convert data types
df['date'] = pd.to_datetime(df['date'])
df['price'] = pd.to_numeric(df['price'], errors='coerce')  # unparseable values become NaN
df['category'] = df['category'].astype('category')
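
Because errors='coerce' silently turns unparseable values into NaN, it is worth counting them right after the conversion:

# Values that could not be parsed are now NaN
print(df['price'].isna().sum(), 'unparseable prices')
print(df.dtypes)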

Data Transformation

String Operations

# String cleaning
df['name'] = df['name'].str.strip()
df['name'] = df['name'].str.lower()
df['name'] = df['name'].str.replace(r'[^a-z ]', '', regex=True)  # keep letters and spaces

# Split strings (n=1 splits only on the first space, so middle names stay in last_name)
df[['first_name', 'last_name']] = df['full_name'].str.split(' ', n=1, expand=True)

Numerical Transformations

# Normalization (Min-Max scaling)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Double brackets keep a 2-D frame, which the scaler expects
df['normalized'] = scaler.fit_transform(df[['value']])

# Standardization (Z-score)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['standardized'] = scaler.fit_transform(df[['value']])

# Log transformation (log1p computes log(1 + x), so zero values are safe)
df['log_value'] = np.log1p(df['value'])
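
One caveat when these features feed a model: fit the scaler on the training split only and reuse it on later data, otherwise test-set statistics leak into the features. A sketch assuming train and test frames already exist:

scaler = StandardScaler()
# Fit on training data only; apply the same learned parameters to the test set
train[['value']] = scaler.fit_transform(train[['value']])
test[['value']] = scaler.transform(test[['value']])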

Feature Engineering

# Create new features
df['total'] = df['quantity'] * df['price']
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek

# Binning
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 60, 100], labels=['Child', 'Young Adult', 'Adult', 'Senior'])
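
Where fixed edges feel arbitrary, pd.qcut bins at quantiles instead, producing equally populated groups:

# Four equally populated age groups, cut at the quartiles
df['age_quartile'] = pd.qcut(df['age'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])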

Data Aggregation

Grouping and Aggregating

# Group by and aggregate
summary = df.groupby('category').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
})
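
Aggregating with a dict of lists leaves a two-level column index on the result; a common follow-up is to flatten it into plain names:

# Collapse ('sales', 'sum')-style column tuples into 'sales_sum'
summary.columns = ['_'.join(col) for col in summary.columns]
summary = summary.reset_index()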

# Pivot tables
pivot = df.pivot_table(
    values='sales',
    index='region',
    columns='product',
    aggfunc='sum',
    fill_value=0
)
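
A useful variation: margins=True appends a row and column of grand totals to the pivot:

# Same pivot with an extra 'Total' row and column
pivot_totals = df.pivot_table(
    values='sales',
    index='region',
    columns='product',
    aggfunc='sum',
    fill_value=0,
    margins=True,
    margins_name='Total'
)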

Merging and Joining

# Merge dataframes
merged = pd.merge(df1, df2, on='id', how='left')

# Concatenate
combined = pd.concat([df1, df2], axis=0, ignore_index=True)
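
When a join is expected to be key-to-key, it is worth asserting that expectation and tracking where each row came from; pandas supports both through the validate and indicator arguments (shown on the same df1/df2):

# Fail fast if 'id' is not unique on both sides; tag each row's source table
merged = pd.merge(df1, df2, on='id', how='left',
                  validate='one_to_one', indicator=True)
print(merged['_merge'].value_counts())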

Performance Optimization

Efficient Data Types

# Use categorical for low-cardinality string columns
df['category'] = df['category'].astype('category')

# Use smaller numeric types (values outside the int32 range would overflow silently)
df['small_int'] = df['large_int'].astype('int32')
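
Rather than hard-coding a width, pd.to_numeric can downcast to the smallest type that actually fits the values, and memory_usage shows the effect:

# Downcast to the smallest integer type that holds every value
df['small_int'] = pd.to_numeric(df['large_int'], downcast='integer')

# Measure the savings (deep=True includes the contents of object columns)
print(df.memory_usage(deep=True))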

Chunking Large Files

# Process large files in chunks
chunk_size = 10000
chunks = []

for chunk in pd.read_csv('large_file.csv', chunksize=chunk_size):
    processed_chunk = process_data(chunk)  # process_data: your own per-chunk transform
    chunks.append(processed_chunk)

df = pd.concat(chunks, ignore_index=True)

Vectorization

# Use vectorized operations instead of loops
# ❌ Slow
for i in range(len(df)):
    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']

# ✅ Fast
df['result'] = df['a'] * df['b']
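
The same principle covers conditional logic: np.where evaluates the whole column at once instead of branching row by row (column names are illustrative):

# Vectorized if/else: label every row without a Python-level loop
df['sign'] = np.where(df['a'] >= 0, 'non-negative', 'negative')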

Data Validation

Schema Validation

from pandera import Column, DataFrameSchema, Check

schema = DataFrameSchema({
    'user_id': Column(int, Check.greater_than(0)),
    'email': Column(str, Check.str_matches(r'^\S+@\S+\.\S+$')),
    'age': Column(int, Check.in_range(0, 120)),
})

# Validate dataframe
validated_df = schema.validate(df)
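
By default validate raises on the first failing check; passing lazy=True collects every failure in one pass. A sketch using pandera's documented SchemaErrors exception (details may vary slightly across pandera versions):

import pandera as pa

try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    # failure_cases lists each failed check alongside the offending values
    print(err.failure_cases)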

Quality Checks

def validate_data_quality(df):
    """Perform data quality checks."""
    issues = []
    
    # Check for missing values
    missing = df.isnull().sum()
    if missing.any():
        issues.append(f"Missing values found: {missing[missing > 0].to_dict()}")
    
    # Check for duplicates
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        issues.append(f"Found {duplicates} duplicate rows")
    
    # Check for outliers
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        outliers = ((df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))).sum()
        if outliers > 0:
            issues.append(f"Found {outliers} outliers in {col}")
    
    return issues
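
A quick usage sketch for the function above:

issues = validate_data_quality(df)
if issues:
    for issue in issues:
        print(f'- {issue}')
else:
    print('All quality checks passed.')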

Best Practices

  • Always validate input data
  • Document transformation logic
  • Create reproducible pipelines (see the sketch after this list)
  • Handle errors gracefully
  • Profile performance for large datasets
  • Version control data processing scripts
  • Test with sample data before full processing
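
A minimal sketch tying the pipeline and error-handling points together (the step functions and logging setup are illustrative, not a prescribed structure):

import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def clean_missing(df):
    return df.dropna()

def remove_duplicates(df):
    return df.drop_duplicates()

def run_pipeline(path):
    """Run each named step in order, logging progress and failing loudly."""
    df = pd.read_csv(path)
    for step in (clean_missing, remove_duplicates):
        try:
            df = step(df)
            logger.info('%s: %d rows remain', step.__name__, len(df))
        except Exception:
            logger.exception('step %s failed', step.__name__)
            raise
    return df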
