Unit-VI: Data Analysis with Python

Lecture 4: Advanced DataFrame Operations

Estimated time: 50-65 minutes

1. Data Cleaning

Handling missing data and cleaning datasets is a crucial step in data analysis.

Missing Data
Duplicates
Data Types
import pandas as pd
import numpy as np

# Build a small DataFrame that deliberately contains NaN entries,
# so each cleaning technique below has something to act on.
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12],
})

# Boolean mask of missing cells, then a per-column count of them
print("Missing values:\n", df.isnull())
print("\nCount of missing values:\n", df.isnull().sum())

# Strategy 1: discard every row that has at least one NaN
df_dropped = df.dropna()
print("\nAfter dropping rows with missing values:")
print(df_dropped)

# Strategy 2: fill NaNs column-by-column, each with its own rule
df_filled = df.fillna({
    'A': df['A'].mean(),   # replace with the column mean
    'B': 0,                # replace with a constant
    'C': df['C'].ffill(),  # forward fill (no effect here: C has no NaN)
})
print("\nAfter filling missing values:")
print(df_filled)
# Demonstrate duplicate detection on data that intentionally
# repeats the Alice and Bob rows verbatim.
records = {
    'Name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'David'],
    'Age': [25, 30, 25, 35, 30, 40],
    'City': ['NY', 'LA', 'NY', 'CHI', 'LA', 'SF'],
}
df = pd.DataFrame(records)

# duplicated() marks a row True when an identical row appeared earlier
print("Duplicate rows (all columns):")
print(df.duplicated())

# Drop exact duplicate rows, keeping the first occurrence of each
df_no_duplicates = df.drop_duplicates()
print("\nAfter removing duplicates:")
print(df_no_duplicates)

# Deduplicate on a subset of columns only (Name + Age)
df_specific = df.drop_duplicates(subset=['Name', 'Age'])
print("\nKeep first occurrence based on Name and Age:")
print(df_specific)
# All three columns start out as strings; convert each to a proper dtype.
raw = {
    'A': ['1', '2', '3', '4'],
    'B': ['5.1', '6.2', '7.3', '8.4'],
    'C': ['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01'],
}
df = pd.DataFrame(raw)

# Everything is 'object' (string) at this point
print("Original data types:")
print(df.dtypes)

# astype for numeric conversions, to_datetime for timestamps
df['A'] = df['A'].astype(int)
df['B'] = df['B'].astype(float)
df['C'] = pd.to_datetime(df['C'])

print("\nConverted data types:")
print(df.dtypes)

# A categorical dtype stores repeated values compactly as codes
df['A'] = df['A'].astype('category')
print("\nAfter converting to category:")
print(df.dtypes)
2. Data Transformation

Transforming and manipulating data for analysis.

Apply & Map
Pivot Tables
Melting
# Demonstrate element-wise transformations with apply() and map().
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Salary': [75000, 85000, 90000, 65000],
    'Experience': [3, 5, 7, 2],
})

# apply() with a lambda: derive a 10% bonus from each salary
df['Bonus'] = df['Salary'].apply(lambda salary: salary * 0.1)

def experience_level(years):
    """Bucket years of experience into a seniority label."""
    if years >= 5:
        return 'Senior'
    if years >= 3:
        return 'Mid'
    return 'Junior'

# apply() with a named function
df['Level'] = df['Experience'].apply(experience_level)

# map() substitutes each value via a dictionary lookup
title_map = {'Alice': 'Ms.', 'Bob': 'Mr.', 'Charlie': 'Mr.', 'David': 'Mr.'}
df['Title'] = df['Name'].map(title_map)

print(df)
# Sales records used to demonstrate pivot tables
df = pd.DataFrame({
    'Date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-03'],
    'Product': ['A', 'B', 'A', 'B', 'A'],
    'Region': ['North', 'South', 'East', 'West', 'North'],
    'Revenue': [100, 150, 200, 120, 180],
    'Units': [10, 15, 20, 12, 18],
})

# One value column, one row level: total revenue per (Date, Product);
# fill_value=0 replaces combinations that never occurred
pivot = pd.pivot_table(df, values='Revenue', index='Date',
                       columns='Product', aggfunc='sum', fill_value=0)
print("Pivot Table - Revenue by Date and Product:")
print(pivot)

# Two value columns and a two-level row index (Date, Region);
# aggfunc may map each value column to its own aggregation
pivot_multi = pd.pivot_table(df, values=['Revenue', 'Units'],
                             index=['Date', 'Region'], columns='Product',
                             aggfunc={'Revenue': 'sum', 'Units': 'sum'},
                             fill_value=0)
print("\nMulti-level Pivot Table:")
print(pivot_multi)
# GDP figures in wide format: one column per year
df_wide = pd.DataFrame({
    'Country': ['USA', 'Canada', 'Mexico'],
    '2020': [21.43, 1.99, 1.65],
    '2021': [22.32, 2.10, 1.72],
    '2022': [23.32, 2.20, 1.81],
})

# melt() unpivots the year columns into (Year, GDP) rows
df_long = df_wide.melt(id_vars=['Country'],
                       value_vars=['2020', '2021', '2022'],
                       var_name='Year',
                       value_name='GDP (Trillions)')
print("Melted DataFrame (Long Format):")
print(df_long)

# pivot() reverses the melt, restoring one column per year
df_wide_again = df_long.pivot(index='Country', columns='Year',
                              values='GDP (Trillions)').reset_index()
print("\nBack to Wide Format:")
print(df_wide_again)
3. Grouping and Aggregation

Grouping data and computing aggregate statistics.

# Sample sales data
sales_data = {
    'Date': pd.date_range(start='2023-01-01', periods=12, freq='M'),
    'Region': ['North', 'South', 'East', 'West'] * 3,
    'Product': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'Revenue': [100, 150, 200, 120, 180, 90, 220, 130, 190, 110, 210, 140],
    'Units': [10, 15, 20, 12, 18, 9, 22, 13, 19, 11, 21, 14]
}
df = pd.DataFrame(sales_data)

# Basic grouping
grouped = df.groupby('Region')
print("Average revenue by region:")
print(grouped['Revenue'].mean())

# Multiple aggregations
print("\nMultiple aggregations by region:")
print(grouped.agg({
    'Revenue': ['sum', 'mean', 'max', 'min'],
    'Units': 'sum'
}))

# Group by multiple columns
print("\nGroup by Region and Product:")
multi_group = df.groupby(['Region', 'Product']).agg({
    'Revenue': ['sum', 'mean'],
    'Units': 'sum'
})
print(multi_group)

# Using transform to create group-level statistics
df['Region_Avg_Revenue'] = df.groupby('Region')['Revenue'].transform('mean')
print("\nDataFrame with Region Average Revenue:")
print(df[['Date', 'Region', 'Product', 'Revenue', 'Region_Avg_Revenue']])
4. Time Series Operations

Working with time series data in Pandas.

# Create a time series
date_rng = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
df = pd.DataFrame(date_rng, columns=['date'])
df['data'] = np.random.randint(1, 100, size=(len(date_rng)))
df.set_index('date', inplace=True)

# Resample to monthly frequency
monthly = df.resample('M').mean()
print("Monthly averages:")
print(monthly.head())

# Rolling windows
rolling_avg = df.rolling(window=7).mean()
print("\n7-day rolling average:")
print(rolling_avg.head(10))

# Time shifting
df['prev_day'] = df['data'].shift(1)  # Previous day's value
df['next_day'] = df['data'].shift(-1)  # Next day's value
print("\nOriginal data with shifted columns:")
print(df.head())

# Time zone handling
df_tz = df.tz_localize('UTC').tz_convert('US/Eastern')
print("\nTime zone converted data:")
print(df_tz.head())
5. Practice Exercise

Given the following sales dataset:

# 20 days of alternating Product A / Product B sales used by the
# exercise tasks below.
data = {
    'Date': pd.date_range(start='2023-01-01', periods=20, freq='D'),
    'Product': ['A', 'B'] * 10,
    'Category': ['Electronics', 'Furniture'] * 10,
    'Revenue': [100, 200, 150, 250, 120, 230, 180, 270, 200, 290,
                220, 310, 180, 270, 160, 250, 140, 230, 120, 210],
    'Units': [10, 5, 8, 6, 12, 7, 9, 8, 10, 9,
              11, 10, 9, 8, 8, 7, 7, 6, 6, 5],
}
sales = pd.DataFrame(data)

Perform the following tasks:

  1. Calculate the total revenue and units sold for each product
  2. Find the average revenue per unit for each category
  3. Create a pivot table showing total revenue by product and category
  4. Add a 7-day moving average of revenue to the DataFrame
  5. Find the date with the highest revenue for each product