Handling missing data and cleaning datasets are crucial steps in data analysis.
Missing Data
Duplicates
Data Types
import pandas as pd
import numpy as np
# --- Missing data ---
# Build a small frame where columns 'A' and 'B' contain NaN and 'C' is complete.
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12],
}
df = pd.DataFrame(data)

# Inspect the gaps: a boolean mask per cell, then a per-column NaN count.
# The label and the result are printed with separate calls because print()
# inserts its separator (a space) after a trailing "\n" when both are passed
# together, which shifts the first row of the output by one column.
print("Missing values:")
print(df.isnull())
print("\nCount of missing values:")
print(df.isnull().sum())

# Drop every row that has at least one NaN; dropna() returns a new frame and
# leaves df untouched.
df_dropped = df.dropna()
print("\nAfter dropping rows with missing values:")
print(df_dropped)

# Fill the gaps with a different strategy per column. The fillna() mapping
# holds scalar replacement values only; forward fill is a method rather than
# a value, so it is applied to 'C' as its own step.
df_filled = df.fillna({
    'A': df['A'].mean(),  # mean of the non-missing entries of 'A'
    'B': 0,               # constant replacement
})
df_filled['C'] = df_filled['C'].ffill()  # forward fill (no-op here: 'C' has no NaN)
print("\nAfter filling missing values:")
print(df_filled)
# --- Duplicates ---
# Sample frame in which rows 2 and 4 repeat rows 0 and 1 exactly.
data = dict(
    Name=['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'David'],
    Age=[25, 30, 25, 35, 30, 40],
    City=['NY', 'LA', 'NY', 'CHI', 'LA', 'SF'],
)
df = pd.DataFrame(data)

# Boolean mask: True for each row that repeats an earlier row in every column.
print("Duplicate rows (all columns):", df.duplicated(), sep="\n")

# Keep only the first occurrence of each fully identical row.
df_no_duplicates = df.drop_duplicates(keep='first')
print("\nAfter removing duplicates:", df_no_duplicates, sep="\n")

# Same de-duplication, but two rows count as duplicates as soon as the
# listed key columns match; 'City' is ignored for the comparison.
key_columns = ['Name', 'Age']
df_specific = df.drop_duplicates(subset=key_columns, keep='first')
print("\nKeep first occurrence based on Name and Age:", df_specific, sep="\n")
# --- Data types ---
# Sample frame whose columns all hold strings: integers, decimals and ISO dates.
data = dict(
    A=['1', '2', '3', '4'],
    B=['5.1', '6.2', '7.3', '8.4'],
    C=['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01'],
)
df = pd.DataFrame(data)

# Show the dtypes pandas assigned before any conversion.
print("Original data types:", df.dtypes, sep="\n")

# Cast the numeric-looking columns in place, then parse the date strings.
for column, target_type in (('A', int), ('B', float)):
    df[column] = df[column].astype(target_type)
df['C'] = pd.to_datetime(df['C'])
print("\nConverted data types:", df.dtypes, sep="\n")

# Re-encode the integer column 'A' as a categorical column.
df['A'] = df['A'].astype('category')
print("\nAfter converting to category:", df.dtypes, sep="\n")