The most commonly used functions in machine learning data cleaning
1. Get null percentage of columns
def get_null_percentage(df):
count = df.isnull().sum()
percent = (df.isnull().mean() * 100).round(2)
summary = pd.DataFrame({'count': count, 'percentage': percent})
return summary[summary["count"] > 0].sort_values(ascending=False, by="percentage")
2. Get zero percentage of columns
def get_zero_percentage(df):
count = df.eq(0).sum()
percent = (df.eq(0).mean() * 100).round(2)
summary = pd.DataFrame({'count': count, 'percentage': percent})
return summary[summary["count"] > 0].sort_values(ascending=False, by="percentage")
3. Remove Outliers for one column
3.1 using IQR
def remove_outliers_by_iqr(df, column):
q1 = df[column].quantile(0.25)
q3 = df[column].quantile(0.75)
value = q3 - q1
lower_value = q1 - 1.5 * value
higer_value = q3 + 1.5 * value
return df[(df[column] < higer_value) & (df[column] > lower_value)]
This might end up with an empty DataFrame if all or a significant portion of your data points are considered outliers and removed. e.g.,
We can use z-score to solve it.
3.2 using Z-Score
def remove_outliers_by_zscore(df, column_name, z_threshold=3):
# Calculate the Z-scores for the specified column
z_scores = (df[column_name] - df[column_name].mean()) / df[column_name].std()
# Filter out rows where the Z-score is greater than the threshold
return df[abs(z_scores) <= z_threshold]
4 Get extreme distribution columns
def check_extreme_distribution(df, threshold=0.95):
extreme_columns = []
for column in df.columns:
mode_value = df[column].mode().iloc[0] # first value, it is also the most contributed value
mode_percentage = (df[column] == mode_value).sum() / len(df)
if mode_percentage >= threshold:
extreme_columns.append(column)
return extreme_columns
5 Get same percentile columns
def get_same_percentile_columns(df, value):
column_stats = df.describe()
return column_stats.columns[(column_stats.loc['min'] == value) &
(column_stats.loc['25%'] == value) &
(column_stats.loc['50%'] == value) &
(column_stats.loc['75%'] == value)]
We can use it to check the columns that all percentiles are 0
:
get_percentile_same_columns(df, 0)