Data pre-processing
Some transformations
# Cast raw columns: 'Date' becomes datetime, every other object column
# becomes a memory-efficient categorical.
for col in raw_df1.columns:
    if col == 'Date':
        raw_df1[col] = raw_df1[col].astype('datetime64[ns]')
    elif raw_df1[col].dtype == 'object':
        raw_df1[col] = raw_df1[col].astype('category')
# Work on a copy so the raw frame stays untouched.
df1 = raw_df1.copy()
df1['Date'] = pd.to_datetime(df1['Date'])
df1['month'] = df1['Date'].dt.month

# Season dummy columns as 0/1 ints. `between(...)` returns booleans, so a
# direct astype(int) replaces the former replace({True: 1, False: 0}) calls
# with an identical result. Winter (Dec-Feb) is the implicit baseline
# category — all three dummies equal 0 — so it needs no column of its own.
df1['Spring'] = df1['month'].between(3, 5, inclusive='both').astype(int)
df1['Summer'] = df1['month'].between(6, 8, inclusive='both').astype(int)
df1['Fall'] = df1['month'].between(9, 11, inclusive='both').astype(int)
# Encoding labels for 'type' from sklearn.preprocessing import LabelEncoder le = LabelEncoder() df1['type'] = le.fit_transform(df1['type']) # Encoding 'region' (One Hot Encoding) from sklearn.preprocessing import OneHotEncoder ohe = OneHotEncoder(drop='first', handle_unknown='ignore') ohe = pd.get_dummies(data=df1, columns=['region']) df1 = ohe.drop(['Date','4046','4225','4770','Small Bags','Large Bags','XLarge Bags'], axis=1)
Outlier detection and removal
We have significant problems with outliers in both data sets:
- most of the distributions are not normal;
- huge outliers;
- highly right-skewed data in the Avocado Prices data set;
- a lot of outliers.
Tukey’s (1977) technique is used to detect outliers in skewed or non-bell-shaped data since it makes no distributional assumptions. However, Tukey’s method may not be appropriate for a small sample size. The general rule is that anything not in the range of (Q1 - 1.5 IQR) to (Q3 + 1.5 IQR) is an outlier and can be removed.
The Inter Quartile Range (IQR) is one of the most extensively used procedures for outlier detection and removal.
Procedure:
- Find the first quartile, Q1.
- Find the third quartile, Q3.
- Calculate the IQR. IQR = Q3-Q1.
- Define the normal data range with lower limit as Q1–1.5 IQR and upper limit as Q3+1.5 IQR.
For outlier detection methods, look here: https://www.kaggle.com/code/marcinrutecki/outlier-detection-methods
def IQR_method(df, n, features):
    """
    Detect outlier rows with Tukey's IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan.
    n : int
        A row index is reported only if it is an outlier in MORE than
        `n` of the given features (strict `>`).
    features : list of str
        Numeric column names to check.

    Returns
    -------
    list
        Row indices that are outliers in more than `n` features; callers
        typically pass this to `df.drop(...)`.
    """
    outlier_indices = []
    for column in features:
        # Quartiles and the Tukey fences for this column.
        Q1 = np.percentile(df[column], 25)
        Q3 = np.percentile(df[column], 75)
        IQR = Q3 - Q1
        outlier_step = 1.5 * IQR
        # Indices falling outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        outlier_list_column = df[(df[column] < Q1 - outlier_step) |
                                 (df[column] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_column)
    # Keep only indices flagged in more than n features.
    counts = Counter(outlier_indices)
    multiple_outliers = [k for k, v in counts.items() if v > n]
    # Bug fix: the original printed the out-of-range row count of the LAST
    # column only (it reused `column`/`Q1`/`Q3` after the loop). Report the
    # rows actually flagged for deletion instead.
    print('Total number of deleted outliers:', len(multiple_outliers))
    return multiple_outliers
numeric_columns2 = ['Total Volume', 'Total Bags'] Outliers_IQR = IQR_method(df1,1,numeric_columns2) # dropping outliers df1 = df1.drop(Outliers_IQR, axis = 0).reset_index(drop=True)
Total number of deleted outliers: 2533
numeric_columns2 = ['CRIM', 'ZN', 'NOX', 'RM', 'AGE', 'DIS', 'PTRATIO', 'B', 'LSTAT'] Outliers_IQR = IQR_method(raw_df2,1,numeric_columns2) # dropping outliers df2 = raw_df2.drop(Outliers_IQR, axis = 0).reset_index(drop=True)
Total number of deleted outliers: 7
Train test split
# Features / target for the avocado data (df1 already has outliers removed).
X = df1.drop('AveragePrice', axis=1)
y = df1['AveragePrice']

# Bug fix: use df2 (the outlier-cleaned frame built above), not raw_df2 —
# otherwise the second data set's outlier removal is silently discarded.
# Target is the last column.
X2 = df2.iloc[:, :-1]
y2 = df2.iloc[:, -1]

from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=42)
Feature scaling
from sklearn.preprocessing import StandardScaler
# Creating function for scaling
def Standard_Scaler(df, col_names, scaler=None):
    """
    Standardize the given columns of df (zero mean, unit variance) in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are scaled; it is mutated in place and also
        returned for convenience.
    col_names : list of str
        Numeric column names to scale.
    scaler : sklearn StandardScaler, optional
        A pre-fitted scaler. Pass the scaler fitted on the training split
        when transforming a test split, so test-set statistics never leak
        into the transform. If None (the default, original behavior), a
        new scaler is fitted on `df` itself.

    Returns
    -------
    pandas.DataFrame
        The same frame, with `col_names` replaced by scaled values.
    """
    features = df[col_names]
    if scaler is None:
        # NOTE(review): fitting on whichever frame is passed reproduces the
        # original behavior, but it leaks statistics when called directly on
        # a test split — prefer passing the train-fitted scaler.
        scaler = StandardScaler().fit(features.values)
    df[col_names] = scaler.transform(features.values)
    return df
col_names = ['Total Volume', 'Total Bags'] X_train = Standard_Scaler (X_train, col_names) X_test = Standard_Scaler (X_test, col_names) col_names = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] X_train2 = Standard_Scaler (X_train2, col_names) X_test2 = Standard_Scaler (X_test2, col_names)