Wheat Seeds Analysis¶
0. Description¶
Dataset repository: https://archive.ics.uci.edu/datasets (tasks: classification, clustering)
We have taken the dataset from Kaggle (data source: Wheat Seed). The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for the experiment. High-quality visualization of the internal kernel structure was obtained using a soft X-ray technique, which is non-destructive and considerably cheaper than more sophisticated imaging techniques such as scanning microscopy or laser technology. The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine-harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin. The objective is to analyze all relevant wheat seed data and cluster the kernels into groups.
We are importing the Wheat seed (kernels) data and performing preliminary analysis.
Seeds classification and cluster analysis. Outline:
- Data preparation and data dictionary
- EDA
- K-means clustering
- Hierarchical clustering
- Gaussian Mixture Model clustering
https://rstudio-pubs-static.s3.amazonaws.com/606831_876e370c02ec44fd9e338182aca4897c.html
1. Environment set up¶
Configuration¶
Libraries imports¶
2. Data Dictionary¶
3. Data Preparation & Pre-processing¶
Changing the Shape of Variables¶
Many machine learning algorithms require that the input variables be approximately normally distributed. We already covered the normal distribution in the statistics module, so you know what that means.
The reason for this assumption is often related to the symmetry of a normal distribution, which can be a desirable property. We also saw that skewness is a measure of symmetry and that a perfectly symmetric distribution has a skewness of 0.
We can ask: is there anything we can do to change the distribution so that a variable approximately resembles a normal distribution? The answer is yes, and we can do that by applying a transformation.
Most real-world data are right-skewed; this is very common, for example, in financial data.
from pandas import read_csv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Read the data
filename = "./data/pima-indians-diabetes.data"
orig_df = read_csv(filename, sep=',', decimal='.',header = 0)
#Converting zeros into NaN and replace missing values with the mean
na_df = orig_df[['glucose','dBP','skinfold','insuline','BMI']].replace(0,np.NaN)
na_df[['tpregnant','dpf','age','isDiabetic']] = orig_df[['tpregnant','dpf','age','isDiabetic']]
df = na_df.fillna(na_df.mean())
Changing Shape to Achieve 'Normality'¶
Display the skewness of each variable
df.skew()
Insuline and dpf are highly skewed; maybe we should also consider age.
sns.displot(df['dpf'],bins=20)
You can see how right-skewed the variable is (skew = 1.92). Let's try to apply a log transformation.
ax = sns.displot(np.log(df['dpf']),bins=20)
ax.set(xlabel='log(dpf)')
The transformation seems to have worked. The 'almost' dual peak is a bit of a concern, since we would like the variable to be unimodal, but it is probably fine as the second peak is not very well defined.
Let's derive the new variable **log_dpf**; this is the version of the variable you would want to feed into a machine learning algorithm.
df['log_dpf']=np.log(df['dpf'])
print('Skewness of log_dpf: ', df['log_dpf'].skew())
Checking Normality¶
If the new variable log_dpf is approximately normal, then its normal probability plot should exhibit a linear relationship. Let's verify that.
To this end, we will use a Q-Q plot, which is a scatter plot in which two sets of quantiles, one from a perfectly normal distribution and one from the series of data we want to test, are plotted against each other.
If the two series of quantiles come from the same (normal) distribution, they should form an approximately straight line.
# Get the number of points to generate from the normal distribution
numOfPoints = len(df['log_dpf'])
# Get the mean and standard deviation of the transformed variable
mean = df['log_dpf'].mean()
sdev = df['log_dpf'].std()
# Generate the specified number of points from a 'true' normal distribution with the same mean and standard deviation
ncurve = np.random.normal(mean,sdev,numOfPoints)
# Generate a sample of 100 percentiles to compare
percs = np.linspace(0,100,100)
# Compute the corresponding quantiles for both samples
qn_transformed = np.percentile(df['log_dpf'], percs)
qn_normal = np.percentile(ncurve, percs)
# Generate the Quantile-Quantile plot (Q-Q plot)
plt.plot(qn_transformed,qn_normal, ls="", marker="o")
# Display the ideal line for reference
x = np.linspace(np.min((qn_transformed.min(),qn_normal.min())), np.max((qn_transformed.max(),qn_normal.max())))
plt.plot(x,x, color="k", ls="--")
plt.show()
If the df['log_dpf'] variable were distributed normally, all points would line up with the dashed line. We can see that toward the two ends the points start to diverge, showing non-normal behavior. However, most of the points are within an acceptable distance from the line, and therefore we can accept this transformation, as we don't need to be that precise.
Comparison with the original variable¶
Just out of curiosity, let's generate the Q-Q plot for the original variable.
# Get the number of points to generate from the normal distribution
numOfPoints = len(df['dpf'])
# Get the mean and standard deviation of the original variable
mean = df['dpf'].mean()
sdev = df['dpf'].std()
# Generate the specified number of points from a 'true' normal distribution with the same mean and standard deviation
ncurve = np.random.normal(mean,sdev,numOfPoints)
# Generate a sample of 100 percentiles to compare
percs = np.linspace(0,100,100)
# Compute the corresponding quantiles for both samples
qn_transformed = np.percentile(df['dpf'], percs)
qn_normal = np.percentile(ncurve, percs)
# Generate the Quantile-Quantile plot (Q-Q plot)
plt.plot(qn_transformed,qn_normal, ls="", marker="o")
# Display the ideal line for reference
x = np.linspace(np.min((qn_transformed.min(),qn_normal.min())), np.max((qn_transformed.max(),qn_normal.max())))
plt.plot(x,x, color="k", ls="--")
plt.show()
We can see how much the original variable deviates from normality: the transformation definitely worked!
Every time you need to check the normality assumption, either for an input variable or for other aspects of the processing (e.g., the normality assumption on residuals, which we will see later), you can use the Q-Q plot to formally test for that assumption.
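As a side note, SciPy provides a ready-made normal probability plot if you prefer not to build the Q-Q plot by hand. A minimal sketch, assuming SciPy is available in the environment:
# Alternative sketch: scipy.stats.probplot draws ordered data against theoretical normal quantiles in one call
from scipy import stats
fig, ax = plt.subplots()
stats.probplot(df['log_dpf'], dist='norm', plot=ax)
plt.show()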
3.3 Transforming Categorical to Numerical¶
Some algorithms, like regression algorithms, only accept numerical variables as input. We should then ask what to do, in this case, with categorical variables. Should we resign ourselves to the fact that we cannot use these variables?
The answer is a definite NO!
In this example we will use the "Auto Imports Database" to transform the categorical variable body_style into numerical flag variables.
Dataset Information:
-- Creator/Donor: Jeffrey C. Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
-- Date: 19 May 1987
-- Sources:
1. 1985 Model Import Car and Truck Specifications, 1985 Ward's
Automotive Yearbook.
2. Personal Auto Manuals, Insurance Services Office, 160 Water
Street, New York, NY 10038
3. Insurance Collision Report, Insurance Institute for Highway
Safety, Watergate 600, Washington, DC 20037
Attribute names located at: https://archive.ics.uci.edu/ml/datasets/automobile
from pandas import read_csv
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
NOTE: This dataset does not contain a header, so we need to specify the column names. Luckily, the author made the list of columns readily available to us.
# Read the data
columns = ["symboling", "norm_losses", "make", "fuel_type", "aspiration",
"num_doors", "body_style", "drive_wheels", "engine_location",
"wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system",
"bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
"city_mpg", "highway_mpg", "price"]
# Let's specify the delimiter, set the decimal point character, set the columns names and convert the '?' as NaN
filename = "./data/imports-85.data"
df = read_csv(filename, sep=',', decimal='.',names=columns, na_values="?")
df.head(5)
Let's take a look at the variable types and display the categorical variables.
df.dtypes
df.select_dtypes(include=['object'])
For simplicity let's focus on the body_style by displaying the count of unique values.
df['body_style'].value_counts()
It seems we do not have any missing values: let's verify.
df[df['body_style'].isnull()]
Any missing value would have been replaced with the mode, which in this case is 'sedan'; however, there are no missing values for this column.
Pandas supports the conversion of categorical variables into flag (dummy) variables via the get_dummies function.
newdf = pd.get_dummies(df, columns=['body_style'], prefix=['style'])
newdf.head(5)
You can transform as many categorical variables as you want at once.
Notice that Pandas creates as many flag variables as there are categories, instead of k-1 as we saw in the lecture: a little more verbose, but it has no ill effects.
NOTE: The body_style variable is automatically removed from the dataset's columns.
Custom Mapping¶
Sometimes you might want a simple binary encoding so that a car is either a sedan or not. In this case, you only need a single binary variable that takes the value 1 for sedan and 0 otherwise.
df['is_sedan'] = np.where(df['body_style'].str.contains('sedan'),1,0)
df[['body_style','is_sedan']].head(10)
Another way is to define a mapping dictionary in which we specify the exact mapping we want.
df['num_doors'].value_counts()
doors_map={'two':2,
'four':4}
df['num_doors'] = df['num_doors'].map(doors_map)
NOTE: This mapping makes sense because the categorical variable has a natural ordering of its elements, so we can simply map that information using integer values that maintain the same ordering. You can apply a similar mapping to the num_cylinders field, for example (a sketch follows below).
df.head(5)
df['num_doors'] = df['num_doors'].fillna(df['num_doors'].mode()[0])
df['num_doors'].isnull().sum()
# astype returns a new dataframe, so assign it back to keep num_doors as an integer
df = df.astype({'num_doors': 'int32'})
df.dtypes
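As noted above, the same ordinal-mapping idea applies to num_cylinders. A minimal sketch, assuming the spelled-out values used in this dataset (two, three, four, five, six, eight, twelve); verify them with value_counts() before applying the map:
# Hypothetical ordinal mapping for num_cylinders (any value missing from the map would become NaN)
cylinders_map = {'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'eight':8, 'twelve':12}
df['num_cylinders'] = df['num_cylinders'].map(cylinders_map)
df['num_cylinders'].value_counts()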
Numeric to Categorical - Equal Width Binning¶
If we look at the highway_mpg variable, we have cars with a highway mileage from 16 to 54 mpg. Let's create a four-bin categorical variable using an equal-width strategy:
- Low
- Average
- High
- Very High
df['highway_mpg'].describe()
# Because we have 4 bins we need 5 bin edges
categories = ['low','average','high','very high']
# You can specify your own bin edges; I used linspace to be more precise
bins = np.linspace(df['highway_mpg'].min(), df['highway_mpg'].max(), len(categories)+1)
print(bins)
df['mpg_bins'] = pd.cut(df['highway_mpg'], bins=bins, labels=categories, include_lowest=True)
# Let's sort the values so the categories are displayed from 'low' to 'very high'
df.sort_values('highway_mpg', inplace=True)
plt.hist(df['mpg_bins'], bins=4)
Numeric to Categorical - Equal Frequency Binning¶
Let's define a function that generates the interval labels; note that, unlike the equal-width case, these intervals do not all have the same width.
def get_labels(df, column, k=2):
    # Compute the k quantile-based intervals, de-duplicate them, and sort from smallest to largest
    # (sorting matters: the labels must line up with the bin order used by pd.qcut below)
    intervals = sorted(set(pd.qcut(df[column], q=k, precision=1).tolist()))
    labels = []
    for interval in intervals:
        sinterval = ''
        if interval.closed_left:
            sinterval+='['
        else:
            sinterval+='('
        sinterval+=str(interval.left)+','+str(interval.right)
        if interval.closed_right:
            sinterval+=']'
        else:
            sinterval+=')'
        labels.append(sinterval)
    return labels
categories = get_labels(df, 'highway_mpg',4)
print(categories)
df['mpg_bins'] = pd.qcut(df['highway_mpg'], q=4, precision=1, labels=categories)
# Let's sort the values so the bin intervals are displayed from the smallest to the largest
df.sort_values('mpg_bins', inplace=True)
ax = plt.hist(df['mpg_bins'], bins=4, edgecolor = "black")
As you can see, each category has approximately the same number of records.
As a final comment, equal-frequency binning is less common than equal-width binning.
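If you do not need the interval strings as labels, a simpler alternative sketch is to let qcut assign named categories directly (the mpg_qbins column name is only for illustration):
# Alternative sketch: equal-frequency bins with human-readable labels instead of interval strings
df['mpg_qbins'] = pd.qcut(df['highway_mpg'], q=4, labels=['low','average','high','very high'])
df['mpg_qbins'].value_counts()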
Categorical to Single Binary Variable¶
Sometimes a categorical variable has only two values, and deriving two flag variables would be a waste since the information can be encoded in a single variable. By specifying drop_first=True, Pandas will drop the first dummy column (a sketch of the conversion follows the value counts below).
df['aspiration'].value_counts()
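A minimal sketch of the single-flag conversion for aspiration (aspiration_df is just an illustrative name); the resulting column name depends on the prefix Pandas generates from the remaining category, likely aspiration_turbo here, but treat the exact name as an assumption:
# Keep only k-1 = 1 dummy column for the two-level 'aspiration' variable
aspiration_df = pd.get_dummies(df, columns=['aspiration'], drop_first=True)
# Inspect which dummy column remains after dropping the first category
[c for c in aspiration_df.columns if c.startswith('aspiration_')]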
Data Preprocessing¶
We cannot jump directly into mining the data without cleaning it first, as we saw in the CRISP-DM methodology.
Data Pre-Processing differs quite a lot when applied to numerical or textual data, so we will cover both domains separately.
For numerical data you will become proficient in dealing with missing values, identifying outliers, and transforming the data to normalize its values.
Handling Missing Data¶
Missing values are simply values that are not available, leaving "holes" in the cells. Other times, missing values appear as encoded values (e.g., 99999) representing the absence of information.
In this lab we will use the Pima Indians Diabetes Dataset.
Source: Smith, J.W., Everhart, J.E., Dickson, W.C., Knowler, W.C., & Johannes, R.S. (1988). Using the ADAP learning algorithm to forecast the onset of diabetes mellitus. In Proceedings of the Symposium on Computer Applications and Medical Care (pp. 261--265). IEEE Computer Society Press.
This is an interesting case, as the author's page indicates that there are no missing values in the dataset. However, this cannot be true: there are zeros in places where they are biologically impossible, such as the blood pressure attribute. It seems very likely that zero values encode missing data. Consequently, we have to use our best judgement when using this data.
Here is the description of the fields:
tPregnant: Number of times pregnant
glucose: Plasma glucose concentration over 2 hours in an oral glucose tolerance test
dBP: Diastolic Blood Pressure (mm Hg)
skinfold: Triceps skin fold thickness (mm)
Insulin: 2-Hour serum insulin (mu U/ml)
BMI: Body mass index (weight in kg/(height in m)2)
dpf: Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
age: Age (years)
isDiabetic: Class variable (0 if non-diabetic, 1 if diabetic)
Data cleaning¶
Duplicate records¶
Missing values¶
import pandas as pd
import numpy
filename = "./data/pima-indians-diabetes.data"
dataset = pd.read_csv(filename, sep=',', decimal='.',header = 0)
print(dataset.describe())
# Counting Missing values for each column
dataset.isnull().sum()
# Counting Missing values for each row
dataset.isnull().sum(axis=1)
If we check for null values, this dataset seems to be fine: we cannot find any!
NOTE: The traditional approach to missing values is to check for nulls. While this works well in most situations, you should check for other values as well. In our case the value 0 is used to encode the absence of information, and if you don't check for it your analysis might produce sub-optimal results. As you can see, many variables have a minimum of 0, which does not make biological sense: e.g. glucose, diastolic BP, skinfold, insuline, BMI, etc.
We have to deal with these values, otherwise our analysis and modeling phases will be incorrect.
dataset.head(10)
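To make the note above concrete, we can count how many zeros each of the suspect columns contains; a quick check using the column names from this lab:
# Count the zero-encoded 'missing' values in the biologically implausible columns
(dataset[['glucose','dBP','skinfold','insuline','BMI']] == 0).sum()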
print ("Before removal: "+str(len(na_ds)))
na_ds_removed = na_ds.dropna()
print ("After removal: "+str(len(na_ds_removed)))
print ("Total records removed: "+str(len(na_ds)-len(na_ds_removed)))
Removing = records containing missing values
¶
As we said this is a waste and I don’t recommend it. For small number of record this could be a solution, but this is rarely the case.
#Converting zeros into NaN
na_ds = dataset[['glucose','dBP','skinfold','insuline','BMI']].replace(0,numpy.NaN)
na_ds[['tpregnant','dpf','age','isDiabetic']] = dataset[['tpregnant','dpf','age','isDiabetic']]
na_ds.head(10)
print ("Before removal: "+str(len(na_ds)))
na_ds_removed = na_ds.dropna()
print ("After removal: "+str(len(na_ds_removed)))
print ("Total records removed: "+str(len(na_ds)-len(na_ds_removed)))
We removed a total of 376 records, losing important information. Better to replace the missing values with other strategies.
Inputing missing values with the mean
¶
You can replace the missing value using the field mean, in the case of numerical value, or the mode, for categorical variables.
This method works better compared to the constant method. Always keep in mind that you are fabricating data to fill the holes in the data set. Having said that, this method can work quite well and largely used.
However, don’t use it if the missing value are quite numerous since you might end up with confidence intervals which could be quite over-optimistic.
na_mean_ds = na_ds.fillna(na_ds.mean())
na_mean_ds.head(10)
Looking at the zero counts, insuline and skinfold are the variables most affected by missing values, while dpf and age do not contain any.
In Python it is often easier to deal with missing values when they are encoded as NaN (Not a Number), since many functions handle this particular type of value automatically.
#Converting zeros into NaN
na_ds = dataset[['glucose','dBP','skinfold','insuline','BMI']].replace(0,numpy.NaN)
na_ds[['tpregnant','dpf','age','isDiabetic']] = dataset[['tpregnant','dpf','age','isDiabetic']]
na_ds.head(10)
Removing records containing missing values¶
As we said, this approach is wasteful and I don't recommend it. For a small number of records it could be a solution, but that is rarely the case.
print ("Before removal: "+str(len(na_ds)))
na_ds_removed = na_ds.dropna()
print ("After removal: "+str(len(na_ds_removed)))
print ("Total records removed: "+str(len(na_ds)-len(na_ds_removed)))
We removed a total of 376 records, losing important information. It is better to replace the missing values using other strategies.
Imputing missing values with the mean¶
You can replace missing values using the field mean for numerical variables, or the mode for categorical variables.
This method works better than the constant method. Always keep in mind that you are fabricating data to fill the holes in the dataset. Having said that, this method can work quite well and is widely used.
However, don't use it if the missing values are numerous, since you might end up with confidence intervals that are overly optimistic.
na_mean_ds = na_ds.fillna(na_ds.mean())
na_mean_ds.head(10)
# Let's confirm we don't have any missing values
print(na_mean_ds.isnull().sum())
What is the effect on the mean and standard deviation of the original dataset? (Notice below that the mean stays essentially unchanged, while the standard deviation shrinks, because every imputed value sits exactly at the mean.)
dataset[['dBP','skinfold']].describe()
na_mean_ds[['dBP','skinfold']].describe()
Imputing missing values with random values from the distribution¶
This method can be superior to mean substitution, since the measures of center and spread should remain closer to the original. However, there is no guarantee that the produced value (or, if you use this method for multiple fields, the combination of values) makes sense.
# Function to replace NaNs in a column with random samples drawn from its observed values
def replace_nans_with_samples(column):
    # Values available for sampling (NaNs excluded)
    non_nan_values = column.dropna()
    # Draw one random value per row and use it only where the column is NaN
    random_values = pd.Series(numpy.random.choice(non_nan_values, size=len(column)),
                              index=column.index)
    return column.fillna(random_values)
# Apply the function to each column (apply returns a new, filled dataframe)
na_random_ds = na_ds.apply(replace_nans_with_samples)
na_random_ds.head(5)
dataset[['dBP','skinfold']].describe()
na_random_ds[['dBP','skinfold']].describe()
Outlier Visualization & Detection¶
Part I - Visualizing Outliers¶
Outliers are extreme values outside the range of what is considered normal. They do not necessarily represent errors: for example, a cholesterol level of 400 is extreme but still possible, while a blood pressure of 1500 is definitely an error.
It is important to identify outliers because many statistical methods are very sensitive to them and their output can be over-influenced. In linear regression, for example, the slope of the regression line is highly affected by outliers, which can completely distort the model's predictions.
Outlier detection using histogram¶
With this method, outliers will appear in the far tails; good candidates are bins with low frequencies that are disconnected from the other bins.
from pandas import read_csv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Read the data
filename = "./data/pima-indians-diabetes.data"
orig_df = read_csv(filename, sep=',', decimal='.',header = 0)
#Converting zeros into NaN and replace missing values with the mean
na_df = orig_df[['glucose','dBP','skinfold','insuline','BMI']].replace(0,np.NaN)
na_df[['tpregnant','dpf','age','isDiabetic']] = orig_df[['tpregnant','dpf','age','isDiabetic']]
df = na_df.fillna(na_df.mean())
print(df.describe())
%matplotlib inline
sns.set_style(style='white')
ax = sns.displot(df["skinfold"],bins=20)
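To make the low-frequency tail bins easier to spot, each bar can be annotated with its count. A minimal sketch, assuming matplotlib 3.4+ (for Axes.bar_label) and that seaborn exposes the histogram bars through ax.containers:
# Annotate each histogram bar with its count so sparse tail bins stand out
fg = sns.displot(df["skinfold"], bins=20)
for ax in fg.axes.ravel():
    for container in ax.containers:
        ax.bar_label(container, fontsize=8)
plt.show()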
Outlier Detection using a Scatter Plot¶
Using a scatter plot it is possible to identify a potential outlier candidate for skinfold.
ax = df.plot.scatter(x='age',y='skinfold')
Using the Box Plot¶
Let's turn our attention to a measure that is not sensitive to outliers: the Interquartile Range (IQR).
This is a very robust measure, which I use quite extensively for outlier detection. It is defined as the difference between the third and first quartiles, IQR = Q3 - Q1, and represents the spread of the middle 50% of the data.
Any data point located more than 1.5 times the IQR below Q1 or more than 1.5 times the IQR above Q3 is considered an outlier.
Let's display the distribution of 'skinfold' (or 'dBP') and the corresponding box plot. In the box plot the whiskers are drawn at:
- Lower whisker: Q1 - 1.5*IQR
- Upper whisker: Q3 + 1.5*IQR
Anything below the lower whisker or above the upper whisker is an outlier candidate.
field = 'skinfold'
plt.figure(figsize=(10,8))
plt.subplot(211)
plt.xlim(df[field].min()-1, df[field].max()*1.1)
ax = df[field].plot(kind='kde')
plt.subplot(212)
plt.xlim(df[field].min()-1, df[field].max()*1.1)
ax = sns.boxplot(x=df[field])
You can see that the box plot is quite useful for identifying outlier candidates.
I am also displaying the probability density function estimated by the KDE method; you can see that the probability associated with the points below and above the whiskers is quite low, so they fall outside the normally expected values. Remember, these are candidate outliers and it is up to us to decide what to do with them.
If you want to be aggressive, you can remove all these points from the dataset; otherwise you can remove only the most extreme values. It all depends on the type of method you choose to adopt to solve the business problem.
Data Transformation¶
Numeric variables will usually have different ranges: some small and other much larger.
For example, the GPA score for a student is a variable ranging from 0 to 4, while the salary variable can assume values as large as millions of dollars.
Some algorithms are very sensitive to differences in such ranges, since larger values can have more influence on the output. For example, neural networks are notoriously sensitive to the range of the variables, and so are algorithms that use any distance measure.
Because of this, we have to eliminate this source of distortion and bring all the variables within the same range. If the target range is [0,1] we say that we are normalizing the data.
Another way of controlling for this effect is standardization, which standardizes the scale of the effect each variable has on the output.
The main difference between the two is that a normalization technique results in values within a specified interval, while standardization usually does not; rather, it makes values comparable by expressing them in the same unit of measure.
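The cells below build these transformations by hand; as a side note, scikit-learn offers equivalent ready-made scalers. A minimal sketch on a toy column, assuming scikit-learn is installed:
# Sketch: the same two rescalings using scikit-learn (not used in the rest of this lab)
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
values = np.array([[2.0], [4.0], [6.0], [10.0]])          # toy data, one numeric column
print(MinMaxScaler().fit_transform(values).ravel())       # normalized to [0, 1]
print(StandardScaler().fit_transform(values).ravel())     # standardized to mean 0 and (population) std 1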
Part I - Normalizing & Standardizing your Data¶
Reading the data¶
By now, you should be very familiar with reading data using Pandas. Here, in addition to reading data we are replacing the missing values with the mean as we learned in the previous video.
from pandas import read_csv
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import math
# Will display any number with 4 decimal points instead of the scientific notation
pd.options.display.float_format = '{:,.4f}'.format
# Read the data
filename = "./data/pima-indians-diabetes.data"
orig_df = read_csv(filename, sep=',', decimal='.',header = 0)
#Converting zeros into NaN and replace missing values with the mean
na_df = orig_df[['glucose','dBP','skinfold','insuline','BMI']].replace(0,np.NaN)
na_df[['tpregnant','dpf','age','isDiabetic']] = orig_df[['tpregnant','dpf','age','isDiabetic']]
df = na_df.fillna(na_df.mean())
Let's display the histogram along with the estimated probability density function for the variables skinfold and dBP. Because the skinfold distribution has a long tail, we are filtering out any values larger than 60 (does that sound like outlier filtering?).
print(f"'dBP' statistics: mean ({df['dBP'].mean():.4f}), std ({df['dBP'].std():.4f})") # //! display 4 decimals points / pandas
print(f"'skinfold' statistics: mean ({df['skinfold'].mean():.4f}), std ({df['skinfold'].std():.4f})")
fig, ax = plt.subplots()
sns.histplot(df["dBP"],bins=20, ax=ax)
sns.histplot(df["skinfold"][df['skinfold']<60],bins=20, ax=ax)
As you can see, the two distributions are on different scales: skinfold has a mean and std of 29.15 and 8.79, while dBP has a mean of 72.40 and std of 12.10, which makes them hard to compare directly. So let's apply some transformations and see what happens.
Min-Max Transformation¶
Let's apply the min-max transformation:
df['mMskinfold']=(df['skinfold']-df['skinfold'].min())/(df['skinfold'].max()-df['skinfold'].min())
df['mMdBP']=(df['dBP']-df['dBP'].min())/(df['dBP'].max()-df['dBP'].min())
fig, ax = plt.subplots()
plt1 = sns.histplot(df["mMskinfold"],bins=20, ax=ax)
plt2 = sns.histplot(df["mMdBP"],bins=20, ax=ax)
Z-Score Transformation¶
df['zskinfold'] = (df['skinfold']-df['skinfold'].mean())/df['skinfold'].std()
df['zdBP'] = (df['dBP']-df['dBP'].mean())/df['dBP'].std()
fig, ax = plt.subplots()
sns.histplot(df["zskinfold"][df['zskinfold']<4],bins=20, ax=ax)
sns.histplot(df["zdBP"],bins=20, ax=ax)
As you might have noticed, the Z-score transformation does not change the shape of the original distribution; it only changes its unit of measure. So do not make the mistake of thinking that, because you applied the Z-score transformation, the resulting variable is normally distributed: that is not correct!
Decimal Scaling¶
def get_ndigits(maxvalue):
    # Use the ceiling so that maxvalue / 10**n never exceeds 1 (round() would give 1.22 for dBP's max of 122)
    return math.ceil(math.log10(maxvalue))
n=get_ndigits(df['skinfold'].abs().max())
df['dskinfold'] = df['skinfold']/10**n
n=get_ndigits(df['dBP'].abs().max())
df['ddBP'] = df['dBP']/10**n
fig, ax = plt.subplots()
sns.histplot(df["dskinfold"],bins=20, ax=ax)
sns.histplot(df["ddBP"],bins=20, ax=ax)
Displaying all Transformations¶
df[['skinfold','mMskinfold','zskinfold','dskinfold']].head(10)
Part II - Numeric Method to Identify Outliers¶
Using ZScore to identify Outliers¶
Because a Z-score transformation changes the unit of measure, if your data is normally distributed you also have the extra information that about 99.7% of the values will be within 3 standard deviations of the mean.
This means that values outside 3 standard deviations will be quite rare and far from what we should expect. In other words, these values can be considered outliers as they represent something that we would not expect.
# Let's display candidate outliers with the zscore method
coutliers = df['zskinfold'][(df['zskinfold']<-3) | (df['zskinfold']>3)]
print(coutliers.tolist())
We have identified several candidates, let's take a further look.
What is the original value corresponding to 7.94528970815517?
mean = df['skinfold'].mean()
stdev = df['skinfold'].std()
print('Original_value of 7.94528970815517 is ', (7.94528970815517*stdev)+mean)
df['skinfold'][df['skinfold']>=99]
df['skinfold'].sort_values()
Ok, 99 seems to be a true outlier and should be removed.
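If we decide to act on it, a minimal sketch of the removal; it writes to a new dataframe (df_no_outlier is just an illustrative name) so the rest of the lab is unaffected:
# Drop the record(s) with the implausible skinfold value of 99
df_no_outlier = df[df['skinfold'] < 99]
print(len(df), '->', len(df_no_outlier))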
def standardize(df, column):
    mean = df[column].mean()
    stdev = df[column].std()
    df['z_'+column] = (df[column]-mean)/stdev
    return df

def get_zoutliers(df):
    columns = df.columns
    result = {}
    for column in columns:
        # Standardize Column
        df = standardize(df, column)
        coutliers = df['z_'+column][(df['z_'+column]<-3) | (df['z_'+column]>3)]
        result[column]=coutliers.tolist()
    return result
coutliers = get_zoutliers(df)
print(coutliers)
Using Interquartile Range (IQR)¶
def iqr_outliers(df, column):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    outliers = df[column][(df[column] < (q1 - 1.5 * iqr)) | (df[column] > (q3 + 1.5 * iqr))]
    return outliers

columns = df.columns
outlier_candidates = {}
for c in columns:
    cos = iqr_outliers(df, c)
    outlier_candidates[c] = {'n':len(cos), 'candidates':list(cos)}
print(outlier_candidates)
4. EDA (Exploratory Data Analysis)¶
4.1 Correlation between Variables¶
4.2 Outliers & Relationship between the target and the other variables¶
5. K-Means Clustering¶
5.1 Finding the Optimal Number of Clusters¶
Elbow Method¶
Silhouette Method¶
Dunn's Method¶
Find K-means Cluster¶
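As a starting point for this section, here is a minimal sketch of the elbow and silhouette checks followed by the k-means fit. The file path, separator and column names are assumptions based on the usual UCI seeds layout (7 kernel measurements plus the variety label); adjust them to however the data is actually stored.
# Minimal k-means sketch (assumed path, column names and layout; scikit-learn required)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
cols = ['area','perimeter','compactness','kernel_length','kernel_width','asymmetry','groove_length','variety']
seeds_df = pd.read_csv('./data/seeds_dataset.txt', sep=r'\s+', names=cols)
# Standardize the measurements so no single feature dominates the distance computation
X = StandardScaler().fit_transform(seeds_df.drop(columns='variety'))
# Elbow (inertia) and silhouette scores over a range of k values
for k in range(2, 8):
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    print(f"k={k}: inertia={km.inertia_:.1f}, silhouette={silhouette_score(X, km.labels_):.3f}")
# Fit the final model; three clusters are expected since there are three wheat varieties
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X)
seeds_df['cluster'] = kmeans.labels_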
6. Hierarchical Clustering¶
6.1 Finding the Optimal Number of Clusters¶
Elbow Method¶
Dunn's Method¶
Cluster Dendrogram¶
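A minimal sketch for this section, reusing the standardized matrix X from the k-means sketch above (SciPy assumed available):
# Agglomerative clustering with Ward linkage and the corresponding dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
Z = linkage(X, method='ward')            # hierarchical merge tree on Euclidean distances
plt.figure(figsize=(10, 4))
dendrogram(Z, no_labels=True)
plt.title('Cluster dendrogram (Ward linkage)')
plt.show()
# Cut the tree into three clusters to compare with the known varieties
hier_labels = fcluster(Z, t=3, criterion='maxclust')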
7. Gaussian Mixture Model Clustering¶
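A minimal sketch for the Gaussian Mixture Model section, again reusing X from the k-means sketch above; BIC is one common way to compare component counts (lower is better):
# Gaussian Mixture Model clustering with a BIC scan over the number of components
from sklearn.mixture import GaussianMixture
for k in range(2, 8):
    gmm = GaussianMixture(n_components=k, covariance_type='full', random_state=42).fit(X)
    print(f"components={k}: BIC={gmm.bic(X):.1f}")
gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=42).fit(X)
gmm_labels = gmm.predict(X)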