import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.manifold import TSNE
from scipy.stats import skew
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# to Ignore all warnings from the system
warnings.filterwarnings("ignore")


# load the dataset into pandas dataframe
def load_data(filepath):
    return pd.read_csv(filepath)

full_dataset = load_data("df_training_level1.csv")


# display the first 10 observations of the dataframe
full_dataset.head()


# get the dimensions of the dataset

def get_rows_columns(dataset):
    rows, columns = dataset.shape
    print("No of Observations:", rows)
    print("No of Variables:", columns)
get_rows_columns(full_dataset)

No of Observations: 746
No of Variables: 1294


# getting informations from the dataset
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 746 entries, 0 to 745
Columns: 1294 entries, Info_PepID to feat_esm1b_1279
dtypes: float64(1280), int64(4), object(10)
memory usage: 7.4+ MB


# describe the dataset
full_dataset.describe()


# Now, calculate the percentage of missing values, such that we can determine how to deal with variables having a high count
def get_missing_values(dataset):
    return dataset.isnull().sum()/len(dataset)*100
get_missing_values(full_dataset)

Info_PepID          0.0
Info_organism_id    0.0
Info_protein_id     0.0
Info_pos            0.0
Info_AA             0.0
                   ... 
feat_esm1b_1275     0.0
feat_esm1b_1276     0.0
feat_esm1b_1277     0.0
feat_esm1b_1278     0.0
feat_esm1b_1279     0.0
Length: 1294, dtype: float64


# print the total null values
print("Total null values:", get_missing_values(full_dataset).sum())

Total null values: 0.0


def class_distribution(dataset):
    # Take a random sample for visualization purposes
    sample = dataset.sample(n=233, random_state=42)
    # Visualize the distribution of the target variable
    sns.countplot(data=sample, x="Class")
    plt.title("Class Distribution")
    plt.show()
class_distribution(full_dataset)


def get_info_cluster(dataset, column1, column2):
    
    grouped = dataset.groupby([column1, column2]).size()
    # Create a new dataframe to store the counts
    counts_df = pd.DataFrame(grouped, columns=['count']).reset_index()
    
    # Pivot the table to get the desired format
    pivoted = counts_df.pivot(index=column1, columns=column2, values='count').reset_index()
    
    # Rename the columns
    pivoted.columns = [column1, '1', '-1']
    
    # Fill any missing values with zero
    pivoted = pivoted.fillna(0)
    
    # Convert the count columns to integer type
    pivoted['1'] = pivoted['1'].astype(int)
    pivoted['-1'] = pivoted['-1'].astype(int)
    
    return pivoted
get_info_cluster(full_dataset,'Info_cluster', 'Class')


EDA = full_dataset.iloc[:, 14:]
Class = full_dataset.iloc[:, 13]
cluster = full_dataset.iloc[:, 12]
cluster.head()

0    199
1    199
2    199
3    199
4    199
Name: Info_cluster, dtype: int64


def plot_tsne(dataset):
    X = dataset.values
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X)
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
    plt.show()
plot_tsne(EDA)


def skewness(dataset):
    # Calculate the skewness of each feature
    skewness = dataset.apply(lambda x: skew(x))

    # Calculate the mean and standard deviation of the skewness values
    mean = skewness.mean()
    std = skewness.std()

    # Set the threshold to be 3 standard deviations away from the mean
    # Idea from 3-sigma rule
    threshold = mean + 3 * std

    # Identify features with high skewness values
    high_skewness_features = skewness[skewness > threshold].index

    
    return high_skewness_features

# skewness features
high_skewness_features = skewness(EDA)

# Print the names of the features with high skewness values
print("Features with high skewness:")
print(high_skewness_features)

Features with high skewness:
Index(['feat_esm1b_36', 'feat_esm1b_450', 'feat_esm1b_847', 'feat_esm1b_869',
       'feat_esm1b_1275'],
      dtype='object')


def display_outliers_iqr(data, factor=1.5):
    # Calculate Q1, Q3, and IQR for each column
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    # Define the upper and lower bounds
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    # Identify outliers
    outliers = ((data < lower_bound) | (data > upper_bound))

    # Get the outlier values for each column
    outlier_values = data[outliers]

    return outlier_values

# Apply the IQR method to display the outliers
outliers = display_outliers_iqr(EDA)

print("Outliers:")
print(outliers)

Outliers:
     feat_esm1b_0  feat_esm1b_1  feat_esm1b_2  feat_esm1b_3  feat_esm1b_4  \
0             NaN           NaN       0.46473           NaN           NaN   
1             NaN           NaN           NaN           NaN           NaN   
2             NaN           NaN           NaN           NaN           NaN   
3             NaN           NaN           NaN           NaN           NaN   
4             NaN           NaN           NaN           NaN           NaN   
..            ...           ...           ...           ...           ...   
741           NaN           NaN           NaN           NaN           NaN   
742           NaN           NaN           NaN           NaN           NaN   
743           NaN           NaN           NaN           NaN           NaN   
744           NaN           NaN           NaN           NaN           NaN   
745           NaN           NaN           NaN           NaN           NaN   

     feat_esm1b_5  feat_esm1b_6  feat_esm1b_7  feat_esm1b_8  feat_esm1b_9  \
0             NaN           NaN           NaN           NaN           NaN   
1             NaN           NaN           NaN           NaN           NaN   
2             NaN           NaN           NaN           NaN           NaN   
3             NaN           NaN           NaN           NaN           NaN   
4             NaN           NaN           NaN           NaN           NaN   
..            ...           ...           ...           ...           ...   
741           NaN           NaN           NaN           NaN           NaN   
742           NaN           NaN           NaN           NaN           NaN   
743           NaN           NaN           NaN           NaN           NaN   
744           NaN           NaN           NaN           NaN           NaN   
745           NaN           NaN           NaN           NaN      0.433676   

     ...  feat_esm1b_1270  feat_esm1b_1271  feat_esm1b_1272  feat_esm1b_1273  \
0    ...              NaN              NaN              NaN              NaN   
1    ...         0.539347              NaN              NaN              NaN   
2    ...              NaN              NaN              NaN              NaN   
3    ...              NaN              NaN              NaN              NaN   
4    ...              NaN              NaN              NaN              NaN   
..   ...              ...              ...              ...              ...   
741  ...              NaN              NaN              NaN              NaN   
742  ...              NaN              NaN              NaN              NaN   
743  ...              NaN              NaN              NaN              NaN   
744  ...              NaN              NaN              NaN              NaN   
745  ...              NaN              NaN              NaN              NaN   

     feat_esm1b_1274  feat_esm1b_1275  feat_esm1b_1276  feat_esm1b_1277  \
0                NaN              NaN              NaN              NaN   
1                NaN              NaN              NaN              NaN   
2                NaN              NaN              NaN              NaN   
3                NaN              NaN              NaN              NaN   
4                NaN              NaN              NaN              NaN   
..               ...              ...              ...              ...   
741              NaN              NaN              NaN              NaN   
742              NaN              NaN              NaN              NaN   
743              NaN              NaN              NaN              NaN   
744              NaN              NaN              NaN              NaN   
745              NaN              NaN              NaN              NaN   

     feat_esm1b_1278  feat_esm1b_1279  
0                NaN              NaN  
1                NaN              NaN  
2                NaN              NaN  
3                NaN              NaN  
4                NaN              NaN  
..               ...              ...  
741              NaN              NaN  
742              NaN              NaN  
743              NaN              NaN  
744              NaN              NaN  
745              NaN              NaN  

[746 rows x 1280 columns]


def showTukeyOutliersColumns(data, factor=1.5):
    # Calculate Q1, Q3, and IQR for each column
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    # Define the upper and lower bounds
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    # Identify outliers
    outliers = ((data < lower_bound) | (data > upper_bound)).any(axis=0)

    # Get the column names of the outliers
    outlier_columns = data.columns[outliers]

    return outlier_columns

# Apply the Tukey method and get the cleaned dataset and the column names of the removed outliers
outlier_columns = showTukeyOutliersColumns(EDA)


print("Outlier columns:")
print(outlier_columns)

Outlier columns:
Index(['feat_esm1b_0', 'feat_esm1b_1', 'feat_esm1b_2', 'feat_esm1b_3',
       'feat_esm1b_4', 'feat_esm1b_5', 'feat_esm1b_6', 'feat_esm1b_7',
       'feat_esm1b_8', 'feat_esm1b_9',
       ...
       'feat_esm1b_1270', 'feat_esm1b_1271', 'feat_esm1b_1272',
       'feat_esm1b_1273', 'feat_esm1b_1274', 'feat_esm1b_1275',
       'feat_esm1b_1276', 'feat_esm1b_1277', 'feat_esm1b_1278',
       'feat_esm1b_1279'],
      dtype='object', length=1226)


def plot_min_max(dataset):
    maxima = dataset.max().values
    minima = dataset.min().values
    # Plot the maxima and minima vectors against each other
    plt.scatter(minima, maxima)
    plt.plot(minima, maxima, color="red")
    plt.title("Plot of Maximum against Minimum")
    plt.xlabel("Minima")
    plt.ylabel("Maxima")
    plt.show()
    
plot_min_max(EDA)


def plot_range_histogram(dataset):
    # Calculate the ranges of each variable
    range = dataset.max() - dataset.min()
    plt.hist(range, bins=20)
    plt.title("Range Histogram")
    plt.xlabel("Range")
    plt.ylabel("Frequency")
    plt.show()
plot_range_histogram(EDA)


def cappingOutliers(data):
    # IQR-based capping
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    return  data.clip(lower_bound, upper_bound, axis=1)

EDA = cappingOutliers(EDA)
EDA
outlier_columns = showTukeyOutliersColumns(EDA)
outlier_columns

Index([], dtype='object')


def normalise(data):
    # Create a StandardScaler object
    scaler = StandardScaler()

    # Fit the scaler to the independent variables
    scaler.fit(data)

    # Transform the independent variables
    EDA = scaler.transform(data)
    return EDA
normalise(EDA)

array([[-1.87662701, -1.02946734,  2.48825768, ...,  0.89366964,
         0.90532722, -0.39241543],
       [ 0.11252744,  0.23429552,  1.35232255, ...,  2.19151373,
         0.82103959,  0.0353679 ],
       [-0.79205547,  0.33016573, -0.33479896, ...,  1.34792475,
         1.20848352,  0.64274961],
       ...,
       [ 0.07624328, -2.02741323,  0.04729207, ..., -1.09411658,
        -0.23178897, -0.15654018],
       [-1.17895908, -2.87434743,  0.26058779, ..., -1.43190973,
         0.42849075, -1.00917383],
       [-0.67684839, -0.98056221,  0.08339662, ..., -1.43619195,
         1.59116302, -1.21257136]])


def featureReduction(data):
    # Feature reduction
    pca = PCA(30)
    EDA = pca.fit_transform(data)
    return EDA
    
EDA = featureReduction(EDA)
EDA

array([[-1.92162579, -0.23609598, -0.0310398 , ...,  0.07472657,
        -0.32679189, -0.34665208],
       [-2.06460022, -0.242356  ,  0.0436501 , ...,  0.08977408,
        -0.30075475, -0.6677496 ],
       [-2.34729446, -1.52472625, -0.07931271, ..., -0.07506423,
        -0.42556908, -0.44502854],
       ...,
       [ 1.16269808,  0.33138486, -1.0184451 , ..., -0.31976703,
        -0.40624114, -0.09973824],
       [ 1.69569303, -0.20296697, -0.36221178, ...,  0.15658449,
        -0.59622137, -0.47984221],
       [ 1.93742367,  0.84105617,  1.47378199, ...,  0.15948991,
        -0.47720172, -0.43906906]])


def model(EDA, Class, cluster):
    classifiers = [
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('XGBoost', XGBClassifier(n_estimators=100, random_state=42)),
        ('KNN', KNeighborsClassifier()),
        ('SVM', SVC(random_state=42, probability=True)), 
        ('LightGBM', LGBMClassifier(random_state=42)),  
        ('Logistic Regression', LogisticRegression(random_state=42)) 
    ]

    n_splits = 5
    sgkf = StratifiedGroupKFold(n_splits=n_splits)
    groups = cluster

    auc_scores = []

    # Convert pandas Series to NumPy arrays
    split1_features_np = np.array(EDA)
    split1_class_np = np.array(Class)

    # Replace -1 with 0 in the class labels
    split1_class_np[split1_class_np == -1] = 0

    for classifier_name, classifier in classifiers:
        clf_auc_scores = []
        for fold, (train_indices, test_indices) in enumerate(sgkf.split(split1_features_np, split1_class_np, groups)):
            X_train, y_train = split1_features_np[train_indices], split1_class_np[train_indices]
            X_test, y_test = split1_features_np[test_indices], split1_class_np[test_indices]

            # Check if there is only one class present in y_test
            if len(np.unique(y_test)) == 1:
                continue

            # Resampling the dataset
            smote = SMOTE(random_state=42)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

            # Create a pipeline with the classifier
            pipeline = Pipeline([
                (classifier_name, classifier)
            ])

            # Train and evaluate the classifier on this fold
            pipeline.fit(X_train_resampled, y_train_resampled)
            y_prob = pipeline.predict_proba(X_test)[:, 1]
            auc_score = roc_auc_score(y_test, y_prob)
            clf_auc_scores.append(auc_score)

        # Compute the average AUC score for the classifier
        avg_auc_score = np.mean(clf_auc_scores)
        auc_scores.append(avg_auc_score)
        print(f"Average AUC score for {classifier_name}: {avg_auc_score}")


get_info_cluster(full_dataset,'Info_cluster', 'Class')


# Load the dataset training level2
dataset2 = load_data("df_training_level2.csv")
dataset2.shape

(4946, 1294)


table = get_info_cluster(dataset2,'Info_cluster', 'Class')
table


print("Missing values Columns\n",dataset2.isnull().sum())
print("Total missing values: ",dataset2.isnull().sum().sum())

Missing values Columns
 Info_PepID            0
Info_organism_id      0
Info_protein_id       0
Info_pos              0
Info_AA               0
                   ... 
feat_esm1b_1275     521
feat_esm1b_1276     521
feat_esm1b_1277     521
feat_esm1b_1278     521
feat_esm1b_1279     521
Length: 1294, dtype: int64
Total missing values:  666880


def dropMissingValue(dataset):
    # Define the threshold for the number of missing values
    threshold = 0.5
    # columns with missing values above the threshold
    missing_values_count = dataset.isnull().sum()
    columns_to_drop = missing_values_count[missing_values_count/dataset.shape[0] > threshold].index

    # rows with missing values above the threshold
    missing_values_count = dataset.isnull().sum(axis=1)
    rows_to_drop = missing_values_count[missing_values_count/dataset.shape[1] > threshold].index
    
    # Drop columns with missing values above the threshold
    dataset.drop(columns_to_drop, axis=1, inplace=True)

    # Drop rows with missing values above the threshold
    dataset.drop(rows_to_drop, axis=0, inplace=True)
dropMissingValue(dataset2)
print(dataset2.isnull().sum().sum())

0


def split_dataset(dataset):

    # create a set of unique info_clusters in the dataset
    unique_clusters = set(dataset['Info_cluster'])

    # determine the size of each split
    n_clusters = len(unique_clusters)
    split_sizes = [int(n_clusters*0.6), int(n_clusters*0.3), int(n_clusters*0.1)]

    # create empty lists to hold the clusters for each split
    splits = [[] for i in range(3)]

    # loop through the unique clusters and add them to the appropriate split
    for cluster in unique_clusters:
        # determine which split this cluster should go in
        if len(splits[0]) < split_sizes[0]:
            split_idx = 0
        elif len(splits[1]) < split_sizes[1]:
            split_idx = 1
        else:
            split_idx = 2
        # add the cluster to the appropriate split
        splits[split_idx].append(cluster)

    # now use the split info_clusters to create three dataframes
    split1 = dataset[dataset['Info_cluster'].isin(splits[0])]
    split2 = dataset[dataset['Info_cluster'].isin(splits[1])]
    split3 = dataset[dataset['Info_cluster'].isin(splits[2])]

    return split1, split2, split3


split1, split2, split3 = split_dataset(dataset2)


def infoFeatureClass(split1):
    split1_info = split1.iloc[:,12]
    split1_features = split1.iloc[:,14:]
    split1_class = split1.iloc[:,13]
    return split1_info,split1_features,split1_class

split1_info, split1_features, split1_class = infoFeatureClass(split1)

print("split1_features shape:", split1_features.shape)
print("split1_class shape:", split1_class.shape)
print("split1_info shape:", split1_info.shape)

split1_features shape: (3233, 1280)
split1_class shape: (3233,)
split1_info shape: (3233,)


def missingValuesMean(dataset):
    # Define the imputer object
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

    # Apply the imputer to the remaining missing values
    imputer.fit(dataset)
    dataset_imputed = imputer.transform(dataset)
    # Convert the NumPy array to a Pandas DataFrame
    dataset_imputed = pd.DataFrame(dataset_imputed, columns=dataset.columns)

    return dataset_imputed

split1_features = missingValuesMean(split1_features)
split1_features.isnull().sum().sum()

0


# skewness features
high_skewness_features = skewness(split1_features)

# Print the names of the features with high skewness values
print("Features with high skewness:")
print(high_skewness_features)

Features with high skewness:
Index(['feat_esm1b_450'], dtype='object')


# Apply the IQR method to display the outliers
outliers = display_outliers_iqr(split1_features)

print("Outliers:")
print(outliers)

Outliers:
      feat_esm1b_0  feat_esm1b_1  feat_esm1b_2  feat_esm1b_3  feat_esm1b_4  \
0              NaN           NaN           NaN           NaN           NaN   
1              NaN           NaN           NaN           NaN           NaN   
2              NaN           NaN           NaN           NaN           NaN   
3              NaN           NaN           NaN           NaN           NaN   
4              NaN           NaN           NaN           NaN           NaN   
...            ...           ...           ...           ...           ...   
3228           NaN           NaN           NaN           NaN           NaN   
3229           NaN           NaN           NaN           NaN           NaN   
3230           NaN           NaN           NaN           NaN           NaN   
3231           NaN           NaN           NaN           NaN           NaN   
3232           NaN           NaN           NaN           NaN           NaN   

      feat_esm1b_5  feat_esm1b_6  feat_esm1b_7  feat_esm1b_8  feat_esm1b_9  \
0              NaN           NaN           NaN           NaN           NaN   
1              NaN           NaN           NaN           NaN           NaN   
2              NaN           NaN           NaN           NaN           NaN   
3              NaN           NaN           NaN           NaN           NaN   
4              NaN           NaN           NaN           NaN           NaN   
...            ...           ...           ...           ...           ...   
3228           NaN           NaN           NaN           NaN           NaN   
3229           NaN           NaN           NaN           NaN           NaN   
3230           NaN           NaN           NaN           NaN           NaN   
3231           NaN           NaN           NaN           NaN           NaN   
3232           NaN           NaN           NaN           NaN           NaN   

      ...  feat_esm1b_1270  feat_esm1b_1271  feat_esm1b_1272  feat_esm1b_1273  \
0     ...              NaN              NaN              NaN              NaN   
1     ...              NaN              NaN              NaN              NaN   
2     ...              NaN              NaN              NaN              NaN   
3     ...              NaN              NaN              NaN              NaN   
4     ...              NaN              NaN              NaN              NaN   
...   ...              ...              ...              ...              ...   
3228  ...              NaN              NaN              NaN              NaN   
3229  ...              NaN              NaN              NaN              NaN   
3230  ...              NaN              NaN              NaN              NaN   
3231  ...              NaN              NaN              NaN              NaN   
3232  ...              NaN              NaN              NaN              NaN   

      feat_esm1b_1274  feat_esm1b_1275  feat_esm1b_1276  feat_esm1b_1277  \
0                 NaN              NaN              NaN              NaN   
1                 NaN              NaN        -0.507386              NaN   
2                 NaN              NaN              NaN              NaN   
3                 NaN              NaN              NaN              NaN   
4                 NaN              NaN              NaN              NaN   
...               ...              ...              ...              ...   
3228              NaN              NaN              NaN              NaN   
3229              NaN              NaN              NaN              NaN   
3230              NaN              NaN              NaN              NaN   
3231              NaN              NaN              NaN              NaN   
3232              NaN              NaN              NaN              NaN   

      feat_esm1b_1278  feat_esm1b_1279  
0                 NaN              NaN  
1                 NaN              NaN  
2                 NaN              NaN  
3                 NaN              NaN  
4                 NaN              NaN  
...               ...              ...  
3228              NaN              NaN  
3229              NaN              NaN  
3230              NaN              NaN  
3231              NaN              NaN  
3232              NaN              NaN  

[3233 rows x 1280 columns]


outlier_columns = showTukeyOutliersColumns(split1_features)

print("Outlier columns:")
print(outlier_columns)

Outlier columns:
Index(['feat_esm1b_0', 'feat_esm1b_1', 'feat_esm1b_2', 'feat_esm1b_3',
       'feat_esm1b_4', 'feat_esm1b_5', 'feat_esm1b_6', 'feat_esm1b_7',
       'feat_esm1b_8', 'feat_esm1b_9',
       ...
       'feat_esm1b_1270', 'feat_esm1b_1271', 'feat_esm1b_1272',
       'feat_esm1b_1273', 'feat_esm1b_1274', 'feat_esm1b_1275',
       'feat_esm1b_1276', 'feat_esm1b_1277', 'feat_esm1b_1278',
       'feat_esm1b_1279'],
      dtype='object', length=1278)


split1_features = cappingOutliers(split1_features)
split1_features
outlier_columns = showTukeyOutliersColumns(split1_features)
outlier_columns

Index([], dtype='object')


split1_features = normalise(split1_features)


print("split1_features shape:", split1_features.shape)
print("split1_class shape:", split1_class.shape)
print("split1_info shape:", split1_info.shape)

split1_features shape: (3233, 1280)
split1_class shape: (3233,)
split1_info shape: (3233,)


split1_features = featureReduction(split1_features)


print("split1_features shape:", split1_features.shape)
print("split1_class shape:", split1_class.shape)
print("split1_info shape:", split1_info.shape)

split1_features shape: (3233, 30)
split1_class shape: (3233,)
split1_info shape: (3233,)


split1_class.value_counts()

-1    2791
 1     442
Name: Class, dtype: int64


class_distribution(split1)


model(split1_features, split1_class, split1_info)

Average AUC score for Random Forest: 0.5743310399336311
Average AUC score for XGBoost: 0.5590643503548275
Average AUC score for KNN: 0.6229216971638268
Average AUC score for SVM: 0.6208648738316385
Average AUC score for LightGBM: 0.6056957129830778
Average AUC score for Logistic Regression: 0.6471612801039476


split2_info, split2_features, split2_class = infoFeatureClass(split2)
missingValuesMean(split2_features)
split2_features = cappingOutliers(split2_features)
split2_features = normalise(split2_features)
split2_features = featureReduction(split2_features)


def hyperparameter(split2_info, split2_features, split2_class):
    # Second split
    split2_info = split2_info
    split2_features = split2_features
    split2_class = split2_class.values

    classifiers = [
        ('Random Forest', RandomForestClassifier(random_state=42)),
        ('XGBoost', XGBClassifier(random_state=42)),
        ('KNN', KNeighborsClassifier()),
        ('SVM', SVC(random_state=42, probability=True)),
        ('LightGBM', LGBMClassifier(random_state=42)),
        ('Logistic Regression', LogisticRegression(random_state=42))
    ]

    groups = split2_info

    # Set n_splits equal to the number of unique groups
    n_splits = len(np.unique(groups))
    gkf = GroupKFold(n_splits=n_splits)

    
    # Replace -1 with 0 in the class labels
    split2_class[split2_class == -1] = 0

    # Define hyperparameter grids for each classifier
    param_grids = {
        'Random Forest': {'Random Forest__n_estimators': [50, 100, 150],
                          'Random Forest__max_depth': [10, 20, 30],
                          'Random Forest__min_samples_split': [2, 5, 10]},
        'XGBoost': {'XGBoost__n_estimators': [50, 100, 150],
                    'XGBoost__max_depth': [3, 6, 9],
                    'XGBoost__learning_rate': [0.01, 0.1, 1.0]},
        'KNN': {'KNN__n_neighbors': [3, 5, 7],
                'KNN__weights': ['uniform', 'distance']},
        'SVM': {'SVM__C': [0.1, 1, 10],  # Added hyperparameters for Support Vector Machine
                'SVM__kernel': ['linear', 'rbf']},
        'LightGBM': {'LightGBM__n_estimators': [50, 100, 150],  # Added hyperparameters for LightGBM
                     'LightGBM__max_depth': [3, 6, 9],
                     'LightGBM__learning_rate': [0.01, 0.1, 1.0]},
        'Logistic Regression': {'Logistic Regression__C': [0.1, 1, 10],  # Added hyperparameters for Logistic Regression
                                'Logistic Regression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
    }


    for classifier_name, classifier in classifiers:
        
        for fold, (train_indices, test_indices) in enumerate(gkf.split(split2_features, split2_class, groups)):
            X_train, y_train = split2_features[train_indices], split2_class[train_indices]
            X_test, y_test = split2_features[test_indices], split2_class[test_indices]

            # Check if there is only one class present in y_test
            if len(np.unique(y_test)) == 1:
                continue

            # Resampling the dataset
            smote = SMOTE(random_state=42)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

            # Create a pipeline with the classifier
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                (classifier_name, classifier)
            ])


            # Create a GridSearchCV object for hyperparameter tuning
            grid_search = GridSearchCV(pipeline, param_grid=param_grids[classifier_name], scoring='roc_auc', cv=n_splits)

            # Train and evaluate the classifier on this fold
            grid_search.fit(X_train_resampled, y_train_resampled)
            y_prob = grid_search.predict_proba(X_test)[:, 1]

        

        # Print the best hyperparameters for the classifier
        print(f"Best hyperparameters for {classifier_name}: {grid_search.best_params_}")

    
    
    
hyperparameter(split2_info, split2_features, split2_class)

Best hyperparameters for Random Forest: {'Random Forest__max_depth': 10, 'Random Forest__min_samples_split': 2, 'Random Forest__n_estimators': 100}
Best hyperparameters for XGBoost: {'XGBoost__learning_rate': 0.1, 'XGBoost__max_depth': 9, 'XGBoost__n_estimators': 150}
Best hyperparameters for KNN: {'KNN__n_neighbors': 7, 'KNN__weights': 'distance'}
Best hyperparameters for SVM: {'SVM__C': 10, 'SVM__kernel': 'rbf'}
Best hyperparameters for LightGBM: {'LightGBM__learning_rate': 0.1, 'LightGBM__max_depth': 9, 'LightGBM__n_estimators': 150}
Best hyperparameters for Logistic Regression: {'Logistic Regression__C': 0.1, 'Logistic Regression__solver': 'saga'}


def finalValidation(split2, split3):
    # Preprocessing split2 and split3
    split2_info, split2_features, split2_class = infoFeatureClass(split2)
    split3_info, split3_features, split3_class = infoFeatureClass(split3)

    missingValuesMean(split2_features)
    split2_features = cappingOutliers(split2_features)
    split2_features = normalise(split2_features)

    missingValuesMean(split3_features)
    split3_features = cappingOutliers(split3_features)
    split3_features = normalise(split3_features)

    # Make sure you perform the same feature reduction on split2 and split3
    split2_features = featureReduction(split2_features)
    split3_features = featureReduction(split3_features)

    # Combine split2 and split3 into a single dataset
    combined_features = np.concatenate((split2_features, split3_features), axis=0)
    combined_class = np.concatenate((split2_class, split3_class), axis=0)
    combined_info = np.concatenate((split2_info, split3_info), axis=0)

    # Define the GroupKFold splits
    group_kfold = GroupKFold(n_splits=5)
    group_kfold.get_n_splits(combined_features, combined_class, combined_info)

    # Initialize the AUC score list
    auc_scores = []

    # Loop over the splits
    for train_index, test_index in group_kfold.split(combined_features, combined_class, combined_info):
        # Split the data into training and test sets
        X_train, X_test = combined_features[train_index], combined_features[test_index]
        y_train, y_test = combined_class[train_index], combined_class[test_index]

        # Train the Logistic Regression model with the best hyperparameters
        best_params_logisticRegressor = {'C': 0.1, 'solver': 'newton-cg'}
        model_logisticRegressor = LogisticRegression(**best_params_logisticRegressor)
        model_logisticRegressor.fit(X_train, y_train)

        # Make predictions and compute the AUC score for the current split
        predictions = model_logisticRegressor.predict_proba(X_test)

        # Check if there are at least two unique classes in y_test
        unique_classes = np.unique(y_test)
        if len(unique_classes) >= 2:
            auc_score = roc_auc_score(y_test, predictions[:, 1])
            auc_scores.append(auc_score)

    # Calculate the mean AUC score
    mean_auc_score = np.mean(auc_scores)
    print("Mean AUC score for the Logistic Regression model:", mean_auc_score)
    
    # Return the trained model object
    return model_logisticRegressor
    
# Train the Logistic Regression model
model = finalValidation(split2, split3)

Mean AUC score for the Logistic Regression model: 0.5534771627480454


def predict_class(dataset_name, model):
    # Load the dataset
    dataset = pd.read_csv(dataset_name)

    # Extract the features from the dataset
    features = dataset.iloc[:, 13:]

    # Preprocess the features
    missingValuesMean(features)
    features = cappingOutliers(features)
    features = normalise(features)
    features = featureReduction(features)
# 
    # Use the model to predict the class of each instance
    predicted_classes = model.predict(features)

    # Add the predicted classes as a new column in the dataset
    dataset['predicted_class'] = predicted_classes

    # Save the updated dataset to a CSV file
    dataset.to_csv('predictions.csv', index=False)

    # Print the predicted classes
    print("Predicted classes for all instances:", predicted_classes)
# Use the trained model to make predictions
predict_class('df_holdout.csv', model)

Predicted classes for all instances: [1 1 1 ... 1 1 1]

	Info_PepID	Info_organism_id	Info_protein_id	Info_pos	Info_AA	Info_pubmed_id	Info_epitope_id	Info_host_id	Info_nPos	...	feat_esm1b_1270	feat_esm1b_1271	feat_esm1b_1272	feat_esm1b_1273	feat_esm1b_1274	feat_esm1b_1275	feat_esm1b_1276	feat_esm1b_1277	feat_esm1b_1278	feat_esm1b_1279
0	CAA51871.1:2	12161	CAA51871.1	685	S	11458006	60725	10000000	2	...	0.178513	-0.257270	-0.153925	0.014767	-1.294921	-0.112832	0.260342	0.123651	0.159365	0.172829
1	CAA51871.1:2	12161	CAA51871.1	686	R	11458006	60725	10000000	2	...	0.539347	-0.173580	-0.122266	0.235858	-1.230598	-0.060592	0.160817	0.310983	0.146951	0.240393
2	CAA51871.1:2	12161	CAA51871.1	687	L	11458006	60725	10000000	2	...	0.224537	-0.165938	-0.125078	0.131652	-1.359426	0.020718	0.160984	0.189219	0.204018	0.336321
3	CAA51871.1:2	12161	CAA51871.1	688	L	11458006	60725	10000000	2	...	0.173186	-0.069608	-0.133053	0.043285	-1.559416	-0.032758	0.099643	0.117604	0.112384	0.367813
4	CAA51871.1:2	12161	CAA51871.1	689	E	11458006	60725	10000000	2	...	0.136331	-0.068715	0.032138	0.099051	-1.643639	-0.199724	0.076023	-0.128873	0.127291	0.278798

	Info_organism_id	Info_pos	Info_cluster	Class	feat_esm1b_0	feat_esm1b_1	feat_esm1b_2	feat_esm1b_3	feat_esm1b_4	feat_esm1b_5	...	feat_esm1b_1270	feat_esm1b_1271	feat_esm1b_1272	feat_esm1b_1273	feat_esm1b_1274	feat_esm1b_1275	feat_esm1b_1276	feat_esm1b_1277	feat_esm1b_1278	feat_esm1b_1279
count	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	...	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000	746.000000
mean	14617.840483	123.189008	222.260054	0.225201	0.081334	0.155190	0.045183	0.093532	-0.187883	-0.079048	...	0.082458	-0.026580	-0.052922	0.130508	-0.912585	-0.068948	0.098828	-0.005770	0.026254	0.234138
std	7744.949567	248.480251	36.833483	0.974966	0.134752	0.135894	0.153447	0.126711	0.150018	0.171420	...	0.149778	0.133427	0.138832	0.141839	0.326053	0.151345	0.151581	0.145622	0.148122	0.159960
min	12161.000000	1.000000	199.000000	-1.000000	-0.605968	-0.368321	-0.453714	-0.270026	-0.623107	-0.587313	...	-0.252364	-0.426012	-0.438437	-0.298351	-1.749543	-0.402438	-0.344357	-0.456177	-0.401215	-0.326890
25%	12242.000000	49.000000	204.000000	-1.000000	-0.005342	0.056017	-0.045173	0.007387	-0.283482	-0.200325	...	-0.025213	-0.118891	-0.141919	0.036957	-1.142586	-0.176104	0.003478	-0.091283	-0.083311	0.135777
50%	12242.000000	83.000000	204.000000	1.000000	0.093366	0.159134	0.050690	0.102058	-0.190992	-0.087125	...	0.070693	-0.026712	-0.052635	0.129109	-0.929380	-0.082213	0.099465	0.008209	0.022796	0.249037
75%	12242.000000	123.000000	215.000000	1.000000	0.177607	0.248278	0.140758	0.177299	-0.090165	0.035861	...	0.179583	0.066112	0.049650	0.223102	-0.693119	0.009155	0.192338	0.099157	0.125548	0.340555
max	55951.000000	2498.000000	320.000000	1.000000	0.431492	0.533593	0.464730	0.484326	0.291663	0.476413	...	0.546722	0.349790	0.337160	0.576059	0.692667	0.447825	0.676943	0.343372	0.521267	0.624794

	Info_cluster	1	-1
0	34	1505	128
1	35	1513	110
2	36	183	61
3	39	33	0
4	150	283	281
5	198	12	31
6	199	0	30
7	204	289	227
8	215	0	20
9	222	0	15
10	229	0	54
11	232	0	45
12	256	0	7
13	264	0	8
14	298	0	69
15	320	0	42

Linear B-cell epitopes Prediction¶

ABSTRACT¶

INTRODUCTION¶

Dataset¶

Requirement¶

AIm¶

Packages used in this project can be grouped based on their purpose as follows:¶

Loading Dataset:¶

Data Visualisation:¶

Dimensionality Reduction:¶

Data Preprocessing:¶

Handling Imbalanced Data:¶

Modelling:¶

Model Evaluation:¶

Model Selection and Validation:¶

Creating Pipelines:¶

Importing all the libraries¶

Exploratory Data Analysis and Pre-processing¶

Missing Values¶

Class Imbalance¶

Outliers¶

t-SNE plot¶

Skewness¶

IQR¶

Tukey Method¶

Normalisation¶

Scatter plot of the minimum values against the maximum values¶

Histogram Ranges¶

Data Preprocessing on Level1¶

Removing outliers from the dataset¶

Z-score or IQR-based capping¶

Split the dataset into dependent and independent variables¶

Normalisation¶

Feature Reduction¶

Model Validation, Class Balancing and Modelling¶

GroupKFold¶

Models¶

AUC¶

Exploratory Data Analysis (EDA) Level2¶

Merge EDA and dataset2 for Preprocessing¶

Check for Data Distribution across Info_cluster¶

Missing Values¶

Outliers¶

Skewness¶

IQR¶

Tukey¶

Capping the Outliers¶

Normalisation¶

Feature scaling¶

DImension Reduction¶

Data Balancing¶

Model Validation, Class Balancing and Modelling¶

Hyperparameter Tuning¶

Preprocessing on the split2¶

Final Validation¶

Final Pipeline to predict the holdout.csv¶

Conclusion¶

References¶