Installs and Imports¶

In [8]:
import re
import string
import subprocess
import sys
import warnings

warnings.filterwarnings('ignore')

REQS = [
    ('pip', 'pip==24.2'),
    ('lightgbm', 'lightgbm==4.5.0'),
    ('matplotlib', 'matplotlib==3.9.2'),
    ('mlxtend', 'mlxtend==0.23.1'),
    ('nltk', 'nltk==3.9.1'),
    ('numpy', 'numpy==2.0.2'),
    ('optuna', 'optuna==4.0.0'),
    ('pandas', 'pandas==2.2.2'),
    ('seaborn', 'seaborn==0.13.2'),
    ('sklearn', 'scikit-learn==1.5.2'),
    ('statsmodels', 'statsmodels==0.14.3'),
    ('umap-learn', 'umap-learn==0.5.6'),
    ('xgboost', 'xgboost==2.1.1'),
]

try:
    subprocess.check_call([sys.executable, '-m', 'ensurepip'])
except Exception as e:
    print(e, file=sys.stderr)


def ensure_installed(module_info):
    _, install_str = module_info
    try:
        subprocess.check_call([sys.executable, '-m',
                               'pip', 'install', '--quiet',
                               install_str])
        print(f'Installed "{install_str}".')
    except Exception as e:
        print(e, file=sys.stderr)


for m in REQS:
    ensure_installed(m)

# Standard libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Machine learning and data processing
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    calinski_harabasz_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    silhouette_score
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Statistical modeling
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Dimensionality reduction
import umap

# Hyperparameter optimization
import optuna

# Other machine learning libraries
import lightgbm as lgb
from xgboost import XGBClassifier
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


def find_columns_with_missing(data, columns):
    """Finding features that have a lot of missing data"""
    print()
    print('Finding columns with missing data...')
    data_cleaned = data
    missing = []
    i = 0
    for col in columns:
        missing.append(data[col].isnull().sum())
        if missing[i] > 0:
            print()
            print(f'Column {col} is missing {missing[i]} values.')
            print(f'Proportion of missing data is {missing[i]/len(data)}.')
            if missing[i]/len(data) >= 0.9:
                print(f'Dropping column {col}...')
                data_cleaned = data_cleaned.drop(columns=col)
        i += 1
    return missing, data_cleaned


def hex_to_rgb(hex_color):
    """Function to convert hex to RGB"""
    # Remove the '#' if it exists
    hex_color = hex_color.lstrip('#')

    # Convert hex to integer and split into RGB components
    return [int(hex_color[i:i+2], 16) for i in (0, 2, 4)]


def preprocess_text(text):
    """Preprocessing function"""
    text = text.lower()
    # Remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))  # Removes punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    return ' '.join(tokens)


def plot_silhouette_bar_across_experiments(model_names, silhouette_scores):
    n_experiments = len(silhouette_scores)
    n_models = len(model_names)
    bar_width = 0.2
    index = np.arange(n_experiments)
    plt.figure(figsize=(12, 6))

    for i, model_name in enumerate(model_names):
        sil_scores = [exp_scores[i] for exp_scores in silhouette_scores]
        plt.bar(index + i * bar_width,sil_scores, bar_width, label=model_name)

    plt.xlabel('Experiments')
    plt.ylabel('Silhouette scores')
    plt.title('Silhouette scores Across Models and Experiments')
    plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
    plt.legend()
    plt.tight_layout()
    plt.show()


def visualize_ch_index_across_experiments(model_names, ch_scores):

    n_experiments = len(ch_scores)
    n_models = len(model_names)
    bar_width = 0.2
    index = np.arange(n_experiments)
    plt.figure(figsize=(12, 6))

    for i, model_name in enumerate(model_names):
        ch_score = [exp_scores[i] for exp_scores in ch_scores]
        plt.bar(index + i * bar_width, ch_score, bar_width, label=model_name)

    plt.xlabel('Experiments')
    plt.ylabel('Calinski-Harabasz Index')
    plt.title('Calinski-Harabasz Index Across Models and Experiments')
    plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
    plt.legend()
    plt.tight_layout()
    plt.show()


class KMeansClustering:
    def __init__(self, data):
        self.data = data
        self.best_params = None
        self.kmeans_model = None

    def tune_hyperparameters(self, n_trials=15):
        def objective_kmeans(trial):
            n_clusters = trial.suggest_int('n_clusters', 2, 10)
            init_method = trial.suggest_categorical('init', ['k-means++', 'random'])

            kmeans = KMeans(n_clusters=n_clusters, init=init_method, random_state=42)
            kmeans.fit(self.data)
            labels = kmeans.labels_
            score = silhouette_score(self.data, labels)
            return score

        study = optuna.create_study(direction="maximize")
        study.optimize(objective_kmeans, n_trials=n_trials)
        self.best_params = study.best_params
        print("Best params:", self.best_params)

    def fit_model(self):
        self.kmeans_model = KMeans(n_clusters=self.best_params['n_clusters'],
                                   init=self.best_params['init'],
                                   random_state=42)
        self.kmeans_model.fit(self.data)

    def visualize_clusters(self, umap_embedding, feature):
        labels = self.kmeans_model.labels_
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')
        # Scatter plot in 3D
        scatter = ax.scatter(
            umap_embedding[:, 0],
            umap_embedding[:, 1],
            umap_embedding[:, 2],
            c=labels,
            cmap='viridis',
            s=30
        )
        # Add labels and title
        ax.set_xlabel('UMAP Dimension 1')
        ax.set_ylabel('UMAP Dimension 2')
        ax.set_zlabel('UMAP Dimension 3')
        plt.title(f'3D UMAP of K-Means Clusters on {feature}')
        # Add a color bar for better visual distinction of clusters
        plt.colorbar(scatter)
        # Show the plot
        plt.show()

    def plot_elbow_method(self, k_range=(2, 10)):
        """
        Plot the Elbow Method for choosing the optimal number of clusters
        Args:
        - k_range: tuple, range of cluster numbers to evaluate
        """
        inertia = []
        K = range(k_range[0], k_range[1] + 1)
        for k in K:
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(self.data)
            inertia.append(kmeans.inertia_)  # Sum of squared distances to closest cluster center

        plt.figure(figsize=(8, 6))
        plt.plot(K, inertia, 'bo-', markersize=8)
        plt.title('Elbow Method for Optimal K')
        plt.xlabel('Number of clusters')
        plt.ylabel('Inertia (Sum of squared distances)')
        plt.grid(True)
        plt.show()

    def output_label(self):
        return self.kmeans_model.labels_

    def silhoutte(self):
        score = silhouette_score(self.data, self.kmeans_model.labels_)
        print(f'The Silhouette score is {score}')
        return score

    def calinski(self):
        if len(np.unique(self.kmeans_model.labels_)) > 1:  # Only calculate if there are clusters
            score = calinski_harabasz_score(self.data, self.kmeans_model.labels_)
        else:
            score = np.nan  # If only one cluster (or all noise), set to NaN
        print(f'The Callinski index is {score}')
        return score


class DBSCANClustering:
    def __init__(self, data):
        self.data = data
        self.best_params = None
        self.dbscan_model = None

    def tune_hyperparameters(self, n_trials=15):
        def objective_dbscan(trial):
            eps = trial.suggest_float('eps', 0.1, 2.0)
            min_samples = trial.suggest_int('min_samples', 3, 20)

            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            dbscan.fit(self.data)
            labels = dbscan.labels_
            if len(set(labels)) > 1:
                score = silhouette_score(self.data, labels)
            else:
                score = -1
            return score

        study = optuna.create_study(direction="maximize")
        study.optimize(objective_dbscan, n_trials=n_trials)
        self.best_params = study.best_params
        print("Found best params:", self.best_params)

    def fit_model(self):
        self.dbscan_model = DBSCAN(eps=self.best_params['eps'], min_samples=self.best_params['min_samples'])
        self.dbscan_model.fit(self.data)

    def visualize_clusters_and_outliers_3D(self, umap_embedding, feature):
        labels = self.dbscan_model.labels_

        # Separate clustered points and noise points
        clustered_points = umap_embedding[labels >= 0]  # Points part of a cluster
        clustered_labels = labels[labels >= 0]
        outliers = umap_embedding[labels == -1]  # Noise points

        # Create a 3D plot
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111, projection='3d')

        # Plot the clustered points in different colors
        scatter = ax.scatter(clustered_points[:, 0], clustered_points[:, 1], clustered_points[:, 2],
                             c=clustered_labels, cmap='viridis', s=30)

        # Plot the outliers (noise points) in red with 'x' markers
        ax.scatter(outliers[:, 0], outliers[:, 1], outliers[:, 2], c='red', marker='x', s=80, label='Outliers')

        # Add labels and title
        ax.set_xlabel('UMAP Dimension 1')
        ax.set_ylabel('UMAP Dimension 2')
        ax.set_zlabel('UMAP Dimension 3')
        ax.set_title(f'DBSCAN 3D Clusters with Outliers on {feature}')
        # Add a legend and color bar for clusters
        plt.legend()
        plt.colorbar(scatter, ax=ax)
        plt.show()

    def output_label(self):
        return self.dbscan_model.labels_

    def silhoutte(self):
        score = silhouette_score(self.data, self.dbscan_model.labels_)
        print(f'The Silhouette score is {score}')
        return score

    def calinski(self):
        if len(np.unique(self.dbscan_model.labels_)) > 1:  # Only calculate if there are clusters
            score = calinski_harabasz_score(self.data, self.dbscan_model.labels_)
        else:
            score = np.nan  # If only one cluster (or all noise), set to NaN
        print(f'The Callinski index is {score}')
        return score


class ClusteringDataRetriever:
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def get_data_with_labels(self):
        # If Data is in a numpy array, convert it to a pandas DataFrame
        if isinstance(self.data, np.ndarray):
            df = pd.DataFrame(self.data)
        else:
            df = self.data.copy()  # If already a DataFrame

        # Add a new column for the cluster labels
        df['Cluster_Label'] = self.labels

        return df[['gender', 'gender:confidence', 'Cluster_Label']]

    def get_cluster_data(self, cluster_label):
        # Retrieve data points belonging to a specific cluster.
        df = self.get_data_with_labels()
        return df[df['Cluster_Label'] == cluster_label]

    def get_noise_data(self):
        # Retrieve Records classified as noise (-1 label) in DBSCAN.
        return self.get_cluster_data(-1)
Installed "pip==24.2".
Installed "lightgbm==4.5.0".
Installed "matplotlib==3.9.2".
Installed "mlxtend==0.23.1".
Installed "nltk==3.9.1".
Installed "numpy==2.0.2".
Installed "optuna==4.0.0".
Installed "pandas==2.2.2".
Installed "seaborn==0.13.2".
Installed "scikit-learn==1.5.2".
Installed "statsmodels==0.14.3".
Installed "umap-learn==0.5.6".
Installed "xgboost==2.1.1".

EDA¶

In [2]:
# Main starts here
# Load the dataset
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')

# Quick view of the dataset
print()
print('Dataset Overview')
print(df.info())
print(df.head())

all_features = df.columns

missing_col, df_cleaned = find_columns_with_missing(df, all_features)

# Dropping rows where 'gender' is missing
df_cleaned = df_cleaned.dropna(subset=['gender'])

# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])

# Now that we have handled the missing data, you can proceed with further analysis
print()
print('Dataset Overview')
print(df_cleaned.info())
print(df_cleaned.head())

print()
print('---- EXPLORATORY DATA ANALYSIS (EDA) ----')

current_num_features = df.select_dtypes(include=[np.number])

# Plot distribution of each numerical feature with gender as hue using seaborn
for feature in current_num_features:
    plt.figure(figsize=(8, 6))
    sns.histplot(df_cleaned, x=feature, hue='gender', bins=30, kde=True)
    plt.title(f'Distribution of {feature} by Gender')
    plt.show()

# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()

# Plot distribution of 'tweet_count' and 'retweet_count'
for column in ['tweet_count', 'retweet_count']:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df_cleaned, x=column, kde=True, bins=30)
    plt.title(f'Distribution of {column.replace("_", " ").capitalize()}')
    plt.show()

# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year

# Ensure 'created' and tweet_created are in datetime format
df_cleaned['created'] = pd.to_datetime(df_cleaned['created'], errors='coerce')
df_cleaned['tweet_created'] = pd.to_datetime(df_cleaned['tweet_created'], errors='coerce')

# assuming Data was up-to-date
df_cleaned['account_age'] = (pd.Timestamp.now() - df_cleaned['created']).dt.days

df_cleaned['tweets_per_day'] = df_cleaned['tweet_count'] / df_cleaned['account_age']
df_cleaned['retweets_per_day'] = df_cleaned['retweet_count'] / df_cleaned['account_age']
df_cleaned['favorites_per_day'] = df_cleaned['fav_number'] / df_cleaned['account_age']

# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('count')
plt.show()

# Plotting the histogram of tweets per day
plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['tweets_per_day'], bins=50, kde=True)
plt.title('Distribution of Tweets Per Day')
plt.xlabel('Tweets Per Day')
plt.ylabel('Frequency')
plt.show()

# show the relationship between account age and tweets per day
plt.figure(figsize=(10, 6))
sns.scatterplot(x='account_age', y='tweets_per_day', data=df_cleaned)
plt.title('Account Age vs. Tweets Per Day')
plt.xlabel('Account Age (Days)')
plt.ylabel('Tweets Per Day')
plt.show()

# Exploring 'link_color' and 'sidebar_color' features

# Check number of NaN value in  'link_color' and 'sidebar_color' features
link_color_nan_count = df_cleaned['link_color'].isnull().sum()
sidebar_color_nan_count = df_cleaned['sidebar_color'].isnull().sum()

print()
print(f"Number of NaN values in 'link_color': {link_color_nan_count}.")
print(f"Number of NaN values in 'sidebar_color': {sidebar_color_nan_count}.")

# Check how many available colors in 'link_color' and 'sidebar_color' features
link_color_count = len(df_cleaned['link_color'].unique())
sidebar_color_count = len(df_cleaned['sidebar_color'].unique())
print(f'Number of link color is {link_color_count}.')
print(f'Number of side bar color is {sidebar_color_count}.')

# Apply the function to 'link_color' and 'sidebar_color'
df_cleaned['link_color'] = df_cleaned['link_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
df_cleaned['sidebar_color'] = df_cleaned['sidebar_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')

# Drop rows where 'sidebar_color' is still NaN
df_cleaned = df_cleaned.dropna(subset=['link_color'])
df_cleaned = df_cleaned.dropna(subset=['sidebar_color'])
print(f"Number of NaN values in 'link_color': {df_cleaned['link_color'].isnull().sum()}")
print(f"Number of NaN values in 'sidebar_color': {df_cleaned['sidebar_color'].isnull().sum()}")

# top 15 colors
top_sidebar_colors = df_cleaned['sidebar_color'].value_counts().iloc[:15].index.tolist()
top_link_colors = df_cleaned['link_color'].value_counts().iloc[:15].index.tolist()
# print(top_sidebar_colors)

# Extract top 10 most common sidebar colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index, palette=top_sidebar_colors)
plt.title('Top 15 Most Common Profile sidebar_color')
plt.ylabel('Sidebar Color')
plt.xlabel('count')
plt.grid()
plt.show()

# Extract top 10 most common link colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:15].index, palette=top_link_colors)
plt.title('Top 15 Most Common Profile link_color')
plt.ylabel('Link Color')
plt.xlabel('count')
plt.grid()
plt.show()

# count plot for sidebar_color vs. gender
plt.figure(figsize=(10, 6))
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.countplot(x='sidebar_color', hue='gender', data=df_cleaned,
              order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common Sidebar Colors by Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()

# count plot for link_color vs. gender
plt.figure(figsize=(10, 6))
sns.countplot(x='link_color', hue='gender', data=df_cleaned,
              order=df_cleaned['link_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common link Colors by Gender')
plt.xlabel('Link Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()

# Scatter plot for link_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='link_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['link_color'].isin(top_link_colors)],
                palette='Set2', s=100, alpha=0.7)
plt.title('Link Colors vs. Tweet count with Gender')
plt.xlabel('Link Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()

# Scatter plot for sidebar_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sidebar_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['sidebar_color'].isin(top_sidebar_colors)],
                palette='Set2', s=100, alpha=0.7)
plt.title('Sidebar Colors vs. Tweet count with Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()

# Select columns to be used
col = ['gender', 'gender:confidence', 'description', 'favorites_per_day','link_color',
       'retweets_per_day', 'sidebar_color', 'text', 'tweets_per_day','user_timezone', 'tweet_location', 'profile_created_year', 'tweet_created_year'
       ]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']

# Plot correlation matrix
corr_matrix = df_preprocessed.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

# Drop one feature from highly correlated pairs (correlation > 0.9)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_preprocessed = df_preprocessed.drop(columns=to_drop)

# Filling missing values for important features
df_preprocessed['user_timezone'].fillna('Unknown', inplace=True)
df_preprocessed['tweet_location'].fillna('Unknown', inplace=True)
categorical_features = ['user_timezone', 'tweet_location']

# categorise types of features

# numerical features
df_num = df_preprocessed[['retweets_per_day', 'favorites_per_day', 'tweets_per_day', 'profile_created_year', 'tweet_created_year']].copy()

# categorical features with frequency encoding
freq_encoding_location = df_preprocessed['tweet_location'].value_counts(normalize=True)
df_preprocessed['tweet_location_encoded'] = df_preprocessed['tweet_location'].map(freq_encoding_location)

freq_encoding_timezone = df_preprocessed['user_timezone'].value_counts(normalize=True)
df_preprocessed['user_timezone_encoded'] = df_preprocessed['user_timezone'].map(freq_encoding_timezone)

# gender features
# encode the 'gender' column to numeric values
df_preprocessed['gender'] = df_preprocessed['gender'].replace({'male': 0, 'female': 1, 'brand': 2})

# Check for unique values in the 'gender' column after replacement
print()
print("Unique Values in 'gender'")
print(df_preprocessed['gender'].unique())
print(df_preprocessed.info())

# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()

df_gender = df_preprocessed[['gender', 'gender:confidence']].copy()

# Drop the original categorical columns
df_preprocessed = df_preprocessed.drop(columns=categorical_features)

# Convert 'link_color' values
df_preprocessed['link_color_rgb'] = df_preprocessed['link_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
# Convert 'sidebar_color' values
df_preprocessed['sidebar_color_rgb'] = df_preprocessed['sidebar_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))

rgb_df = pd.DataFrame(df_preprocessed['link_color_rgb'].to_list(), columns=['link_R', 'link_G', 'link_B'])
rgb_df = pd.concat([rgb_df, pd.DataFrame(df_preprocessed['sidebar_color_rgb'].to_list(), columns=['sidebar_R', 'sidebar_G', 'sidebar_B'])], axis=1)

# Drop the original color features
df_preprocessed = df_preprocessed.drop(columns=['link_color', 'sidebar_color', 'link_color_rgb', 'sidebar_color_rgb'])

# Check if all required features are there
print()
print('All Remaining Features')
print(df_preprocessed.columns.tolist())

# Define the numerical features to scale (filtering for int64 and float64 columns)
numerical_features = df_preprocessed.select_dtypes(include=[np.number])
# print(f'All current numerical features are {numerical_features.columns.tolist()}')

print()
print('Dataset Overview After PreProcessing')
print(df_preprocessed.info())

print()
print('---- NLP Processing ----')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

df_preprocessed['description'].fillna('', inplace=True)
df_preprocessed['text'].fillna('', inplace=True)
# df_preprocessed['name'].fillna('', inplace=True)

# Check the text features if they still contain NaN
print()
print(df_preprocessed.select_dtypes(include=[object]))

# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Apply preprocessing to the 'description', 'text', and 'name' columns
df_preprocessed['cleaned_description'] = df_preprocessed['description'].apply(lambda x: preprocess_text(str(x)))
df_preprocessed['cleaned_text'] = df_preprocessed['text'].apply(lambda x: preprocess_text(str(x)))
# df_preprocessed['cleaned_name'] = df_preprocessed['name'].apply(lambda x: preprocess_text(str(x)))

# Check the preprocessed data with preprocessed text features
print(df_preprocessed[['description', 'cleaned_description', 'text', 'cleaned_text']].head())

# Drop the original text features
df_preprocessed = df_preprocessed.drop(columns=['description','text'])

# Initialize TFIDF vectorizer for text features
print()
print('Applying TF-IDF Vectorisation...')
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')

# Apply TF-IDF on 'description', 'text', 'name' columns

tfidf_description = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_description']).toarray()
tfidf_text = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_text']).toarray()
# tfidf_name = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_name']).toarray()

# Convert TF-IDF into DataFrames and add to df_preprocessed
tfidf_desc_df = pd.DataFrame(tfidf_description, columns=[f'desc_{i}' for i in range(tfidf_description.shape[1])])
tfidf_text_df = pd.DataFrame(tfidf_text, columns=[f'text_{i}' for i in range(tfidf_text.shape[1])])
# tfidf_name_df = pd.DataFrame(tfidf_name, columns=[f'name_{i}' for i in range(tfidf_name.shape[1])])

# Merge with main dataframe
df_preprocessed = pd.concat([df_preprocessed.reset_index(drop=True), tfidf_desc_df, tfidf_text_df], axis=1)

# Drop the cleaned text features
df_preprocessed = df_preprocessed.drop(columns=['cleaned_description', 'cleaned_text'])

df_preprocessed = pd.concat([df_preprocessed, rgb_df], axis=1)

df_asso = df_preprocessed.copy()

df_cate = df_preprocessed[['tweet_location_encoded', 'user_timezone_encoded']].copy()
Dataset Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  profile_yn_gold        50 non-null     object 
 16  profileimage           20050 non-null  object 
 17  retweet_count          20050 non-null  int64  
 18  sidebar_color          20050 non-null  object 
 19  text                   20050 non-null  object 
 20  tweet_coord            159 non-null    object 
 21  tweet_count            20050 non-null  int64  
 22  tweet_created          20050 non-null  object 
 23  tweet_id               20050 non-null  float64
 24  tweet_location         12565 non-null  object 
 25  user_timezone          12252 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(17)
memory usage: 3.8+ MB
None
    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  815719226    False   finalized                   3    10/26/15 23:24   
1  815719227    False   finalized                   3    10/26/15 23:30   
2  815719228    False   finalized                   3    10/26/15 23:33   
3  815719229    False   finalized                   3    10/26/15 23:10   
4  815719230    False   finalized                   3     10/27/15 1:15   

   gender  gender:confidence profile_yn  profile_yn:confidence  \
0    male             1.0000        yes                    1.0   
1    male             1.0000        yes                    1.0   
2    male             0.6625        yes                    1.0   
3    male             1.0000        yes                    1.0   
4  female             1.0000        yes                    1.0   

          created  ...                                       profileimage  \
0    12/5/13 1:48  ...  https://pbs.twimg.com/profile_images/414342229...   
1   10/1/12 13:51  ...  https://pbs.twimg.com/profile_images/539604221...   
2  11/28/14 11:30  ...  https://pbs.twimg.com/profile_images/657330418...   
3   6/11/09 22:39  ...  https://pbs.twimg.com/profile_images/259703936...   
4   4/16/14 13:23  ...  https://pbs.twimg.com/profile_images/564094871...   

   retweet_count sidebar_color  \
0              0        FFFFFF   
1              0        C0DEED   
2              1        C0DEED   
3              0        C0DEED   
4              0             0   

                                                text tweet_coord tweet_count  \
0  Robbie E Responds To Critics After Win Against...         NaN      110964   
1  ‰ÛÏIt felt like they were my friends and I was...         NaN        7471   
2  i absolutely adore when louis starts the songs...         NaN        5617   
3  Hi @JordanSpieth - Looking at the url - do you...         NaN        1693   
4  Watching Neighbours on Sky+ catching up with t...         NaN       31462   

    tweet_created      tweet_id   tweet_location               user_timezone  
0  10/26/15 12:40  6.587300e+17  main; @Kan1shk3                     Chennai  
1  10/26/15 12:40  6.587300e+17              NaN  Eastern Time (US & Canada)  
2  10/26/15 12:40  6.587300e+17           clcncl                    Belgrade  
3  10/26/15 12:40  6.587300e+17    Palo Alto, CA  Pacific Time (US & Canada)  
4  10/26/15 12:40  6.587300e+17              NaN                         NaN  

[5 rows x 26 columns]

Finding columns with missing data...

Column _last_judgment_at is missing 50 values.
Proportion of missing data is 0.0024937655860349127.

Column gender is missing 97 values.
Proportion of missing data is 0.00483790523690773.

Column gender:confidence is missing 26 values.
Proportion of missing data is 0.0012967581047381546.

Column description is missing 3744 values.
Proportion of missing data is 0.18673316708229426.

Column gender_gold is missing 20000 values.
Proportion of missing data is 0.9975062344139651.
Dropping column gender_gold...

Column profile_yn_gold is missing 20000 values.
Proportion of missing data is 0.9975062344139651.
Dropping column profile_yn_gold...

Column tweet_coord is missing 19891 values.
Proportion of missing data is 0.992069825436409.
Dropping column tweet_coord...

Column tweet_location is missing 7485 values.
Proportion of missing data is 0.3733167082294264.

Column user_timezone is missing 7798 values.
Proportion of missing data is 0.388927680798005.

Dataset Overview
<class 'pandas.core.frame.DataFrame'>
Index: 19953 entries, 0 to 20049
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               19953 non-null  int64  
 1   _golden                19953 non-null  bool   
 2   _unit_state            19953 non-null  object 
 3   _trusted_judgments     19953 non-null  int64  
 4   _last_judgment_at      19903 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      19953 non-null  float64
 7   profile_yn:confidence  19953 non-null  float64
 8   created                19953 non-null  object 
 9   description            16224 non-null  object 
 10  fav_number             19953 non-null  int64  
 11  link_color             19953 non-null  object 
 12  name                   19953 non-null  object 
 13  profileimage           19953 non-null  object 
 14  retweet_count          19953 non-null  int64  
 15  sidebar_color          19953 non-null  object 
 16  text                   19953 non-null  object 
 17  tweet_count            19953 non-null  int64  
 18  tweet_created          19953 non-null  object 
 19  tweet_id               19953 non-null  float64
 20  tweet_location         12510 non-null  object 
 21  user_timezone          12185 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(13)
memory usage: 3.4+ MB
None
    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  815719226    False   finalized                   3    10/26/15 23:24   
1  815719227    False   finalized                   3    10/26/15 23:30   
2  815719228    False   finalized                   3    10/26/15 23:33   
3  815719229    False   finalized                   3    10/26/15 23:10   
4  815719230    False   finalized                   3     10/27/15 1:15   

   gender  gender:confidence  profile_yn:confidence         created  \
0    male             1.0000                    1.0    12/5/13 1:48   
1    male             1.0000                    1.0   10/1/12 13:51   
2    male             0.6625                    1.0  11/28/14 11:30   
3    male             1.0000                    1.0   6/11/09 22:39   
4  female             1.0000                    1.0   4/16/14 13:23   

                                         description  ...            name  \
0                              i sing my own rhythm.  ...         sheezy0   
1  I'm the author of novels filled with family dr...  ...     DavdBurnett   
2                louis whining and squealing and all  ...  lwtprettylaugh   
3  Mobile guy.  49ers, Shazam, Google, Kleiner Pe...  ...     douggarland   
4  Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...  ...    WilfordGemma   

                                        profileimage retweet_count  \
0  https://pbs.twimg.com/profile_images/414342229...             0   
1  https://pbs.twimg.com/profile_images/539604221...             0   
2  https://pbs.twimg.com/profile_images/657330418...             1   
3  https://pbs.twimg.com/profile_images/259703936...             0   
4  https://pbs.twimg.com/profile_images/564094871...             0   

  sidebar_color                                               text  \
0        FFFFFF  Robbie E Responds To Critics After Win Against...   
1        C0DEED  ‰ÛÏIt felt like they were my friends and I was...   
2        C0DEED  i absolutely adore when louis starts the songs...   
3        C0DEED  Hi @JordanSpieth - Looking at the url - do you...   
4             0  Watching Neighbours on Sky+ catching up with t...   

  tweet_count   tweet_created      tweet_id   tweet_location  \
0      110964  10/26/15 12:40  6.587300e+17  main; @Kan1shk3   
1        7471  10/26/15 12:40  6.587300e+17              NaN   
2        5617  10/26/15 12:40  6.587300e+17           clcncl   
3        1693  10/26/15 12:40  6.587300e+17    Palo Alto, CA   
4       31462  10/26/15 12:40  6.587300e+17              NaN   

                user_timezone  
0                     Chennai  
1  Eastern Time (US & Canada)  
2                    Belgrade  
3  Pacific Time (US & Canada)  
4                         NaN  

[5 rows x 22 columns]

---- EXPLORATORY DATA ANALYSIS (EDA) ----
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Number of NaN values in 'link_color': 0.
Number of NaN values in 'sidebar_color': 0.
Number of link color is 2986.
Number of side bar color is 559.
Number of NaN values in 'link_color': 0
Number of NaN values in 'sidebar_color': 0
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Unique Values in 'gender'
[0 1 2]
<class 'pandas.core.frame.DataFrame'>
Index: 18836 entries, 0 to 20049
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  18836 non-null  int64  
 1   gender:confidence       18836 non-null  float64
 2   description             15522 non-null  object 
 3   favorites_per_day       18836 non-null  float64
 4   link_color              18836 non-null  object 
 5   retweets_per_day        18836 non-null  float64
 6   sidebar_color           18836 non-null  object 
 7   text                    18836 non-null  object 
 8   tweets_per_day          18836 non-null  float64
 9   user_timezone           18836 non-null  object 
 10  tweet_location          18836 non-null  object 
 11  profile_created_year    18836 non-null  int32  
 12  tweet_created_year      18836 non-null  int32  
 13  tweet_location_encoded  18836 non-null  float64
 14  user_timezone_encoded   18836 non-null  float64
dtypes: float64(6), int32(2), int64(1), object(6)
memory usage: 2.2+ MB
None
No description has been provided for this image
All Remaining Features
['gender', 'gender:confidence', 'description', 'favorites_per_day', 'retweets_per_day', 'text', 'tweets_per_day', 'profile_created_year', 'tweet_created_year', 'tweet_location_encoded', 'user_timezone_encoded']

Dataset Overview After PreProcessing
<class 'pandas.core.frame.DataFrame'>
Index: 18836 entries, 0 to 20049
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  18836 non-null  int64  
 1   gender:confidence       18836 non-null  float64
 2   description             15522 non-null  object 
 3   favorites_per_day       18836 non-null  float64
 4   retweets_per_day        18836 non-null  float64
 5   text                    18836 non-null  object 
 6   tweets_per_day          18836 non-null  float64
 7   profile_created_year    18836 non-null  int32  
 8   tweet_created_year      18836 non-null  int32  
 9   tweet_location_encoded  18836 non-null  float64
 10  user_timezone_encoded   18836 non-null  float64
dtypes: float64(6), int32(2), int64(1), object(2)
memory usage: 1.6+ MB
None

---- NLP Processing ----
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
                                             description  \
0                                  i sing my own rhythm.   
1      I'm the author of novels filled with family dr...   
2                    louis whining and squealing and all   
3      Mobile guy.  49ers, Shazam, Google, Kleiner Pe...   
4      Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...   
...                                                  ...   
20045                                               (rp)   
20046  Whatever you like, it's not a problem at all. ...   
20047  #TeamBarcelona ..You look lost so you should f...   
20048  Anti-statist; I homeschool my kids. Aspiring t...   
20049                     Teamwork makes the dream work.   

                                                    text  
0      Robbie E Responds To Critics After Win Against...  
1      ‰ÛÏIt felt like they were my friends and I was...  
2      i absolutely adore when louis starts the songs...  
3      Hi @JordanSpieth - Looking at the url - do you...  
4      Watching Neighbours on Sky+ catching up with t...  
...                                                  ...  
20045  @lookupondeath ...Fine, and I'll drink tea too...  
20046  Greg Hardy you a good player and all but don't...  
20047  You can miss people and still never want to se...  
20048  @bitemyapp i had noticed your tendency to pee ...  
20049  I think for my APUSH creative project I'm goin...  

[18836 rows x 2 columns]
                                         description  \
0                              i sing my own rhythm.   
1  I'm the author of novels filled with family dr...   
2                louis whining and squealing and all   
3  Mobile guy.  49ers, Shazam, Google, Kleiner Pe...   
4  Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...   

                                 cleaned_description  \
0                                        sing rhythm   
1        im author novel filled family drama romance   
2                            louis whining squealing   
3  mobile guy er shazam google kleiner perkins ya...   
4  ricky wilson best frontmankaiser chief best ba...   

                                                text  \
0  Robbie E Responds To Critics After Win Against...   
1  ‰ÛÏIt felt like they were my friends and I was...   
2  i absolutely adore when louis starts the songs...   
3  Hi @JordanSpieth - Looking at the url - do you...   
4  Watching Neighbours on Sky+ catching up with t...   

                                        cleaned_text  
0  robbie e responds critic win eddie edward worl...  
1  felt like friend living story httpstcoarngeyhn...  
2  absolutely adore louis start song hit hard fee...  
3  hi jordanspieth looking url use ifttt dont typ...  
4    watching neighbour sky catching neighbs xxx xxx  

Applying TF-IDF Vectorisation...

CLUSTERING¶

In [3]:
print()
print()
print('---- CLUSTERING MODELS ----')

print()
print("=" * 50)
print('EXP 1: USING ALL SELECTED FEATURES')
print("=" * 50)

sil_ex1 = []
cal_ex1 = []
# Drop the gender and categorical features before normalise

df_cat = df_cate.copy()
# Drop gender feature and categorical features
df_preprocessed = df_preprocessed.drop(columns=df_cat.columns)
df_finalised = df_preprocessed.drop(columns=['gender', 'gender:confidence'])

# Normalise every existing feature
scaler = StandardScaler()
df_finalised = pd.DataFrame(scaler.fit_transform(df_finalised), columns=df_finalised.columns)

df_finalised = pd.concat([df_finalised, df_cat, df_gender], axis=1)
# find the rows that contained NaN values and drop them
df_finalised = df_finalised.dropna()

data_exp1 = df_finalised
df_ex1 = df_finalised.drop(columns=['gender', 'gender:confidence'])


# Check the preprocessed dataset in the present
print()
print('Dataset for Exp 1')
print(df_ex1.info())
print()

# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_vis = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(df_ex1)
umap_plot = umap_vis.fit_transform(df_ex1)
print(umap_embedding.shape)

# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding)
kmeans_clustering.tune_hyperparameters()
kmeans_exp1 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_plot, 'All feature types')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex1.append(kmeans_clustering.silhoutte())
cal_ex1.append(kmeans_clustering.calinski())

k_retriever = ClusteringDataRetriever(data_exp1, k_labels)
df_with_labels = k_retriever.get_data_with_labels()

print()
print('Dataset with Labels from KMeans in Exp 1')
print(df_with_labels.head())
for label in np.unique(k_labels):
    print()
    print(f'Records found in cluster {label} from KMeans in Exp 1')
    print(k_retriever.get_cluster_data(label))
    print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')

# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding)
dbscan_clustering.tune_hyperparameters()
dbscan_exp1 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_plot, 'All feature types')
db_labels = dbscan_clustering.output_label()
sil_ex1.append(dbscan_clustering.silhoutte())
cal_ex1.append(dbscan_clustering.calinski())

# Initialize the class to retrieve data
db_retriever = ClusteringDataRetriever(data_exp1, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 1')
print(df_with_labels.head())
for label in np.unique(db_labels):
    if label != -1:
        print()
        print(f'Records found in cluster {label} from DBSCAN in Exp 1')
        print(db_retriever.get_cluster_data(label))
        print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())

print()
print("=" * 50)
print('EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES')
print("=" * 50)

sil_ex2 = []
cal_ex2 = []

# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_num.shape[0], chunk_size):
    df_num.iloc[i:i + chunk_size] = scaler.fit_transform(df_num.iloc[i:i + chunk_size])
df_no_text = pd.concat([df_num, df_cate, df_gender], axis=1)
print()
print("Data with Only Numerical and Categorical Features")
print(df_no_text.info())
print()

df_no_text = df_no_text.dropna()
df_no_text_wg = df_no_text.copy()
print('Removing NaN values...')

# Drop gender feature before clustering
data_exp2 = df_no_text.drop(columns=['gender', 'gender:confidence'])
print('Dropping gender and gender:confidence...')

# Check No. of records after drop NaN values
print()
print("Dataset for Exp 2")
print(data_exp2.info())
print()
print(data_exp2.head())

# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(data_exp2)
print(umap_embedding.shape)
# umap_embedding = umap_embedding.astype(np.float32)

# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(data_exp2)
kmeans_clustering.tune_hyperparameters()
kmeans_exp2 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Numerical and categorical features')  # Visualize clusters
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex2.append(kmeans_clustering.silhoutte())
cal_ex2.append(kmeans_clustering.calinski())

k_retriever = ClusteringDataRetriever(df_no_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 2')
print(df_with_labels.head())
for label in np.unique(k_labels):
    print()
    print(f'Records found in cluster {label} from KMeans in Exp 2')
    print(k_retriever.get_cluster_data(label))
    print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')

# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(data_exp2)
dbscan_clustering.tune_hyperparameters()  # Tune DBSCAN hyperparameters
dbscan_exp2 = dbscan_clustering.fit_model()  # Fit the DBSCAN model
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'numerical and categorical features')  # Plot 3D noise points and valid clusters
db_labels = dbscan_clustering.output_label()
sil_ex2.append(dbscan_clustering.silhoutte())
cal_ex2.append(dbscan_clustering.calinski())


db_retriever = ClusteringDataRetriever(df_no_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 2')
print(df_with_labels.head())
for label in np.unique(db_labels):
    if label != -1:
        print()
        print(f'Records found in cluster {label} from DBSCAN in Exp 2')
        print(db_retriever.get_cluster_data(label))
        print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())

print()
print("=" * 50)
print('EXP 3: USING ONLY TEXT FEATURES')
print("=" * 50)

sil_ex3 = []
cal_ex3 = []
# Merge with main dataframe
df_with_text = pd.concat([tfidf_desc_df, tfidf_text_df], axis=1)
# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_with_text.shape[0], chunk_size):
    df_with_text.iloc[i:i + chunk_size] = scaler.fit_transform(df_with_text.iloc[i:i + chunk_size])

df_with_text_wg = pd.concat([df_with_text, df_gender], axis=1)
# Drop NaN values before clustering
df_with_text_wg = df_with_text_wg.dropna()
data_exp3 = df_with_text_wg.drop(columns=['gender', 'gender:confidence'])

# Drop the gender features before clustering

print('Dataset for Exp 3')
print(data_exp3.info())
print()
print(data_exp3.head())

print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_embedding_t = umap_model.fit_transform(data_exp3)
umap_embedding = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42).fit_transform(data_exp3)

# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding_t)
kmeans_clustering.tune_hyperparameters()
kmeans_exp3 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Text features')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex3.append(kmeans_clustering.silhoutte())
cal_ex3.append(kmeans_clustering.calinski())

k_retriever = ClusteringDataRetriever(df_with_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 3')
print(df_with_labels.head())
for label in np.unique(k_labels):
    print()
    print(f'Records found in cluster {label} from KMeans in Exp 3')
    print(k_retriever.get_cluster_data(label))
    print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')

# DBSCANClustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding_t)
dbscan_clustering.tune_hyperparameters()
dbscan_exp3 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'Text features')
db_labels = dbscan_clustering.output_label()
sil_ex3.append(dbscan_clustering.silhoutte())
cal_ex3.append(dbscan_clustering.calinski())

db_retriever = ClusteringDataRetriever(df_with_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 3')
print(df_with_labels.head())
for label in np.unique(db_labels):
    if label != -1:
        print()
        print(f'Records found in cluster {label} from DBSCAN in Exp 3')
        print(db_retriever.get_cluster_data(label))
        print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())

print()
print('---- VISUALIZE THE METRIC EVALUATION ----')

# Metric functions
model_names = ['KMeans', 'DBSCAN']

sil_scores = [sil_ex1, sil_ex2, sil_ex3]
cal_scores = [cal_ex1, cal_ex2, cal_ex3]

plot_silhouette_bar_across_experiments(model_names, sil_scores)
visualize_ch_index_across_experiments(model_names, cal_scores)

---- CLUSTERING MODELS ----

==================================================
EXP 1: USING ALL SELECTED FEATURES
==================================================

Dataset for Exp 1
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Columns: 3013 entries, favorites_per_day to user_timezone_encoded
dtypes: float64(3013)
memory usage: 407.1 MB
None

Applying UMAP for dim reduction...
[I 2024-09-20 16:20:19,495] A new study created in memory with name: no-name-f656c2a4-43f7-454b-87f6-e1b8bbb5ba19
(17702, 2)

Performing K-Means Clustering...
[I 2024-09-20 16:20:24,756] Trial 0 finished with value: 0.44721555709838867 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:29,547] Trial 1 finished with value: 0.40816256403923035 and parameters: {'n_clusters': 9, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:34,470] Trial 2 finished with value: 0.43370768427848816 and parameters: {'n_clusters': 10, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:39,242] Trial 3 finished with value: 0.4106582999229431 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:44,060] Trial 4 finished with value: 0.3901534974575043 and parameters: {'n_clusters': 8, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:48,864] Trial 5 finished with value: 0.4233592748641968 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:53,940] Trial 6 finished with value: 0.44721555709838867 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:58,788] Trial 7 finished with value: 0.3933861553668976 and parameters: {'n_clusters': 7, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:21:03,521] Trial 8 finished with value: 0.4233592748641968 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:21:08,516] Trial 9 finished with value: 0.43466559052467346 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:21:13,893] Trial 10 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
[I 2024-09-20 16:21:19,213] Trial 11 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
[I 2024-09-20 16:21:24,493] Trial 12 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
[I 2024-09-20 16:21:29,872] Trial 13 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
[I 2024-09-20 16:21:35,112] Trial 14 finished with value: 0.43466559052467346 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
Best params: {'n_clusters': 2, 'init': 'random'}
No description has been provided for this image
No description has been provided for this image
The Silhouette score is 0.7726734280586243
The Callinski index is 20992.505859375

Dataset with Labels from KMeans in Exp 1
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from KMeans in Exp 1
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[16379 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5439
No. of records with gender 1 in cluster 0 is 5773
No. of records with gender 2 in cluster 0 is 5167

Records found in cluster 1 from KMeans in Exp 1
[I 2024-09-20 16:21:42,026] A new study created in memory with name: no-name-ad1593d8-66bc-4c0f-9d74-f56f96710d50
       gender  gender:confidence  Cluster_Label
7         0.0             1.0000              1
33        0.0             1.0000              1
49        2.0             1.0000              1
56        1.0             0.6684              1
58        0.0             1.0000              1
...       ...                ...            ...
18738     2.0             1.0000              1
18753     0.0             0.6678              1
18759     0.0             0.6386              1
18789     0.0             1.0000              1
18803     1.0             1.0000              1

[1323 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 404
No. of records with gender 1 in cluster 1 is 428
No. of records with gender 2 in cluster 1 is 491

Performing DBSCAN Clustering...
[I 2024-09-20 16:21:48,312] Trial 0 finished with value: 0.3155621588230133 and parameters: {'eps': 1.5913067486466435, 'min_samples': 6}. Best is trial 0 with value: 0.3155621588230133.
[I 2024-09-20 16:21:54,152] Trial 1 finished with value: 0.24721910059452057 and parameters: {'eps': 1.0376530894652887, 'min_samples': 18}. Best is trial 0 with value: 0.3155621588230133.
[I 2024-09-20 16:22:00,118] Trial 2 finished with value: 0.2345193773508072 and parameters: {'eps': 1.08924832783019, 'min_samples': 7}. Best is trial 0 with value: 0.3155621588230133.
[I 2024-09-20 16:22:06,672] Trial 3 finished with value: 0.3255881667137146 and parameters: {'eps': 1.9565357155432446, 'min_samples': 4}. Best is trial 3 with value: 0.3255881667137146.
[I 2024-09-20 16:22:13,131] Trial 4 finished with value: 0.32468611001968384 and parameters: {'eps': 1.9655521749248066, 'min_samples': 17}. Best is trial 3 with value: 0.3255881667137146.
[I 2024-09-20 16:22:19,013] Trial 5 finished with value: 0.26063308119773865 and parameters: {'eps': 0.9674339846692939, 'min_samples': 14}. Best is trial 3 with value: 0.3255881667137146.
[I 2024-09-20 16:22:25,335] Trial 6 finished with value: 0.32788148522377014 and parameters: {'eps': 1.7693479090782473, 'min_samples': 9}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:31,052] Trial 7 finished with value: 0.24578818678855896 and parameters: {'eps': 0.7826789736238435, 'min_samples': 19}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:36,540] Trial 8 finished with value: -0.14658115804195404 and parameters: {'eps': 0.34017243144029763, 'min_samples': 4}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:42,106] Trial 9 finished with value: 0.0954396203160286 and parameters: {'eps': 0.490850883967341, 'min_samples': 20}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:48,410] Trial 10 finished with value: 0.24460361897945404 and parameters: {'eps': 1.4333032533727734, 'min_samples': 10}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:54,901] Trial 11 finished with value: 0.32556405663490295 and parameters: {'eps': 1.9767937461657843, 'min_samples': 10}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:23:01,305] Trial 12 finished with value: 0.33137843012809753 and parameters: {'eps': 1.6198251417047203, 'min_samples': 3}. Best is trial 12 with value: 0.33137843012809753.
[I 2024-09-20 16:23:07,690] Trial 13 finished with value: 0.32246026396751404 and parameters: {'eps': 1.528098496701475, 'min_samples': 13}. Best is trial 12 with value: 0.33137843012809753.
[I 2024-09-20 16:23:14,050] Trial 14 finished with value: 0.3302082121372223 and parameters: {'eps': 1.6778064207338765, 'min_samples': 8}. Best is trial 12 with value: 0.33137843012809753.
Found best params: {'eps': 1.6198251417047203, 'min_samples': 3}
No description has been provided for this image
The Silhouette score is 0.33137843012809753
The Callinski index is 1748.1387939453125

Dataset with Labels from DBSCAN in Exp 1
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[15976 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5308
No. of records with gender 1 in cluster 0 is 5667
No. of records with gender 2 in cluster 0 is 5001

Records found in cluster 1 from DBSCAN in Exp 1
     gender  gender:confidence  Cluster_Label
7       0.0             1.0000              1
33      0.0             1.0000              1
49      2.0             1.0000              1
56      1.0             0.6684              1
58      0.0             1.0000              1
132     1.0             1.0000              1
153     2.0             1.0000              1
191     2.0             0.6804              1
192     0.0             1.0000              1
199     1.0             1.0000              1
231     1.0             1.0000              1
243     0.0             1.0000              1
250     2.0             1.0000              1
288     1.0             0.6494              1
308     1.0             0.6752              1
390     1.0             0.6786              1
460     2.0             0.6708              1
503     0.0             1.0000              1
No. of records with gender 0 in cluster 1 is 6
No. of records with gender 1 in cluster 1 is 7
No. of records with gender 2 in cluster 1 is 5

Records found in cluster 2 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
282      1.0             1.0000              2
2135     2.0             1.0000              2
2929     0.0             1.0000              2
3229     0.0             1.0000              2
3770     0.0             1.0000              2
...      ...                ...            ...
9194     2.0             1.0000              2
9195     1.0             1.0000              2
9220     2.0             1.0000              2
9283     2.0             0.6659              2
9293     0.0             1.0000              2

[180 rows x 3 columns]
No. of records with gender 0 in cluster 2 is 55
No. of records with gender 1 in cluster 2 is 48
No. of records with gender 2 in cluster 2 is 77

Records found in cluster 3 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
302       1.0             1.0000              3
1402      0.0             0.3539              3
2347      2.0             0.6757              3
2964      1.0             1.0000              3
4898      0.0             1.0000              3
5276      2.0             0.6632              3
5379      0.0             1.0000              3
5536      2.0             0.6943              3
5949      1.0             0.6848              3
6017      1.0             0.3486              3
6245      2.0             1.0000              3
6298      0.0             1.0000              3
6374      2.0             1.0000              3
6466      2.0             1.0000              3
6882      0.0             0.6879              3
6904      2.0             0.6842              3
7434      2.0             1.0000              3
7625      0.0             1.0000              3
7662      0.0             1.0000              3
7745      1.0             1.0000              3
7811      2.0             0.6341              3
7910      2.0             1.0000              3
8159      2.0             1.0000              3
8331      2.0             0.6716              3
8340      2.0             0.6707              3
8401      0.0             0.6732              3
8487      0.0             0.6806              3
8489      0.0             1.0000              3
8505      1.0             1.0000              3
8535      2.0             1.0000              3
8583      0.0             1.0000              3
8622      0.0             0.6634              3
8623      2.0             0.6778              3
8647      2.0             1.0000              3
8690      2.0             1.0000              3
8764      2.0             0.6674              3
8784      2.0             1.0000              3
8859      2.0             1.0000              3
8925      0.0             1.0000              3
8930      2.0             1.0000              3
8971      1.0             1.0000              3
9001      1.0             1.0000              3
9055      1.0             1.0000              3
9076      2.0             1.0000              3
9089      1.0             1.0000              3
9118      2.0             0.6712              3
9166      2.0             1.0000              3
9280      1.0             1.0000              3
14662     2.0             1.0000              3
15096     2.0             0.3410              3
15533     1.0             0.6619              3
15979     0.0             1.0000              3
16380     0.0             1.0000              3
16802     2.0             0.3531              3
17226     1.0             1.0000              3
17617     1.0             1.0000              3
18272     0.0             0.6686              3
No. of records with gender 0 in cluster 3 is 16
No. of records with gender 1 in cluster 3 is 14
No. of records with gender 2 in cluster 3 is 27

Records found in cluster 4 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
426       2.0             1.0000              4
432       0.0             1.0000              4
1992      0.0             1.0000              4
2776      0.0             1.0000              4
3755      2.0             1.0000              4
3769      2.0             0.6497              4
3784      2.0             1.0000              4
4418      1.0             1.0000              4
5352      1.0             1.0000              4
9341      2.0             1.0000              4
9379      0.0             1.0000              4
10138     1.0             1.0000              4
10451     0.0             0.6824              4
13349     0.0             1.0000              4
14425     0.0             0.6628              4
14668     2.0             1.0000              4
16449     1.0             1.0000              4
16881     1.0             0.6733              4
No. of records with gender 0 in cluster 4 is 7
No. of records with gender 1 in cluster 4 is 5
No. of records with gender 2 in cluster 4 is 6

Records found in cluster 5 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
431      0.0             0.6631              5
4374     2.0             1.0000              5
4456     1.0             1.0000              5
4653     2.0             1.0000              5
5008     2.0             1.0000              5
5044     2.0             1.0000              5
5220     2.0             0.6650              5
5533     2.0             1.0000              5
5580     0.0             1.0000              5
5596     2.0             1.0000              5
5662     1.0             1.0000              5
5749     2.0             1.0000              5
5988     2.0             1.0000              5
6669     0.0             1.0000              5
7261     0.0             1.0000              5
7702     2.0             0.7012              5
7771     2.0             1.0000              5
7898     2.0             1.0000              5
8120     1.0             1.0000              5
8248     1.0             1.0000              5
8295     2.0             0.6579              5
8360     2.0             0.6854              5
8984     2.0             0.6890              5
9100     0.0             1.0000              5
No. of records with gender 0 in cluster 5 is 5
No. of records with gender 1 in cluster 5 is 4
No. of records with gender 2 in cluster 5 is 15

Records found in cluster 6 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
502      0.0             1.0000              6
578      1.0             1.0000              6
644      0.0             1.0000              6
771      0.0             1.0000              6
963      2.0             1.0000              6
1433     1.0             1.0000              6
1881     0.0             0.6691              6
2762     2.0             0.6670              6
2903     1.0             0.6763              6
3308     0.0             0.3364              6
3353     0.0             1.0000              6
3681     2.0             1.0000              6
3830     0.0             1.0000              6
4305     1.0             1.0000              6
5040     0.0             1.0000              6
5479     0.0             0.6857              6
5742     0.0             1.0000              6
6460     2.0             1.0000              6
6862     1.0             1.0000              6
8397     2.0             0.6634              6
8516     2.0             0.6839              6
8918     2.0             1.0000              6
No. of records with gender 0 in cluster 6 is 10
No. of records with gender 1 in cluster 6 is 5
No. of records with gender 2 in cluster 6 is 7

Records found in cluster 7 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
513       2.0             1.0000              7
514       0.0             1.0000              7
520       0.0             0.3458              7
553       0.0             1.0000              7
554       0.0             0.3431              7
555       0.0             1.0000              7
556       0.0             1.0000              7
557       0.0             1.0000              7
560       1.0             1.0000              7
564       1.0             1.0000              7
565       1.0             1.0000              7
566       2.0             0.6829              7
576       0.0             1.0000              7
577       2.0             1.0000              7
1102      1.0             0.6777              7
2660      0.0             0.3478              7
4100      2.0             1.0000              7
4344      2.0             1.0000              7
4370      0.0             1.0000              7
4426      2.0             0.6838              7
4444      0.0             0.6422              7
4489      1.0             1.0000              7
4643      0.0             1.0000              7
4781      2.0             0.6475              7
4896      2.0             1.0000              7
4950      1.0             1.0000              7
4967      0.0             1.0000              7
5030      0.0             1.0000              7
5176      1.0             1.0000              7
5256      2.0             0.6475              7
5355      0.0             1.0000              7
5356      0.0             1.0000              7
5427      1.0             1.0000              7
5448      2.0             0.6654              7
7995      2.0             1.0000              7
8037      0.0             0.6374              7
8233      0.0             1.0000              7
10824     0.0             1.0000              7
No. of records with gender 0 in cluster 7 is 19
No. of records with gender 1 in cluster 7 is 8
No. of records with gender 2 in cluster 7 is 11

Records found in cluster 8 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
548       2.0             0.6672              8
4512      0.0             1.0000              8
7351      2.0             0.6667              8
7473      1.0             1.0000              8
10589     0.0             0.6623              8
12139     0.0             1.0000              8
12845     0.0             1.0000              8
12988     2.0             0.6557              8
14702     2.0             1.0000              8
17727     0.0             1.0000              8
No. of records with gender 0 in cluster 8 is 5
No. of records with gender 1 in cluster 8 is 1
No. of records with gender 2 in cluster 8 is 4

Records found in cluster 9 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
570       2.0             0.6616              9
3168      1.0             1.0000              9
11317     2.0             1.0000              9
11909     1.0             1.0000              9
14448     0.0             1.0000              9
14613     0.0             1.0000              9
14791     1.0             1.0000              9
15015     1.0             1.0000              9
15216     0.0             1.0000              9
No. of records with gender 0 in cluster 9 is 3
No. of records with gender 1 in cluster 9 is 4
No. of records with gender 2 in cluster 9 is 2

Records found in cluster 10 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
575      0.0             1.0000             10
1308     0.0             0.6479             10
2033     1.0             1.0000             10
2308     1.0             0.6774             10
3898     0.0             1.0000             10
5454     2.0             0.6774             10
5539     1.0             1.0000             10
5628     2.0             1.0000             10
5825     1.0             1.0000             10
5847     2.0             0.6717             10
6012     0.0             1.0000             10
6048     2.0             0.6796             10
6108     0.0             1.0000             10
6114     1.0             0.6620             10
6335     2.0             1.0000             10
6382     2.0             0.6842             10
6417     2.0             1.0000             10
7843     2.0             1.0000             10
8181     0.0             1.0000             10
8355     2.0             0.6778             10
8738     0.0             1.0000             10
No. of records with gender 0 in cluster 10 is 7
No. of records with gender 1 in cluster 10 is 5
No. of records with gender 2 in cluster 10 is 9

Records found in cluster 11 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
599       1.0             1.0000             11
1268      2.0             1.0000             11
2138      1.0             1.0000             11
2145      0.0             1.0000             11
2146      1.0             1.0000             11
2147      1.0             1.0000             11
2148      1.0             0.3576             11
2156      0.0             1.0000             11
2166      1.0             1.0000             11
2168      0.0             0.6825             11
2169      1.0             1.0000             11
2171      1.0             1.0000             11
2172      0.0             1.0000             11
2182      2.0             1.0000             11
2185      0.0             1.0000             11
2186      0.0             0.3403             11
2187      1.0             1.0000             11
2188      2.0             0.6812             11
2189      0.0             0.6582             11
2191      0.0             1.0000             11
2194      1.0             1.0000             11
2196      1.0             1.0000             11
2204      1.0             0.6587             11
2205      0.0             0.6685             11
2206      1.0             0.6551             11
2207      1.0             1.0000             11
2210      1.0             1.0000             11
2216      1.0             0.6896             11
2217      1.0             0.6832             11
2220      1.0             1.0000             11
2223      2.0             1.0000             11
2682      1.0             0.6473             11
2860      0.0             1.0000             11
2862      0.0             1.0000             11
2863      0.0             0.3370             11
2866      2.0             0.6497             11
2870      2.0             0.6368             11
2872      0.0             0.6855             11
2873      1.0             0.6940             11
3360      1.0             1.0000             11
5548      2.0             1.0000             11
6616      1.0             1.0000             11
7610      2.0             0.6578             11
8509      2.0             0.6731             11
9305      2.0             0.6606             11
10714     0.0             1.0000             11
12324     1.0             1.0000             11
14170     1.0             1.0000             11
15223     0.0             1.0000             11
16735     0.0             0.6563             11
No. of records with gender 0 in cluster 11 is 16
No. of records with gender 1 in cluster 11 is 24
No. of records with gender 2 in cluster 11 is 10

Records found in cluster 12 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
822       0.0             0.6473             12
1536      2.0             0.6591             12
11119     1.0             1.0000             12
11627     2.0             0.6796             12
11727     2.0             1.0000             12
12333     1.0             1.0000             12
12992     0.0             1.0000             12
13486     2.0             1.0000             12
13980     0.0             1.0000             12
14046     0.0             1.0000             12
14958     2.0             1.0000             12
15597     1.0             0.3362             12
16706     0.0             1.0000             12
17090     0.0             1.0000             12
17186     1.0             1.0000             12
17599     0.0             0.6654             12
18270     0.0             1.0000             12
No. of records with gender 0 in cluster 12 is 8
No. of records with gender 1 in cluster 12 is 4
No. of records with gender 2 in cluster 12 is 5

Records found in cluster 13 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
941       2.0             0.6582             13
9955      2.0             1.0000             13
10078     0.0             1.0000             13
10115     0.0             1.0000             13
10194     1.0             1.0000             13
10234     2.0             0.3388             13
10298     0.0             0.3387             13
10354     2.0             0.6852             13
10391     1.0             1.0000             13
15703     1.0             1.0000             13
17106     0.0             1.0000             13
17709     0.0             1.0000             13
No. of records with gender 0 in cluster 13 is 5
No. of records with gender 1 in cluster 13 is 3
No. of records with gender 2 in cluster 13 is 4

Records found in cluster 14 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
1040     1.0             1.0000             14
1045     2.0             0.6789             14
1049     1.0             1.0000             14
1051     2.0             1.0000             14
1052     1.0             1.0000             14
1054     1.0             1.0000             14
1061     0.0             1.0000             14
1064     1.0             0.6498             14
1065     0.0             1.0000             14
No. of records with gender 0 in cluster 14 is 2
No. of records with gender 1 in cluster 14 is 5
No. of records with gender 2 in cluster 14 is 2

Records found in cluster 15 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
1108      1.0             0.6880             15
9382      2.0             1.0000             15
9398      1.0             1.0000             15
9475      0.0             1.0000             15
9496      0.0             1.0000             15
...       ...                ...            ...
15207     1.0             1.0000             15
15391     2.0             1.0000             15
15439     2.0             1.0000             15
15622     2.0             1.0000             15
18398     0.0             0.6709             15

[70 rows x 3 columns]
No. of records with gender 0 in cluster 15 is 19
No. of records with gender 1 in cluster 15 is 25
No. of records with gender 2 in cluster 15 is 26

Records found in cluster 16 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
1203      1.0             1.0000             16
1240      1.0             0.6889             16
2115      0.0             1.0000             16
2381      0.0             1.0000             16
3988      2.0             1.0000             16
5994      2.0             0.6611             16
7988      1.0             0.6734             16
8071      1.0             1.0000             16
10735     0.0             1.0000             16
10738     0.0             1.0000             16
11076     2.0             1.0000             16
11179     2.0             1.0000             16
11484     1.0             1.0000             16
11648     1.0             1.0000             16
11746     0.0             1.0000             16
12054     1.0             1.0000             16
13078     0.0             1.0000             16
14056     2.0             1.0000             16
15064     0.0             0.6534             16
15751     1.0             1.0000             16
15757     1.0             1.0000             16
16465     0.0             1.0000             16
16868     1.0             1.0000             16
17448     0.0             1.0000             16
18208     0.0             1.0000             16
18753     0.0             0.6678             16
No. of records with gender 0 in cluster 16 is 11
No. of records with gender 1 in cluster 16 is 10
No. of records with gender 2 in cluster 16 is 5

Records found in cluster 17 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
1273     0.0             1.0000             17
1605     2.0             1.0000             17
1761     2.0             1.0000             17
1845     1.0             1.0000             17
1987     1.0             1.0000             17
2274     0.0             1.0000             17
3961     0.0             1.0000             17
4092     0.0             0.3411             17
4424     2.0             1.0000             17
5218     2.0             1.0000             17
5336     1.0             1.0000             17
5445     0.0             1.0000             17
5927     2.0             0.6721             17
5980     0.0             1.0000             17
6262     2.0             1.0000             17
6289     1.0             1.0000             17
7003     1.0             1.0000             17
7118     2.0             1.0000             17
7431     1.0             1.0000             17
7540     0.0             0.6859             17
7791     1.0             1.0000             17
8142     2.0             1.0000             17
8601     2.0             0.6700             17
8693     0.0             1.0000             17
9023     1.0             0.6654             17
9265     1.0             1.0000             17
No. of records with gender 0 in cluster 17 is 8
No. of records with gender 1 in cluster 17 is 9
No. of records with gender 2 in cluster 17 is 9

Records found in cluster 18 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
1367     1.0             1.0000             18
2382     1.0             1.0000             18
2897     2.0             1.0000             18
3526     1.0             1.0000             18
4051     2.0             1.0000             18
6140     2.0             0.6679             18
7107     2.0             0.6865             18
7913     2.0             1.0000             18
8836     0.0             0.6645             18
No. of records with gender 0 in cluster 18 is 1
No. of records with gender 1 in cluster 18 is 3
No. of records with gender 2 in cluster 18 is 5

Records found in cluster 19 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
1544     0.0             1.0000             19
2154     1.0             0.6561             19
3341     1.0             1.0000             19
3938     2.0             0.6545             19
4650     2.0             0.3571             19
5424     0.0             1.0000             19
6313     1.0             1.0000             19
8798     1.0             1.0000             19
No. of records with gender 0 in cluster 19 is 2
No. of records with gender 1 in cluster 19 is 4
No. of records with gender 2 in cluster 19 is 2

Records found in cluster 20 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
1844      2.0             1.0000             20
4712      0.0             1.0000             20
5611      2.0             0.6856             20
6066      2.0             0.6668             20
6133      0.0             0.6655             20
6204      0.0             1.0000             20
6291      2.0             1.0000             20
6299      0.0             0.3604             20
6478      2.0             0.6611             20
6668      0.0             1.0000             20
6786      2.0             0.6694             20
7058      1.0             1.0000             20
7102      0.0             1.0000             20
7130      2.0             1.0000             20
7158      1.0             1.0000             20
7176      1.0             1.0000             20
7210      0.0             0.6617             20
7228      2.0             0.6766             20
7259      0.0             1.0000             20
7300      1.0             1.0000             20
7304      1.0             1.0000             20
7332      2.0             0.6573             20
7417      1.0             1.0000             20
7441      1.0             1.0000             20
7502      1.0             0.6617             20
7507      0.0             0.6848             20
7629      2.0             1.0000             20
7697      1.0             1.0000             20
7738      2.0             1.0000             20
7751      2.0             1.0000             20
7759      2.0             1.0000             20
7830      1.0             1.0000             20
7908      2.0             1.0000             20
7975      0.0             1.0000             20
7977      2.0             0.6739             20
7980      2.0             1.0000             20
7987      0.0             1.0000             20
8165      0.0             1.0000             20
8236      0.0             1.0000             20
8264      0.0             1.0000             20
8333      2.0             1.0000             20
8884      1.0             0.6612             20
8947      2.0             1.0000             20
8951      0.0             0.6752             20
9028      1.0             0.6849             20
9225      2.0             1.0000             20
9249      1.0             0.3542             20
11400     1.0             1.0000             20
No. of records with gender 0 in cluster 20 is 15
No. of records with gender 1 in cluster 20 is 14
No. of records with gender 2 in cluster 20 is 19

Records found in cluster 21 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
2445      1.0             1.0000             21
4210      2.0             1.0000             21
4595      1.0             1.0000             21
4621      1.0             1.0000             21
4685      2.0             1.0000             21
...       ...                ...            ...
15313     2.0             1.0000             21
15316     2.0             1.0000             21
15322     0.0             1.0000             21
15324     2.0             0.6344             21
15338     1.0             0.6791             21

[124 rows x 3 columns]
No. of records with gender 0 in cluster 21 is 28
No. of records with gender 1 in cluster 21 is 34
No. of records with gender 2 in cluster 21 is 62

Records found in cluster 22 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
3385     1.0             1.0000             22
3386     1.0             0.6628             22
3388     2.0             1.0000             22
3391     0.0             0.6612             22
3393     1.0             1.0000             22
3394     1.0             1.0000             22
3396     1.0             1.0000             22
3397     0.0             1.0000             22
3398     2.0             1.0000             22
3400     1.0             0.6727             22
3401     2.0             1.0000             22
3402     0.0             1.0000             22
3406     0.0             0.6819             22
3407     1.0             1.0000             22
3411     0.0             1.0000             22
3412     1.0             1.0000             22
3413     1.0             0.7023             22
No. of records with gender 0 in cluster 22 is 5
No. of records with gender 1 in cluster 22 is 9
No. of records with gender 2 in cluster 22 is 3

Records found in cluster 23 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
3581     0.0             1.0000             23
3705     2.0             0.6581             23
3809     2.0             1.0000             23
3906     1.0             0.6422             23
4041     0.0             1.0000             23
4108     2.0             1.0000             23
4111     2.0             1.0000             23
4113     1.0             1.0000             23
4114     1.0             1.0000             23
4116     1.0             1.0000             23
4117     0.0             1.0000             23
4121     0.0             1.0000             23
4134     0.0             0.6692             23
4135     0.0             0.3619             23
4136     2.0             1.0000             23
4137     1.0             1.0000             23
4138     0.0             1.0000             23
4152     2.0             1.0000             23
4153     0.0             1.0000             23
4154     1.0             1.0000             23
4156     1.0             1.0000             23
4272     2.0             1.0000             23
4341     0.0             1.0000             23
4410     2.0             1.0000             23
4508     1.0             1.0000             23
4631     2.0             1.0000             23
4736     2.0             1.0000             23
4840     2.0             1.0000             23
5305     1.0             1.0000             23
No. of records with gender 0 in cluster 23 is 9
No. of records with gender 1 in cluster 23 is 9
No. of records with gender 2 in cluster 23 is 11

Records found in cluster 24 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
3744     0.0             0.6440             24
3927     0.0             1.0000             24
3994     1.0             1.0000             24
4057     2.0             0.3516             24
4300     2.0             0.6736             24
4398     1.0             1.0000             24
4470     2.0             0.6602             24
4544     0.0             1.0000             24
4640     2.0             1.0000             24
4800     2.0             0.6575             24
4883     2.0             1.0000             24
5043     1.0             1.0000             24
5238     1.0             1.0000             24
5325     1.0             0.6645             24
5515     2.0             1.0000             24
5659     1.0             1.0000             24
5978     2.0             1.0000             24
6188     2.0             0.6748             24
6440     2.0             1.0000             24
6562     0.0             1.0000             24
6671     2.0             1.0000             24
6749     1.0             1.0000             24
6826     2.0             0.6933             24
7050     0.0             0.6736             24
No. of records with gender 0 in cluster 24 is 5
No. of records with gender 1 in cluster 24 is 7
No. of records with gender 2 in cluster 24 is 12

Records found in cluster 25 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
4012     1.0             1.0000             25
4097     0.0             0.6706             25
4177     0.0             0.6729             25
4219     0.0             1.0000             25
4226     2.0             1.0000             25
...      ...                ...            ...
5777     2.0             0.6638             25
5809     0.0             1.0000             25
5849     0.0             0.6792             25
5881     2.0             1.0000             25
5910     0.0             0.6787             25

[94 rows x 3 columns]
No. of records with gender 0 in cluster 25 is 33
No. of records with gender 1 in cluster 25 is 23
No. of records with gender 2 in cluster 25 is 38

Records found in cluster 26 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
4498      0.0             1.0000             26
6783      2.0             1.0000             26
10814     0.0             1.0000             26
14468     1.0             1.0000             26
14630     1.0             1.0000             26
14664     2.0             1.0000             26
14804     1.0             1.0000             26
15040     1.0             1.0000             26
15267     1.0             0.6608             26
16204     1.0             1.0000             26
No. of records with gender 0 in cluster 26 is 2
No. of records with gender 1 in cluster 26 is 6
No. of records with gender 2 in cluster 26 is 2

Records found in cluster 27 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
4572      1.0             1.0000             27
4606      0.0             1.0000             27
4627      2.0             1.0000             27
4690      0.0             0.6763             27
4746      1.0             1.0000             27
...       ...                ...            ...
8052      0.0             0.7050             27
8391      0.0             1.0000             27
8411      1.0             1.0000             27
18789     0.0             1.0000             27
18803     1.0             1.0000             27

[148 rows x 3 columns]
No. of records with gender 0 in cluster 27 is 46
No. of records with gender 1 in cluster 27 is 36
No. of records with gender 2 in cluster 27 is 66

Records found in cluster 28 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
4772     2.0             1.0000             28
4789     2.0             1.0000             28
4853     0.0             1.0000             28
4917     1.0             0.6571             28
4949     2.0             1.0000             28
...      ...                ...            ...
9206     2.0             0.3398             28
9215     1.0             0.6818             28
9253     2.0             1.0000             28
9278     1.0             1.0000             28
9294     0.0             1.0000             28

[127 rows x 3 columns]
No. of records with gender 0 in cluster 28 is 31
No. of records with gender 1 in cluster 28 is 31
No. of records with gender 2 in cluster 28 is 65

Records found in cluster 29 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
4965      2.0             0.6695             29
5384      2.0             1.0000             29
5485      2.0             1.0000             29
5683      2.0             1.0000             29
5800      1.0             1.0000             29
7510      0.0             1.0000             29
8081      2.0             1.0000             29
8479      2.0             0.3625             29
8557      0.0             1.0000             29
8655      1.0             1.0000             29
8987      2.0             1.0000             29
9070      0.0             1.0000             29
9289      2.0             1.0000             29
9313      2.0             0.6841             29
10058     2.0             1.0000             29
10070     1.0             1.0000             29
10084     0.0             1.0000             29
10092     1.0             1.0000             29
10102     2.0             1.0000             29
10116     2.0             1.0000             29
10131     0.0             1.0000             29
10143     2.0             1.0000             29
10167     1.0             0.3495             29
11175     0.0             1.0000             29
No. of records with gender 0 in cluster 29 is 6
No. of records with gender 1 in cluster 29 is 5
No. of records with gender 2 in cluster 29 is 13

Records found in cluster 30 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
4995      2.0             1.0000             30
5372      2.0             1.0000             30
5627      2.0             0.6559             30
5919      2.0             1.0000             30
6208      1.0             0.6543             30
6496      2.0             0.6716             30
7060      1.0             0.6890             30
7439      0.0             1.0000             30
7683      1.0             0.6699             30
7894      0.0             1.0000             30
7902      0.0             1.0000             30
8408      0.0             1.0000             30
8933      1.0             1.0000             30
10448     2.0             0.6544             30
No. of records with gender 0 in cluster 30 is 4
No. of records with gender 1 in cluster 30 is 4
No. of records with gender 2 in cluster 30 is 6

Records found in cluster 31 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
5147      1.0             1.0000             31
17729     0.0             1.0000             31
17730     0.0             1.0000             31
17731     2.0             1.0000             31
17733     0.0             1.0000             31
17734     1.0             1.0000             31
17737     0.0             1.0000             31
17739     1.0             1.0000             31
17741     0.0             1.0000             31
17761     0.0             1.0000             31
17766     1.0             1.0000             31
17767     2.0             0.6637             31
17768     1.0             1.0000             31
17823     2.0             1.0000             31
17827     1.0             0.6773             31
17874     2.0             1.0000             31
17875     1.0             1.0000             31
17898     1.0             1.0000             31
17901     0.0             0.6676             31
17928     0.0             1.0000             31
17931     0.0             1.0000             31
17932     1.0             1.0000             31
17933     1.0             0.6807             31
17956     0.0             1.0000             31
17962     2.0             1.0000             31
17969     0.0             1.0000             31
17974     0.0             1.0000             31
17975     1.0             1.0000             31
17991     1.0             1.0000             31
18042     1.0             1.0000             31
18047     1.0             1.0000             31
18049     2.0             1.0000             31
18052     0.0             0.6660             31
18055     1.0             1.0000             31
18057     1.0             0.6557             31
18062     1.0             1.0000             31
18117     0.0             0.6664             31
18118     0.0             1.0000             31
18124     0.0             1.0000             31
18170     0.0             1.0000             31
18175     1.0             1.0000             31
18215     2.0             0.6545             31
18218     1.0             1.0000             31
18228     1.0             1.0000             31
18229     0.0             0.6827             31
18230     2.0             1.0000             31
18231     0.0             1.0000             31
18233     1.0             0.3352             31
18236     0.0             1.0000             31
18354     0.0             1.0000             31
18368     1.0             1.0000             31
18371     2.0             1.0000             31
18373     0.0             1.0000             31
18374     1.0             1.0000             31
No. of records with gender 0 in cluster 31 is 22
No. of records with gender 1 in cluster 31 is 23
No. of records with gender 2 in cluster 31 is 9

Records found in cluster 32 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
5206      1.0             1.0000             32
5629      2.0             1.0000             32
5640      0.0             1.0000             32
5944      1.0             1.0000             32
6093      1.0             0.6653             32
6157      2.0             0.6567             32
6174      2.0             0.6619             32
6409      0.0             1.0000             32
6514      1.0             1.0000             32
13356     1.0             1.0000             32
No. of records with gender 0 in cluster 32 is 2
No. of records with gender 1 in cluster 32 is 5
No. of records with gender 2 in cluster 32 is 3

Records found in cluster 33 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
5665     1.0             1.0000             33
6109     0.0             1.0000             33
6206     1.0             1.0000             33
6381     1.0             1.0000             33
6390     1.0             1.0000             33
6502     0.0             1.0000             33
6576     2.0             1.0000             33
6580     2.0             1.0000             33
6664     2.0             1.0000             33
6685     2.0             1.0000             33
6789     2.0             1.0000             33
6858     1.0             1.0000             33
6876     0.0             1.0000             33
6992     2.0             1.0000             33
7040     1.0             1.0000             33
7043     0.0             1.0000             33
7065     2.0             1.0000             33
7109     1.0             1.0000             33
7148     0.0             0.6750             33
7273     1.0             1.0000             33
7399     0.0             0.3272             33
7421     2.0             0.6802             33
7430     2.0             0.6812             33
7440     0.0             1.0000             33
7581     1.0             1.0000             33
7586     0.0             1.0000             33
7611     0.0             0.6666             33
7614     2.0             0.6866             33
7622     2.0             1.0000             33
7626     0.0             1.0000             33
7655     1.0             1.0000             33
7669     2.0             1.0000             33
7679     1.0             1.0000             33
7705     1.0             1.0000             33
7757     2.0             1.0000             33
7793     0.0             0.6691             33
7817     0.0             1.0000             33
7820     0.0             1.0000             33
7827     2.0             0.3472             33
7888     2.0             0.6506             33
7897     0.0             0.6803             33
7959     0.0             0.6823             33
8033     0.0             0.6701             33
8055     0.0             1.0000             33
8062     1.0             1.0000             33
8118     1.0             1.0000             33
8177     2.0             1.0000             33
8251     0.0             0.6624             33
8358     2.0             0.6965             33
8385     1.0             1.0000             33
8466     0.0             1.0000             33
8470     1.0             1.0000             33
No. of records with gender 0 in cluster 33 is 19
No. of records with gender 1 in cluster 33 is 16
No. of records with gender 2 in cluster 33 is 17

Records found in cluster 34 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
5853      2.0             0.6619             34
6244      2.0             1.0000             34
8255      2.0             0.6672             34
9773      0.0             0.6607             34
10211     1.0             1.0000             34
10698     1.0             0.6795             34
12736     1.0             0.6619             34
14216     1.0             1.0000             34
14307     2.0             0.6617             34
15333     1.0             1.0000             34
15424     0.0             0.6608             34
15800     1.0             1.0000             34
16873     1.0             1.0000             34
17596     1.0             1.0000             34
18337     1.0             1.0000             34
No. of records with gender 0 in cluster 34 is 2
No. of records with gender 1 in cluster 34 is 9
No. of records with gender 2 in cluster 34 is 4

Records found in cluster 35 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
6080     1.0             1.0000             35
7002     2.0             1.0000             35
7016     0.0             1.0000             35
7091     1.0             0.6642             35
7095     2.0             1.0000             35
...      ...                ...            ...
9150     1.0             1.0000             35
9165     0.0             1.0000             35
9216     2.0             0.6519             35
9221     2.0             1.0000             35
9243     0.0             0.3506             35

[62 rows x 3 columns]
No. of records with gender 0 in cluster 35 is 13
No. of records with gender 1 in cluster 35 is 20
No. of records with gender 2 in cluster 35 is 29

Records found in cluster 36 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
7289      0.0             1.0000             36
12796     1.0             1.0000             36
13303     1.0             1.0000             36
13417     1.0             1.0000             36
13502     1.0             1.0000             36
13716     1.0             0.6830             36
13901     2.0             0.6611             36
14140     0.0             0.6645             36
14214     2.0             1.0000             36
14269     2.0             0.6868             36
14337     1.0             1.0000             36
14412     1.0             1.0000             36
14483     0.0             1.0000             36
14645     1.0             1.0000             36
15443     2.0             1.0000             36
15534     0.0             1.0000             36
15807     0.0             1.0000             36
15916     1.0             1.0000             36
16188     1.0             1.0000             36
16418     2.0             1.0000             36
16672     1.0             1.0000             36
16725     1.0             1.0000             36
17269     0.0             1.0000             36
17351     1.0             0.6556             36
17442     1.0             1.0000             36
17842     0.0             1.0000             36
18412     2.0             0.6690             36
18510     1.0             1.0000             36
18731     1.0             1.0000             36
18738     2.0             1.0000             36
No. of records with gender 0 in cluster 36 is 7
No. of records with gender 1 in cluster 36 is 16
No. of records with gender 2 in cluster 36 is 7

Records found in cluster 37 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
7381      2.0             1.0000             37
7470      1.0             0.6810             37
7542      0.0             1.0000             37
7616      2.0             0.6675             37
7675      2.0             1.0000             37
7744      2.0             0.6761             37
7795      1.0             0.6602             37
7871      2.0             1.0000             37
7946      1.0             1.0000             37
8010      1.0             1.0000             37
8069      1.0             1.0000             37
8125      1.0             1.0000             37
8180      1.0             0.6850             37
8253      2.0             1.0000             37
8395      1.0             1.0000             37
8477      1.0             1.0000             37
8532      1.0             1.0000             37
8587      2.0             1.0000             37
8657      1.0             1.0000             37
8755      0.0             0.6707             37
8810      0.0             1.0000             37
8906      1.0             0.7047             37
8977      1.0             1.0000             37
9039      1.0             1.0000             37
9101      0.0             0.3496             37
9172      0.0             1.0000             37
9247      2.0             0.6622             37
9317      0.0             1.0000             37
17122     2.0             0.6583             37
No. of records with gender 0 in cluster 37 is 6
No. of records with gender 1 in cluster 37 is 14
No. of records with gender 2 in cluster 37 is 9

Records found in cluster 38 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
9515      0.0             0.6648             38
10396     1.0             1.0000             38
10608     1.0             1.0000             38
10796     0.0             0.6912             38
10981     0.0             1.0000             38
11477     2.0             1.0000             38
11770     2.0             1.0000             38
12451     2.0             1.0000             38
12803     1.0             0.6667             38
12996     1.0             1.0000             38
13263     2.0             0.6743             38
13436     0.0             1.0000             38
14141     0.0             1.0000             38
14290     0.0             1.0000             38
14473     0.0             1.0000             38
14878     2.0             0.6502             38
15088     0.0             0.6581             38
15727     2.0             1.0000             38
16605     0.0             0.6578             38
16973     0.0             1.0000             38
17197     1.0             1.0000             38
17330     0.0             1.0000             38
17728     1.0             0.6702             38
18071     2.0             1.0000             38
18531     2.0             1.0000             38
No. of records with gender 0 in cluster 38 is 11
No. of records with gender 1 in cluster 38 is 6
No. of records with gender 2 in cluster 38 is 8

Records found in cluster 39 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
9856      2.0             1.0000             39
10008     0.0             1.0000             39
10075     1.0             1.0000             39
10150     1.0             1.0000             39
10237     2.0             1.0000             39
10318     2.0             1.0000             39
10385     1.0             0.3592             39
10471     1.0             1.0000             39
10633     2.0             0.6545             39
10716     0.0             0.6794             39
10776     0.0             1.0000             39
10849     2.0             1.0000             39
10964     2.0             1.0000             39
11050     0.0             1.0000             39
11118     2.0             0.6666             39
11190     1.0             1.0000             39
11251     1.0             0.6715             39
11356     2.0             1.0000             39
11429     2.0             1.0000             39
11502     2.0             1.0000             39
11590     0.0             1.0000             39
11653     0.0             1.0000             39
11767     2.0             1.0000             39
11842     1.0             1.0000             39
11930     1.0             1.0000             39
12045     1.0             1.0000             39
12132     1.0             0.6858             39
12195     0.0             0.6564             39
12284     1.0             1.0000             39
12397     0.0             1.0000             39
12507     2.0             1.0000             39
12659     2.0             1.0000             39
12754     2.0             0.6615             39
No. of records with gender 0 in cluster 39 is 8
No. of records with gender 1 in cluster 39 is 11
No. of records with gender 2 in cluster 39 is 14

Records found in cluster 40 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
10710     1.0             1.0000             40
11127     2.0             1.0000             40
11929     0.0             1.0000             40
12857     0.0             1.0000             40
12921     1.0             1.0000             40
12962     1.0             1.0000             40
13047     2.0             0.6509             40
13110     0.0             0.6414             40
13132     0.0             0.6527             40
13159     2.0             1.0000             40
13221     1.0             1.0000             40
13254     0.0             0.6618             40
13289     0.0             0.6711             40
17886     2.0             1.0000             40
No. of records with gender 0 in cluster 40 is 6
No. of records with gender 1 in cluster 40 is 4
No. of records with gender 2 in cluster 40 is 4

Records found in cluster 41 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
10888     1.0             1.0000             41
12233     2.0             0.3374             41
13608     1.0             1.0000             41
14053     0.0             1.0000             41
14500     2.0             0.3449             41
15128     1.0             1.0000             41
15717     2.0             1.0000             41
16776     2.0             1.0000             41
No. of records with gender 0 in cluster 41 is 1
No. of records with gender 1 in cluster 41 is 3
No. of records with gender 2 in cluster 41 is 4

Records found in cluster 42 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
16388     2.0             1.0000             42
17041     1.0             1.0000             42
17154     1.0             1.0000             42
17297     0.0             1.0000             42
17565     1.0             1.0000             42
17677     1.0             1.0000             42
17868     2.0             0.3354             42
18092     0.0             1.0000             42
18246     1.0             1.0000             42
18399     0.0             1.0000             42
18527     1.0             1.0000             42
18646     0.0             1.0000             42
18759     0.0             0.6386             42
No. of records with gender 0 in cluster 42 is 5
No. of records with gender 1 in cluster 42 is 6
No. of records with gender 2 in cluster 42 is 2

Records found in cluster 43 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
17732     2.0             0.3417             43
17735     0.0             1.0000             43
17736     0.0             1.0000             43
17738     1.0             1.0000             43
17740     1.0             1.0000             43
...       ...                ...            ...
18367     2.0             1.0000             43
18369     2.0             1.0000             43
18370     0.0             0.6591             43
18372     2.0             1.0000             43
18375     0.0             1.0000             43

[98 rows x 3 columns]
No. of records with gender 0 in cluster 43 is 44
No. of records with gender 1 in cluster 43 is 35
No. of records with gender 2 in cluster 43 is 19
Records classified as noise
Empty DataFrame
Columns: [gender, gender:confidence, Cluster_Label]
Index: []

==================================================
EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES
==================================================

Data with Only Numerical and Categorical Features
<class 'pandas.core.frame.DataFrame'>
Index: 19970 entries, 0 to 18833
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   retweets_per_day        18836 non-null  float64
 1   favorites_per_day       18836 non-null  float64
 2   tweets_per_day          18836 non-null  float64
 3   profile_created_year    18836 non-null  float64
 4   tweet_created_year      18836 non-null  float64
 5   tweet_location_encoded  18836 non-null  float64
 6   user_timezone_encoded   18836 non-null  float64
 7   gender                  18836 non-null  float64
 8   gender:confidence       18836 non-null  float64
dtypes: float64(9)
memory usage: 1.5 MB
None

Removing NaN values...
Dropping gender and gender:confidence...

Dataset for Exp 2
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   retweets_per_day        17702 non-null  float64
 1   favorites_per_day       17702 non-null  float64
 2   tweets_per_day          17702 non-null  float64
 3   profile_created_year    17702 non-null  float64
 4   tweet_created_year      17702 non-null  float64
 5   tweet_location_encoded  17702 non-null  float64
 6   user_timezone_encoded   17702 non-null  float64
dtypes: float64(7)
memory usage: 1.1 MB
None

   retweets_per_day  favorites_per_day  tweets_per_day  profile_created_year  \
0         -0.100504          -0.318861        1.467429              0.497680   
1         -0.100504          -0.313379       -0.582882              0.028171   
2          9.949874           0.437997       -0.593862              0.967189   
3         -0.100504          -0.306100       -0.691862             -1.380358   
4         -0.100504           3.133457       -0.075048              0.967189   

   tweet_created_year  tweet_location_encoded  user_timezone_encoded  
0                 0.0                0.000053               0.001699  
1                 0.0                0.363294               0.127309  
2                 0.0                0.000053               0.002071  
3                 0.0                0.000159               0.105755  
4                 0.0                0.363294               0.381344  
Applying UMAP for dim reduction...
[I 2024-09-20 16:23:48,010] A new study created in memory with name: no-name-f9550dca-d1e6-48ba-a676-9e841b5672d2
(17702, 3)

Performing K-Means Clustering...
[I 2024-09-20 16:23:51,598] Trial 0 finished with value: 0.3289636639097889 and parameters: {'n_clusters': 7, 'init': 'k-means++'}. Best is trial 0 with value: 0.3289636639097889.
[I 2024-09-20 16:23:55,202] Trial 1 finished with value: 0.37002461668723785 and parameters: {'n_clusters': 3, 'init': 'k-means++'}. Best is trial 1 with value: 0.37002461668723785.
[I 2024-09-20 16:23:58,694] Trial 2 finished with value: 0.34236426949671833 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 1 with value: 0.37002461668723785.
[I 2024-09-20 16:24:02,334] Trial 3 finished with value: 0.42740275911790543 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 3 with value: 0.42740275911790543.
[I 2024-09-20 16:24:05,980] Trial 4 finished with value: 0.3327138746593811 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 3 with value: 0.42740275911790543.
[I 2024-09-20 16:24:09,807] Trial 5 finished with value: 0.43672661408383706 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 5 with value: 0.43672661408383706.
[I 2024-09-20 16:24:13,417] Trial 6 finished with value: 0.39882111670484854 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 5 with value: 0.43672661408383706.
[I 2024-09-20 16:24:17,670] Trial 7 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:21,464] Trial 8 finished with value: 0.4278636679375091 and parameters: {'n_clusters': 5, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:25,371] Trial 9 finished with value: 0.39882111670484854 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:29,537] Trial 10 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:33,153] Trial 11 finished with value: 0.3527991920640622 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:37,183] Trial 12 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:41,210] Trial 13 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:44,823] Trial 14 finished with value: 0.35844394490438636 and parameters: {'n_clusters': 10, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
Best params: {'n_clusters': 2, 'init': 'random'}
No description has been provided for this image
No description has been provided for this image
[I 2024-09-20 16:24:49,788] A new study created in memory with name: no-name-9347cc24-2c6f-4ad3-b792-5f023401ccdb
The Silhouette score is 0.7076370412066645
The Callinski index is 4482.755124226919

Dataset with Labels from KMeans in Exp 2
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              1
1     0.0             1.0000              1
2     0.0             0.6625              0
3     0.0             1.0000              1
4     1.0             1.0000              1

Records found in cluster 0 from KMeans in Exp 2
       gender  gender:confidence  Cluster_Label
2         0.0             0.6625              0
257       1.0             1.0000              0
286       2.0             1.0000              0
392       2.0             0.6576              0
429       1.0             1.0000              0
...       ...                ...            ...
18649     0.0             1.0000              0
18720     0.0             1.0000              0
18765     1.0             1.0000              0
18784     2.0             1.0000              0
18796     0.0             0.6760              0

[371 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 135
No. of records with gender 1 in cluster 0 is 103
No. of records with gender 2 in cluster 0 is 133

Records found in cluster 1 from KMeans in Exp 2
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              1
1         0.0             1.0000              1
3         0.0             1.0000              1
4         1.0             1.0000              1
5         1.0             1.0000              1
...       ...                ...            ...
18829     1.0             1.0000              1
18831     0.0             0.6466              1
18832     1.0             1.0000              1
18834     1.0             1.0000              1
18835     0.0             0.6772              1

[17331 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 5708
No. of records with gender 1 in cluster 1 is 6098
No. of records with gender 2 in cluster 1 is 5525

Performing DBSCAN Clustering...
[I 2024-09-20 16:24:59,626] Trial 0 finished with value: 0.7395551461504506 and parameters: {'eps': 1.8612774559273246, 'min_samples': 17}. Best is trial 0 with value: 0.7395551461504506.
[I 2024-09-20 16:25:09,593] Trial 1 finished with value: 0.7395551461504506 and parameters: {'eps': 1.8222959595129316, 'min_samples': 16}. Best is trial 0 with value: 0.7395551461504506.
[I 2024-09-20 16:25:19,610] Trial 2 finished with value: 0.7286394402690954 and parameters: {'eps': 1.4969908447691442, 'min_samples': 15}. Best is trial 0 with value: 0.7395551461504506.
[I 2024-09-20 16:25:27,226] Trial 3 finished with value: 0.5245068941307643 and parameters: {'eps': 0.572252884608168, 'min_samples': 14}. Best is trial 0 with value: 0.7395551461504506.
[I 2024-09-20 16:25:37,404] Trial 4 finished with value: 0.7535863974003295 and parameters: {'eps': 1.9348801547784897, 'min_samples': 14}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:25:47,748] Trial 5 finished with value: 0.7378543022519933 and parameters: {'eps': 1.6937724424520773, 'min_samples': 17}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:25:57,605] Trial 6 finished with value: 0.7283183555413514 and parameters: {'eps': 1.8007992794733476, 'min_samples': 5}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:26:05,851] Trial 7 finished with value: 0.41821414951083047 and parameters: {'eps': 0.7293348370290341, 'min_samples': 7}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:26:14,792] Trial 8 finished with value: 0.7014382428001366 and parameters: {'eps': 1.0030585304662154, 'min_samples': 9}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:26:24,590] Trial 9 finished with value: 0.7561451190368602 and parameters: {'eps': 1.871623880808503, 'min_samples': 8}. Best is trial 9 with value: 0.7561451190368602.
[I 2024-09-20 16:26:33,948] Trial 10 finished with value: 0.5090875645314461 and parameters: {'eps': 1.2834097282566614, 'min_samples': 3}. Best is trial 9 with value: 0.7561451190368602.
[I 2024-09-20 16:26:43,291] Trial 11 finished with value: 0.7175191411766557 and parameters: {'eps': 1.2828100669180271, 'min_samples': 11}. Best is trial 9 with value: 0.7561451190368602.
[I 2024-09-20 16:26:53,530] Trial 12 finished with value: 0.7582461095915987 and parameters: {'eps': 1.9873182476645224, 'min_samples': 12}. Best is trial 12 with value: 0.7582461095915987.
[I 2024-09-20 16:26:58,050] Trial 13 finished with value: -0.4771117854083832 and parameters: {'eps': 0.133931940094827, 'min_samples': 11}. Best is trial 12 with value: 0.7582461095915987.
[I 2024-09-20 16:27:07,751] Trial 14 finished with value: 0.727389258590315 and parameters: {'eps': 1.5756102900269209, 'min_samples': 20}. Best is trial 12 with value: 0.7582461095915987.
Found best params: {'eps': 1.9873182476645224, 'min_samples': 12}
No description has been provided for this image
The Silhouette score is 0.7582461095915987
The Callinski index is 336.17121436944564

Dataset with Labels from DBSCAN in Exp 2
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from DBSCAN in Exp 2
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[17677 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5832
No. of records with gender 1 in cluster 0 is 6195
No. of records with gender 2 in cluster 0 is 5650
Records classified as noise
       gender  gender:confidence  Cluster_Label
1116      2.0             1.0000             -1
2115      0.0             1.0000             -1
2502      0.0             0.6785             -1
2869      2.0             0.6489             -1
3301      0.0             1.0000             -1
4127      2.0             1.0000             -1
4150      1.0             1.0000             -1
5613      1.0             1.0000             -1
6722      1.0             1.0000             -1
7666      2.0             1.0000             -1
9210      0.0             1.0000             -1
10926     0.0             0.6513             -1
12010     0.0             1.0000             -1
12504     0.0             1.0000             -1
12668     0.0             1.0000             -1
13204     1.0             1.0000             -1
13331     1.0             1.0000             -1
13788     1.0             1.0000             -1
14567     2.0             1.0000             -1
15940     0.0             1.0000             -1
16326     2.0             0.3515             -1
17960     0.0             1.0000             -1
18012     0.0             1.0000             -1
18585     2.0             1.0000             -1
18763     2.0             1.0000             -1

==================================================
EXP 3: USING ONLY TEXT FEATURES
==================================================
Dataset for Exp 3
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Columns: 3000 entries, desc_0 to text_1499
dtypes: float64(3000)
memory usage: 405.3 MB
None

   desc_0  desc_1  desc_2    desc_3  desc_4  desc_5  desc_6  desc_7  desc_8  \
0     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   

   desc_9  ...  text_1490  text_1491  text_1492  text_1493  text_1494  \
0     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   
1     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   
2     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   
3     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   
4     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   

   text_1495  text_1496  text_1497  text_1498  text_1499  
0  -0.142733  -0.100504        0.0        0.0        0.0  
1  -0.142733  -0.100504        0.0        0.0        0.0  
2  -0.142733  -0.100504        0.0        0.0        0.0  
3  -0.142733  -0.100504        0.0        0.0        0.0  
4  -0.142733  -0.100504        0.0        0.0        0.0  

[5 rows x 3000 columns]
Applying UMAP for dim reduction...
[I 2024-09-20 16:30:24,130] A new study created in memory with name: no-name-36a03b0d-863a-4fa2-a02e-19e458c477ae
Performing K-Means Clustering...
[I 2024-09-20 16:30:29,152] Trial 0 finished with value: 0.3791390061378479 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 0 with value: 0.3791390061378479.
[I 2024-09-20 16:30:34,447] Trial 1 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396.
[I 2024-09-20 16:30:39,434] Trial 2 finished with value: 0.37424296140670776 and parameters: {'n_clusters': 9, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396.
[I 2024-09-20 16:30:44,536] Trial 3 finished with value: 0.3678858280181885 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396.
[I 2024-09-20 16:30:49,831] Trial 4 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:30:54,927] Trial 5 finished with value: 0.4103359282016754 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:00,242] Trial 6 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:05,516] Trial 7 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:10,792] Trial 8 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:15,863] Trial 9 finished with value: 0.3678858280181885 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:21,412] Trial 10 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:26,834] Trial 11 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:32,286] Trial 12 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:37,423] Trial 13 finished with value: 0.35118553042411804 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:42,827] Trial 14 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
Best params: {'n_clusters': 4, 'init': 'random'}
No description has been provided for this image
No description has been provided for this image
The Silhouette score is 0.718830943107605
The Callinski index is 10019.619140625

Dataset with Labels from KMeans in Exp 3
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              2
1     0.0             1.0000              2
2     0.0             0.6625              2
3     0.0             1.0000              2
4     1.0             1.0000              2

Records found in cluster 0 from KMeans in Exp 3
       gender  gender:confidence  Cluster_Label
42        2.0             1.0000              0
62        1.0             1.0000              0
166       0.0             1.0000              0
173       2.0             1.0000              0
190       2.0             0.6780              0
...       ...                ...            ...
18624     1.0             1.0000              0
18654     0.0             1.0000              0
18656     1.0             1.0000              0
18673     0.0             1.0000              0
18722     1.0             0.3371              0

[865 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 307
No. of records with gender 1 in cluster 0 is 272
No. of records with gender 2 in cluster 0 is 286

Records found in cluster 1 from KMeans in Exp 3
       gender  gender:confidence  Cluster_Label
113       1.0             1.0000              1
230       1.0             0.6755              1
502       0.0             1.0000              1
578       1.0             1.0000              1
644       0.0             1.0000              1
...       ...                ...            ...
17448     0.0             1.0000              1
18208     0.0             1.0000              1
18679     1.0             1.0000              1
18753     0.0             0.6678              1
18824     2.0             1.0000              1

[469 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 122
No. of records with gender 1 in cluster 1 is 146
No. of records with gender 2 in cluster 1 is 201

Records found in cluster 2 from KMeans in Exp 3
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              2
1         0.0             1.0000              2
2         0.0             0.6625              2
3         0.0             1.0000              2
4         1.0             1.0000              2
...       ...                ...            ...
18829     1.0             1.0000              2
18831     0.0             0.6466              2
18832     1.0             1.0000              2
18834     1.0             1.0000              2
18835     0.0             0.6772              2

[15810 rows x 3 columns]
No. of records with gender 0 in cluster 2 is 5260
No. of records with gender 1 in cluster 2 is 5607
No. of records with gender 2 in cluster 2 is 4943

Records found in cluster 3 from KMeans in Exp 3
[I 2024-09-20 16:31:50,732] A new study created in memory with name: no-name-3c57886b-d112-4b57-9312-be650d49f11f
       gender  gender:confidence  Cluster_Label
261       1.0                1.0              3
336       0.0                1.0              3
575       0.0                1.0              3
929       1.0                1.0              3
1172      0.0                1.0              3
...       ...                ...            ...
18510     1.0                1.0              3
18609     1.0                1.0              3
18731     1.0                1.0              3
18738     2.0                1.0              3
18764     1.0                1.0              3

[558 rows x 3 columns]
No. of records with gender 0 in cluster 3 is 154
No. of records with gender 1 in cluster 3 is 176
No. of records with gender 2 in cluster 3 is 228

Performing DBSCAN Clustering...
[I 2024-09-20 16:31:56,283] Trial 0 finished with value: 0.07267794013023376 and parameters: {'eps': 0.224350065881816, 'min_samples': 12}. Best is trial 0 with value: 0.07267794013023376.
[I 2024-09-20 16:32:03,308] Trial 1 finished with value: 0.5760640501976013 and parameters: {'eps': 1.981346472478528, 'min_samples': 13}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:09,091] Trial 2 finished with value: 0.456826776266098 and parameters: {'eps': 0.5994244424252012, 'min_samples': 10}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:15,888] Trial 3 finished with value: 0.5654299259185791 and parameters: {'eps': 1.6306995770185833, 'min_samples': 12}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:21,989] Trial 4 finished with value: 0.40619421005249023 and parameters: {'eps': 0.9888580285943404, 'min_samples': 7}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:27,401] Trial 5 finished with value: 0.0672125294804573 and parameters: {'eps': 0.2212632309988171, 'min_samples': 15}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:33,226] Trial 6 finished with value: 0.3273886442184448 and parameters: {'eps': 0.8245746726811352, 'min_samples': 5}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:39,816] Trial 7 finished with value: 0.5654299259185791 and parameters: {'eps': 1.5980627491197157, 'min_samples': 11}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:46,496] Trial 8 finished with value: 0.5481722950935364 and parameters: {'eps': 1.6967788919940698, 'min_samples': 10}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:53,073] Trial 9 finished with value: 0.45212018489837646 and parameters: {'eps': 1.5229068478428347, 'min_samples': 9}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:00,076] Trial 10 finished with value: 0.5555833578109741 and parameters: {'eps': 1.9667767292648453, 'min_samples': 20}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:06,982] Trial 11 finished with value: 0.5760640501976013 and parameters: {'eps': 1.9662951357354617, 'min_samples': 14}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:13,834] Trial 12 finished with value: 0.5699435472488403 and parameters: {'eps': 1.964896277269623, 'min_samples': 16}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:20,152] Trial 13 finished with value: 0.563732922077179 and parameters: {'eps': 1.3014195484895565, 'min_samples': 15}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:26,409] Trial 14 finished with value: 0.5307610034942627 and parameters: {'eps': 1.271279456944655, 'min_samples': 18}. Best is trial 1 with value: 0.5760640501976013.
Found best params: {'eps': 1.981346472478528, 'min_samples': 13}
No description has been provided for this image
The Silhouette score is 0.5760640501976013
The Callinski index is 1357.848876953125

Dataset with Labels from DBSCAN in Exp 3
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[15963 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5340
No. of records with gender 1 in cluster 0 is 5665
No. of records with gender 2 in cluster 0 is 4958

Records found in cluster 1 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
42        2.0              1.000              1
190       2.0              0.678              1
211       2.0              1.000              1
252       2.0              1.000              1
255       1.0              1.000              1
...       ...                ...            ...
18546     1.0              1.000              1
18573     0.0              1.000              1
18584     1.0              1.000              1
18624     1.0              1.000              1
18656     1.0              1.000              1

[148 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 44
No. of records with gender 1 in cluster 1 is 52
No. of records with gender 2 in cluster 1 is 52

Records found in cluster 2 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
113       1.0             1.0000              2
6301      2.0             1.0000              2
6302      2.0             1.0000              2
6309      1.0             0.3750              2
6311      2.0             1.0000              2
6318      1.0             1.0000              2
6319      0.0             0.6471              2
6327      2.0             0.6733              2
6332      0.0             1.0000              2
6358      2.0             0.6692              2
6366      2.0             0.6662              2
6373      2.0             1.0000              2
6374      2.0             1.0000              2
6378      0.0             1.0000              2
6381      1.0             1.0000              2
6383      2.0             0.6754              2
6389      0.0             1.0000              2
6390      1.0             1.0000              2
6391      1.0             1.0000              2
6393      2.0             1.0000              2
6397      1.0             1.0000              2
6398      2.0             1.0000              2
6399      1.0             1.0000              2
8850      0.0             1.0000              2
11402     0.0             1.0000              2
12450     1.0             1.0000              2
13813     0.0             1.0000              2
No. of records with gender 0 in cluster 2 is 7
No. of records with gender 1 in cluster 2 is 9
No. of records with gender 2 in cluster 2 is 11

Records found in cluster 3 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
230       1.0             0.6755              3
7500      1.0             1.0000              3
7502      1.0             0.6617              3
7505      0.0             1.0000              3
7507      0.0             0.6848              3
7508      0.0             1.0000              3
7509      1.0             1.0000              3
7510      0.0             1.0000              3
7511      2.0             1.0000              3
7512      1.0             0.6739              3
7513      0.0             1.0000              3
7524      1.0             1.0000              3
7531      2.0             1.0000              3
7532      2.0             1.0000              3
7534      2.0             1.0000              3
7581      1.0             1.0000              3
7586      0.0             1.0000              3
7593      2.0             1.0000              3
7596      0.0             1.0000              3
7598      2.0             1.0000              3
12002     0.0             1.0000              3
No. of records with gender 0 in cluster 3 is 8
No. of records with gender 1 in cluster 3 is 7
No. of records with gender 2 in cluster 3 is 6

Records found in cluster 4 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
261       1.0             1.0000              4
336       0.0             1.0000              4
929       1.0             1.0000              4
1172      0.0             1.0000              4
1455      1.0             0.6678              4
1686      0.0             1.0000              4
3378      2.0             0.6688              4
3521      2.0             1.0000              4
3544      2.0             1.0000              4
5605      1.0             1.0000              4
5611      2.0             0.6856              4
5616      2.0             1.0000              4
5625      0.0             1.0000              4
5626      2.0             0.6589              4
5632      2.0             0.6651              4
5643      0.0             1.0000              4
5644      1.0             0.6725              4
5661      2.0             1.0000              4
5665      1.0             1.0000              4
5669      2.0             1.0000              4
5670      1.0             0.6752              4
5671      2.0             0.3424              4
5672      2.0             1.0000              4
5673      0.0             0.6761              4
5674      1.0             1.0000              4
5675      2.0             1.0000              4
5679      2.0             0.6816              4
5681      1.0             1.0000              4
5683      2.0             1.0000              4
5685      0.0             1.0000              4
5686      2.0             1.0000              4
5687      2.0             0.6799              4
5689      2.0             0.6805              4
5696      0.0             1.0000              4
5697      0.0             0.6892              4
9387      2.0             1.0000              4
10074     1.0             0.6741              4
10109     1.0             1.0000              4
10453     2.0             1.0000              4
10729     2.0             1.0000              4
10792     0.0             1.0000              4
10928     2.0             0.6605              4
11125     0.0             1.0000              4
11755     1.0             1.0000              4
11820     1.0             1.0000              4
13497     2.0             1.0000              4
14339     1.0             1.0000              4
14581     1.0             1.0000              4
14835     1.0             1.0000              4
15275     1.0             1.0000              4
15841     1.0             1.0000              4
15900     1.0             1.0000              4
15985     2.0             1.0000              4
16529     1.0             1.0000              4
17107     0.0             0.6597              4
17846     1.0             1.0000              4
18764     1.0             1.0000              4
No. of records with gender 0 in cluster 4 is 12
No. of records with gender 1 in cluster 4 is 22
No. of records with gender 2 in cluster 4 is 23

Records found in cluster 5 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
264       0.0             1.0000              5
2740      0.0             1.0000              5
4012      1.0             1.0000              5
4097      0.0             0.6706              5
4100      2.0             1.0000              5
...       ...                ...            ...
16986     0.0             1.0000              5
17182     0.0             1.0000              5
18083     0.0             1.0000              5
18789     0.0             1.0000              5
18803     1.0             1.0000              5

[131 rows x 3 columns]
No. of records with gender 0 in cluster 5 is 48
No. of records with gender 1 in cluster 5 is 33
No. of records with gender 2 in cluster 5 is 50

Records found in cluster 6 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
348       1.0             1.0000              6
3814      0.0             1.0000              6
8923      2.0             1.0000              6
8925      0.0             1.0000              6
8927      1.0             1.0000              6
8930      2.0             1.0000              6
8940      2.0             0.6815              6
8943      2.0             1.0000              6
8944      2.0             0.6641              6
8945      0.0             1.0000              6
8947      2.0             1.0000              6
8948      2.0             1.0000              6
8951      0.0             0.6752              6
8952      1.0             0.6734              6
8953      1.0             1.0000              6
8954      2.0             1.0000              6
8965      2.0             1.0000              6
8971      1.0             1.0000              6
8981      1.0             1.0000              6
8987      2.0             1.0000              6
8988      0.0             1.0000              6
8989      1.0             1.0000              6
8990      2.0             1.0000              6
8991      2.0             0.6728              6
8995      2.0             0.6761              6
8997      0.0             1.0000              6
15702     1.0             0.6739              6
16019     0.0             1.0000              6
16293     0.0             1.0000              6
16469     2.0             0.6755              6
No. of records with gender 0 in cluster 6 is 8
No. of records with gender 1 in cluster 6 is 8
No. of records with gender 2 in cluster 6 is 14

Records found in cluster 7 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
431       0.0             0.6631              7
4374      2.0             1.0000              7
4456      1.0             1.0000              7
4653      2.0             1.0000              7
4995      2.0             1.0000              7
5220      2.0             0.6650              7
5372      2.0             1.0000              7
5749      2.0             1.0000              7
6043      2.0             0.6787              7
6172      2.0             1.0000              7
6208      1.0             0.6543              7
6496      2.0             0.6716              7
6669      0.0             1.0000              7
7060      1.0             0.6890              7
7261      0.0             1.0000              7
7439      0.0             1.0000              7
7683      1.0             0.6699              7
7902      0.0             1.0000              7
8120      1.0             1.0000              7
8360      2.0             0.6854              7
8408      0.0             1.0000              7
9100      0.0             1.0000              7
9333      1.0             1.0000              7
10448     2.0             0.6544              7
10820     0.0             0.6635              7
12961     1.0             1.0000              7
13252     1.0             1.0000              7
13603     1.0             1.0000              7
14102     0.0             1.0000              7
14844     0.0             1.0000              7
15017     1.0             1.0000              7
No. of records with gender 0 in cluster 7 is 10
No. of records with gender 1 in cluster 7 is 10
No. of records with gender 2 in cluster 7 is 11

Records found in cluster 8 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
502      0.0             1.0000              8
578      1.0             1.0000              8
644      0.0             1.0000              8
771      0.0             1.0000              8
963      2.0             1.0000              8
1433     1.0             1.0000              8
1881     0.0             0.6691              8
2762     2.0             0.6670              8
2903     1.0             0.6763              8
3308     0.0             0.3364              8
3353     0.0             1.0000              8
3681     2.0             1.0000              8
3830     0.0             1.0000              8
4305     1.0             1.0000              8
5040     0.0             1.0000              8
5479     0.0             0.6857              8
5742     0.0             1.0000              8
6460     2.0             1.0000              8
6862     1.0             1.0000              8
8397     2.0             0.6634              8
8516     2.0             0.6839              8
8918     2.0             1.0000              8
No. of records with gender 0 in cluster 8 is 10
No. of records with gender 1 in cluster 8 is 5
No. of records with gender 2 in cluster 8 is 7

Records found in cluster 9 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
575      0.0             1.0000              9
1308     0.0             0.6479              9
2033     1.0             1.0000              9
2308     1.0             0.6774              9
3898     0.0             1.0000              9
5454     2.0             0.6774              9
5539     1.0             1.0000              9
5628     2.0             1.0000              9
5825     1.0             1.0000              9
5847     2.0             0.6717              9
6012     0.0             1.0000              9
6048     2.0             0.6796              9
6114     1.0             0.6620              9
6335     2.0             1.0000              9
6382     2.0             0.6842              9
6417     2.0             1.0000              9
7843     2.0             1.0000              9
8181     0.0             1.0000              9
8355     2.0             0.6778              9
8738     0.0             1.0000              9
No. of records with gender 0 in cluster 9 is 6
No. of records with gender 1 in cluster 9 is 5
No. of records with gender 2 in cluster 9 is 9

Records found in cluster 10 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
625      1.0             1.0000             10
7101     2.0             1.0000             10
7102     0.0             1.0000             10
7105     0.0             1.0000             10
7109     1.0             1.0000             10
7113     2.0             0.6718             10
7115     0.0             0.3451             10
7123     0.0             1.0000             10
7128     2.0             0.6585             10
7130     2.0             1.0000             10
7136     1.0             0.6835             10
7148     0.0             0.6750             10
7153     1.0             1.0000             10
7158     1.0             1.0000             10
7162     1.0             1.0000             10
7166     2.0             0.6635             10
7176     1.0             1.0000             10
7184     2.0             1.0000             10
No. of records with gender 0 in cluster 10 is 5
No. of records with gender 1 in cluster 10 is 7
No. of records with gender 2 in cluster 10 is 6

Records found in cluster 11 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
780       1.0             1.0000             11
2335      0.0             1.0000             11
4874      0.0             1.0000             11
5800      1.0             1.0000             11
5807      1.0             1.0000             11
5809      0.0             1.0000             11
5810      1.0             1.0000             11
5819      2.0             0.6667             11
5835      2.0             1.0000             11
5838      2.0             1.0000             11
5841      2.0             0.6645             11
5843      0.0             0.6658             11
5846      2.0             1.0000             11
5849      0.0             0.6792             11
5861      2.0             0.6808             11
5862      0.0             1.0000             11
5868      1.0             1.0000             11
5869      1.0             1.0000             11
5870      0.0             0.3441             11
5877      1.0             1.0000             11
5881      2.0             1.0000             11
5883      2.0             0.6725             11
5885      2.0             0.6640             11
5894      1.0             1.0000             11
5898      2.0             0.6675             11
8449      2.0             1.0000             11
10879     0.0             1.0000             11
18679     1.0             1.0000             11
No. of records with gender 0 in cluster 11 is 8
No. of records with gender 1 in cluster 11 is 9
No. of records with gender 2 in cluster 11 is 11

Records found in cluster 12 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
792       2.0             1.0000             12
7804      1.0             1.0000             12
7810      2.0             1.0000             12
7811      2.0             0.6341             12
7817      0.0             1.0000             12
7819      0.0             1.0000             12
7820      0.0             1.0000             12
7821      2.0             1.0000             12
7822      2.0             1.0000             12
7824      2.0             1.0000             12
7825      0.0             1.0000             12
7827      2.0             0.3472             12
7830      1.0             1.0000             12
7882      0.0             1.0000             12
7888      2.0             0.6506             12
7890      2.0             1.0000             12
7892      2.0             1.0000             12
7897      0.0             0.6803             12
7899      1.0             1.0000             12
8203      2.0             1.0000             12
8204      2.0             0.6746             12
8208      2.0             0.6844             12
8236      0.0             1.0000             12
8246      2.0             0.6598             12
8247      1.0             1.0000             12
8250      1.0             1.0000             12
8251      0.0             0.6624             12
8261      1.0             1.0000             12
8264      0.0             1.0000             12
8269      0.0             0.6774             12
8272      2.0             1.0000             12
8284      2.0             0.6691             12
8488      1.0             1.0000             12
13379     0.0             1.0000             12
No. of records with gender 0 in cluster 12 is 11
No. of records with gender 1 in cluster 12 is 7
No. of records with gender 2 in cluster 12 is 16

Records found in cluster 13 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1203      1.0             1.0000             13
1240      1.0             0.6889             13
2115      0.0             1.0000             13
2381      0.0             1.0000             13
3988      2.0             1.0000             13
5994      2.0             0.6611             13
7988      1.0             0.6734             13
8071      1.0             1.0000             13
10735     0.0             1.0000             13
10738     0.0             1.0000             13
11076     2.0             1.0000             13
11179     2.0             1.0000             13
11484     1.0             1.0000             13
11648     1.0             1.0000             13
11746     0.0             1.0000             13
12054     1.0             1.0000             13
13078     0.0             1.0000             13
14056     2.0             1.0000             13
15064     0.0             0.6534             13
15751     1.0             1.0000             13
15757     1.0             1.0000             13
16465     0.0             1.0000             13
16868     1.0             1.0000             13
17448     0.0             1.0000             13
18208     0.0             1.0000             13
18753     0.0             0.6678             13
No. of records with gender 0 in cluster 13 is 11
No. of records with gender 1 in cluster 13 is 10
No. of records with gender 2 in cluster 13 is 5

Records found in cluster 14 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
1273     0.0             1.0000             14
1605     2.0             1.0000             14
1761     2.0             1.0000             14
1845     1.0             1.0000             14
1987     1.0             1.0000             14
2274     0.0             1.0000             14
3961     0.0             1.0000             14
4092     0.0             0.3411             14
4424     2.0             1.0000             14
5218     2.0             1.0000             14
5336     1.0             1.0000             14
5445     0.0             1.0000             14
6262     2.0             1.0000             14
6289     1.0             1.0000             14
7003     1.0             1.0000             14
7118     2.0             1.0000             14
7431     1.0             1.0000             14
7540     0.0             0.6859             14
7791     1.0             1.0000             14
8142     2.0             1.0000             14
8601     2.0             0.6700             14
8693     0.0             1.0000             14
9023     1.0             0.6654             14
9265     1.0             1.0000             14
No. of records with gender 0 in cluster 14 is 7
No. of records with gender 1 in cluster 14 is 9
No. of records with gender 2 in cluster 14 is 8

Records found in cluster 15 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1474      1.0             0.3390             15
1582      1.0             1.0000             15
1940      2.0             0.6675             15
3133      0.0             1.0000             15
3252      0.0             1.0000             15
...       ...                ...            ...
14750     1.0             1.0000             15
15816     1.0             1.0000             15
17319     1.0             1.0000             15
17504     0.0             0.6567             15
18609     1.0             1.0000             15

[103 rows x 3 columns]
No. of records with gender 0 in cluster 15 is 34
No. of records with gender 1 in cluster 15 is 31
No. of records with gender 2 in cluster 15 is 38

Records found in cluster 16 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1646      0.0             0.6576             16
1868      0.0             1.0000             16
3803      0.0             1.0000             16
3962      2.0             1.0000             16
5400      0.0             1.0000             16
5401      2.0             0.6836             16
5407      2.0             0.6785             16
5408      2.0             1.0000             16
5409      0.0             1.0000             16
5412      2.0             1.0000             16
5427      1.0             1.0000             16
5429      0.0             1.0000             16
5433      2.0             0.6736             16
5434      1.0             1.0000             16
5436      2.0             0.6602             16
5442      1.0             0.3409             16
5443      2.0             0.6483             16
5447      1.0             1.0000             16
5448      2.0             0.6654             16
5449      1.0             1.0000             16
5456      0.0             1.0000             16
5457      2.0             0.6468             16
5466      2.0             1.0000             16
5470      2.0             1.0000             16
5471      0.0             1.0000             16
5472      0.0             1.0000             16
5480      1.0             1.0000             16
5485      2.0             1.0000             16
5486      1.0             1.0000             16
5487      1.0             0.6669             16
5490      2.0             1.0000             16
5491      2.0             1.0000             16
7364      1.0             1.0000             16
9547      2.0             1.0000             16
17851     2.0             0.6495             16
No. of records with gender 0 in cluster 16 is 9
No. of records with gender 1 in cluster 16 is 9
No. of records with gender 2 in cluster 16 is 17

Records found in cluster 17 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1673      1.0             1.0000             17
2702      1.0             1.0000             17
7600      0.0             1.0000             17
7601      2.0             0.6609             17
7611      0.0             0.6666             17
7613      2.0             1.0000             17
7614      2.0             0.6866             17
7615      2.0             1.0000             17
7620      1.0             0.6549             17
7621      1.0             1.0000             17
7622      2.0             1.0000             17
7626      0.0             1.0000             17
7627      0.0             0.7037             17
7629      2.0             1.0000             17
7652      0.0             0.6772             17
7655      1.0             1.0000             17
7662      0.0             1.0000             17
7665      2.0             0.6832             17
7667      0.0             1.0000             17
7669      2.0             1.0000             17
7670      1.0             1.0000             17
7672      2.0             1.0000             17
7679      1.0             1.0000             17
7680      1.0             1.0000             17
7681      1.0             1.0000             17
7686      2.0             1.0000             17
7694      2.0             1.0000             17
7697      1.0             1.0000             17
16509     1.0             1.0000             17
No. of records with gender 0 in cluster 17 is 7
No. of records with gender 1 in cluster 17 is 11
No. of records with gender 2 in cluster 17 is 11

Records found in cluster 18 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
2046     0.0             0.6531             18
3257     2.0             1.0000             18
7002     2.0             1.0000             18
7016     0.0             1.0000             18
7017     2.0             0.6646             18
7033     1.0             1.0000             18
7040     1.0             1.0000             18
7043     0.0             1.0000             18
7048     2.0             1.0000             18
7052     2.0             0.6595             18
7053     2.0             1.0000             18
7058     1.0             1.0000             18
7062     0.0             1.0000             18
7065     2.0             1.0000             18
7087     2.0             0.6671             18
7091     1.0             0.6642             18
7095     2.0             1.0000             18
7096     2.0             0.6782             18
7097     2.0             0.6788             18
8775     1.0             0.6609             18
No. of records with gender 0 in cluster 18 is 4
No. of records with gender 1 in cluster 18 is 5
No. of records with gender 2 in cluster 18 is 11

Records found in cluster 19 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
2135     2.0             1.0000             19
3978     1.0             1.0000             19
5034     0.0             1.0000             19
5208     0.0             1.0000             19
5364     2.0             1.0000             19
5513     0.0             1.0000             19
5677     1.0             1.0000             19
5817     0.0             1.0000             19
5929     1.0             1.0000             19
6085     0.0             1.0000             19
6257     2.0             0.6874             19
6679     1.0             1.0000             19
6819     2.0             0.6537             19
7029     0.0             1.0000             19
7121     0.0             1.0000             19
9044     1.0             1.0000             19
No. of records with gender 0 in cluster 19 is 7
No. of records with gender 1 in cluster 19 is 5
No. of records with gender 2 in cluster 19 is 4

Records found in cluster 20 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
2138      1.0             1.0000             20
2145      0.0             1.0000             20
2146      1.0             1.0000             20
2147      1.0             1.0000             20
2148      1.0             0.3576             20
2156      0.0             1.0000             20
2166      1.0             1.0000             20
2168      0.0             0.6825             20
2169      1.0             1.0000             20
2171      1.0             1.0000             20
2172      0.0             1.0000             20
2182      2.0             1.0000             20
2185      0.0             1.0000             20
2186      0.0             0.3403             20
2187      1.0             1.0000             20
2188      2.0             0.6812             20
2189      0.0             0.6582             20
2191      0.0             1.0000             20
2194      1.0             1.0000             20
2196      1.0             1.0000             20
2204      1.0             0.6587             20
2205      0.0             0.6685             20
2206      1.0             0.6551             20
2207      1.0             1.0000             20
2210      1.0             1.0000             20
2216      1.0             0.6896             20
2217      1.0             0.6832             20
2220      1.0             1.0000             20
2223      2.0             1.0000             20
14626     0.0             1.0000             20
No. of records with gender 0 in cluster 20 is 10
No. of records with gender 1 in cluster 20 is 17
No. of records with gender 2 in cluster 20 is 3

Records found in cluster 21 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
2240      0.0             1.0000             21
3269      1.0             0.3394             21
9001      1.0             1.0000             21
9020      0.0             1.0000             21
9028      1.0             0.6849             21
9033      0.0             1.0000             21
9038      1.0             0.6667             21
9043      1.0             1.0000             21
9046      2.0             0.6745             21
9050      1.0             0.6658             21
9052      2.0             0.6826             21
9054      1.0             1.0000             21
9055      1.0             1.0000             21
9056      2.0             1.0000             21
9061      0.0             1.0000             21
9064      2.0             1.0000             21
9069      2.0             0.6595             21
9070      0.0             1.0000             21
9072      1.0             0.6774             21
9076      2.0             1.0000             21
9079      0.0             1.0000             21
9080      1.0             0.6532             21
9081      0.0             1.0000             21
9082      0.0             1.0000             21
9083      0.0             1.0000             21
9089      1.0             1.0000             21
9952      2.0             0.3548             21
14813     1.0             0.6875             21
15564     0.0             1.0000             21
18157     1.0             1.0000             21
No. of records with gender 0 in cluster 21 is 10
No. of records with gender 1 in cluster 21 is 13
No. of records with gender 2 in cluster 21 is 7

Records found in cluster 22 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
2512      1.0             1.0000             22
8502      2.0             1.0000             22
8505      1.0             1.0000             22
8506      2.0             1.0000             22
8507      0.0             1.0000             22
8520      2.0             0.6820             22
8525      0.0             1.0000             22
8528      1.0             1.0000             22
8531      2.0             0.6681             22
8535      2.0             1.0000             22
8540      2.0             1.0000             22
8541      2.0             1.0000             22
8542      2.0             1.0000             22
8546      1.0             1.0000             22
8553      2.0             1.0000             22
8554      0.0             1.0000             22
8557      0.0             1.0000             22
8562      0.0             1.0000             22
8563      1.0             1.0000             22
8564      2.0             1.0000             22
8565      0.0             0.6862             22
8568      1.0             1.0000             22
8580      2.0             1.0000             22
8583      0.0             1.0000             22
8586      0.0             0.6453             22
13204     1.0             1.0000             22
16912     1.0             0.6483             22
16945     0.0             1.0000             22
No. of records with gender 0 in cluster 22 is 9
No. of records with gender 1 in cluster 22 is 8
No. of records with gender 2 in cluster 22 is 11

Records found in cluster 23 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
2730      1.0             1.0000             23
3086      0.0             1.0000             23
5506      2.0             0.6595             23
5511      1.0             1.0000             23
5524      0.0             0.6722             23
5541      0.0             1.0000             23
5542      2.0             1.0000             23
5544      1.0             0.3374             23
5546      1.0             1.0000             23
5552      2.0             1.0000             23
5558      2.0             1.0000             23
5559      2.0             0.6745             23
5560      1.0             1.0000             23
5561      0.0             1.0000             23
5563      2.0             1.0000             23
5564      2.0             1.0000             23
5566      1.0             0.6607             23
5570      2.0             1.0000             23
5572      1.0             1.0000             23
5579      1.0             1.0000             23
5583      2.0             1.0000             23
5588      0.0             0.6795             23
5597      0.0             1.0000             23
5598      0.0             1.0000             23
6067      1.0             1.0000             23
10803     0.0             1.0000             23
12037     0.0             1.0000             23
12202     0.0             1.0000             23
14307     2.0             0.6617             23
16093     0.0             0.3575             23
17031     1.0             1.0000             23
17498     1.0             1.0000             23
No. of records with gender 0 in cluster 23 is 11
No. of records with gender 1 in cluster 23 is 11
No. of records with gender 2 in cluster 23 is 10

Records found in cluster 24 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
2928      2.0             0.6734             24
7703      1.0             1.0000             24
7705      1.0             1.0000             24
7727      2.0             1.0000             24
7738      2.0             1.0000             24
7743      0.0             1.0000             24
7745      1.0             1.0000             24
7746      2.0             1.0000             24
7747      2.0             0.6745             24
7748      2.0             1.0000             24
7751      2.0             1.0000             24
7752      1.0             0.6649             24
7757      2.0             1.0000             24
7759      2.0             1.0000             24
7760      2.0             1.0000             24
7761      1.0             1.0000             24
7793      0.0             0.6691             24
7797      2.0             0.6600             24
10622     1.0             0.6692             24
No. of records with gender 0 in cluster 24 is 2
No. of records with gender 1 in cluster 24 is 6
No. of records with gender 2 in cluster 24 is 11

Records found in cluster 25 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
3581     0.0             1.0000             25
3705     2.0             0.6581             25
3809     2.0             1.0000             25
3906     1.0             0.6422             25
4041     0.0             1.0000             25
4156     1.0             1.0000             25
4272     2.0             1.0000             25
4341     0.0             1.0000             25
4410     2.0             1.0000             25
4508     1.0             1.0000             25
4631     2.0             1.0000             25
4736     2.0             1.0000             25
4840     2.0             1.0000             25
5305     1.0             1.0000             25
No. of records with gender 0 in cluster 25 is 3
No. of records with gender 1 in cluster 25 is 4
No. of records with gender 2 in cluster 25 is 7

Records found in cluster 26 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
3744     0.0             0.6440             26
3927     0.0             1.0000             26
3994     1.0             1.0000             26
4057     2.0             0.3516             26
4300     2.0             0.6736             26
4398     1.0             1.0000             26
4470     2.0             0.6602             26
4544     0.0             1.0000             26
4640     2.0             1.0000             26
4800     2.0             0.6575             26
4883     2.0             1.0000             26
5043     1.0             1.0000             26
5238     1.0             1.0000             26
5325     1.0             0.6645             26
5515     2.0             1.0000             26
5659     1.0             1.0000             26
5978     2.0             1.0000             26
6188     2.0             0.6748             26
6440     2.0             1.0000             26
6562     0.0             1.0000             26
6671     2.0             1.0000             26
6749     1.0             1.0000             26
6826     2.0             0.6933             26
7050     0.0             0.6736             26
No. of records with gender 0 in cluster 26 is 5
No. of records with gender 1 in cluster 26 is 7
No. of records with gender 2 in cluster 26 is 12

Records found in cluster 27 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
4093      2.0             1.0000             27
4485      2.0             1.0000             27
4893      2.0             1.0000             27
6095      2.0             0.6602             27
6412      2.0             1.0000             27
7079      2.0             1.0000             27
8501      0.0             1.0000             27
8968      2.0             1.0000             27
9965      0.0             1.0000             27
10058     2.0             1.0000             27
10070     1.0             1.0000             27
10084     0.0             1.0000             27
10092     1.0             1.0000             27
10102     2.0             1.0000             27
10116     2.0             1.0000             27
10131     0.0             1.0000             27
10143     2.0             1.0000             27
10167     1.0             0.3495             27
10256     0.0             1.0000             27
10658     1.0             1.0000             27
11280     0.0             1.0000             27
14155     0.0             1.0000             27
14888     1.0             1.0000             27
No. of records with gender 0 in cluster 27 is 7
No. of records with gender 1 in cluster 27 is 5
No. of records with gender 2 in cluster 27 is 11

Records found in cluster 28 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
4606      0.0             1.0000             28
4608      0.0             0.6618             28
4615      2.0             0.6590             28
4621      1.0             1.0000             28
4627      2.0             1.0000             28
4643      0.0             1.0000             28
4657      2.0             0.6751             28
4664      1.0             1.0000             28
4674      2.0             1.0000             28
4675      2.0             1.0000             28
4685      2.0             1.0000             28
4690      0.0             0.6763             28
4691      0.0             1.0000             28
4710      2.0             1.0000             28
4712      0.0             1.0000             28
4717      2.0             1.0000             28
4720      2.0             1.0000             28
4722      2.0             0.6686             28
4731      1.0             1.0000             28
4743      2.0             1.0000             28
4746      1.0             1.0000             28
4772      2.0             1.0000             28
4778      1.0             0.3592             28
4780      2.0             1.0000             28
4781      2.0             0.6475             28
4782      1.0             0.6697             28
4783      2.0             1.0000             28
4785      2.0             0.6811             28
4789      2.0             1.0000             28
4790      1.0             1.0000             28
4798      2.0             0.6736             28
4799      0.0             1.0000             28
6627      2.0             1.0000             28
6629      1.0             1.0000             28
6633      0.0             1.0000             28
6650      2.0             1.0000             28
6654      1.0             1.0000             28
6660      2.0             1.0000             28
6664      2.0             1.0000             28
6665      1.0             1.0000             28
6668      0.0             1.0000             28
6670      0.0             1.0000             28
6678      1.0             1.0000             28
6685      2.0             1.0000             28
6688      2.0             1.0000             28
11370     2.0             1.0000             28
No. of records with gender 0 in cluster 28 is 10
No. of records with gender 1 in cluster 28 is 11
No. of records with gender 2 in cluster 28 is 25

Records found in cluster 29 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
4906      2.0             0.6681             29
4908      0.0             1.0000             29
4909      2.0             1.0000             29
4910      0.0             1.0000             29
4912      1.0             1.0000             29
4917      1.0             0.6571             29
4918      0.0             1.0000             29
4923      2.0             1.0000             29
4924      2.0             0.6585             29
4929      1.0             1.0000             29
4934      1.0             0.6571             29
4937      2.0             1.0000             29
4944      1.0             0.6711             29
4949      2.0             1.0000             29
4950      1.0             1.0000             29
4951      1.0             1.0000             29
4961      0.0             1.0000             29
4962      1.0             1.0000             29
4965      2.0             0.6695             29
4967      0.0             1.0000             29
4968      1.0             1.0000             29
4970      0.0             1.0000             29
4973      1.0             1.0000             29
4990      1.0             1.0000             29
4997      2.0             0.6957             29
4999      2.0             0.6884             29
13476     2.0             0.6742             29
16183     0.0             1.0000             29
18336     1.0             1.0000             29
No. of records with gender 0 in cluster 29 is 7
No. of records with gender 1 in cluster 29 is 12
No. of records with gender 2 in cluster 29 is 10

Records found in cluster 30 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5002      1.0             1.0000             30
5007      0.0             1.0000             30
5014      1.0             1.0000             30
5017      0.0             1.0000             30
5021      1.0             1.0000             30
5030      0.0             1.0000             30
5049      1.0             0.6787             30
5065      1.0             1.0000             30
5069      2.0             0.6832             30
5075      2.0             0.6692             30
5084      2.0             1.0000             30
5086      0.0             1.0000             30
5088      2.0             1.0000             30
5090      2.0             1.0000             30
5094      1.0             1.0000             30
5095      2.0             0.6848             30
11521     2.0             0.6792             30
17014     2.0             1.0000             30
No. of records with gender 0 in cluster 30 is 4
No. of records with gender 1 in cluster 30 is 6
No. of records with gender 2 in cluster 30 is 8

Records found in cluster 31 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
5100     0.0             1.0000             31
5120     2.0             1.0000             31
5123     2.0             0.6741             31
5136     0.0             1.0000             31
5149     2.0             1.0000             31
5153     2.0             0.6735             31
5156     2.0             0.6516             31
5161     0.0             1.0000             31
5170     2.0             0.6606             31
5175     2.0             1.0000             31
5176     1.0             1.0000             31
5180     1.0             1.0000             31
5181     2.0             1.0000             31
5182     0.0             0.6801             31
5185     2.0             0.6822             31
5187     0.0             1.0000             31
5192     2.0             0.6835             31
No. of records with gender 0 in cluster 31 is 5
No. of records with gender 1 in cluster 31 is 2
No. of records with gender 2 in cluster 31 is 10

Records found in cluster 32 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5200      0.0             1.0000             32
5203      1.0             1.0000             32
5205      1.0             0.6748             32
5209      1.0             1.0000             32
5211      0.0             0.6738             32
5217      0.0             1.0000             32
5227      1.0             1.0000             32
5232      1.0             1.0000             32
5234      1.0             1.0000             32
5242      1.0             1.0000             32
5256      2.0             0.6475             32
5262      0.0             0.6457             32
5264      0.0             1.0000             32
5265      1.0             1.0000             32
5266      0.0             1.0000             32
5270      2.0             1.0000             32
5271      2.0             0.6812             32
5272      2.0             1.0000             32
5284      1.0             0.6815             32
5289      0.0             1.0000             32
5291      2.0             0.6333             32
5297      0.0             1.0000             32
10620     1.0             1.0000             32
13921     2.0             0.6771             32
18824     2.0             1.0000             32
No. of records with gender 0 in cluster 32 is 8
No. of records with gender 1 in cluster 32 is 10
No. of records with gender 2 in cluster 32 is 7

Records found in cluster 33 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5705      1.0             1.0000             33
5709      2.0             0.6860             33
5711      2.0             1.0000             33
5712      1.0             1.0000             33
5726      2.0             0.6735             33
5746      2.0             0.3410             33
5752      2.0             0.6747             33
5754      1.0             1.0000             33
5757      1.0             1.0000             33
5766      2.0             1.0000             33
5767      2.0             1.0000             33
5768      1.0             0.3631             33
5770      2.0             1.0000             33
5773      2.0             0.6769             33
5777      2.0             0.6638             33
5782      1.0             1.0000             33
5786      2.0             1.0000             33
5790      0.0             1.0000             33
5792      2.0             1.0000             33
5793      2.0             0.6675             33
5794      2.0             1.0000             33
5798      2.0             1.0000             33
10582     2.0             0.6383             33
11935     2.0             1.0000             33
15021     0.0             1.0000             33
16688     0.0             1.0000             33
No. of records with gender 0 in cluster 33 is 3
No. of records with gender 1 in cluster 33 is 6
No. of records with gender 2 in cluster 33 is 17

Records found in cluster 34 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5901      2.0             1.0000             34
5902      0.0             0.6462             34
5904      0.0             1.0000             34
5910      0.0             0.6787             34
5914      2.0             1.0000             34
5930      0.0             0.6512             34
5932      0.0             1.0000             34
5934      2.0             1.0000             34
5935      2.0             1.0000             34
5936      2.0             0.6836             34
5945      2.0             1.0000             34
5952      0.0             1.0000             34
5954      0.0             1.0000             34
5956      2.0             1.0000             34
5961      2.0             1.0000             34
5962      1.0             1.0000             34
5963      0.0             1.0000             34
5964      1.0             1.0000             34
5965      2.0             0.6764             34
5966      2.0             0.6842             34
5973      2.0             0.6509             34
5986      0.0             1.0000             34
5989      2.0             1.0000             34
5990      0.0             0.6713             34
16757     1.0             1.0000             34
No. of records with gender 0 in cluster 34 is 10
No. of records with gender 1 in cluster 34 is 3
No. of records with gender 2 in cluster 34 is 12

Records found in cluster 35 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
6101     1.0             0.6543             35
6102     0.0             0.6699             35
6103     0.0             1.0000             35
6109     0.0             1.0000             35
6129     2.0             0.6778             35
6131     0.0             1.0000             35
6133     0.0             0.6655             35
6134     0.0             1.0000             35
6147     2.0             0.6540             35
6149     0.0             1.0000             35
6151     2.0             0.6642             35
6156     2.0             1.0000             35
6158     1.0             1.0000             35
6164     1.0             1.0000             35
6167     2.0             0.6742             35
6169     2.0             0.6866             35
6178     1.0             1.0000             35
6180     1.0             1.0000             35
6190     0.0             1.0000             35
6192     2.0             0.6652             35
6197     1.0             0.6513             35
No. of records with gender 0 in cluster 35 is 8
No. of records with gender 1 in cluster 35 is 6
No. of records with gender 2 in cluster 35 is 7

Records found in cluster 36 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
6502      0.0             1.0000             36
6505      2.0             1.0000             36
6516      0.0             1.0000             36
6521      2.0             1.0000             36
6523      1.0             1.0000             36
6540      2.0             1.0000             36
6549      2.0             1.0000             36
6555      2.0             1.0000             36
6556      1.0             1.0000             36
6559      0.0             1.0000             36
6560      0.0             1.0000             36
6565      2.0             0.6534             36
6567      2.0             1.0000             36
6569      2.0             1.0000             36
6575      1.0             1.0000             36
6576      2.0             1.0000             36
6577      2.0             1.0000             36
6579      2.0             0.6762             36
6580      2.0             1.0000             36
6581      1.0             1.0000             36
6583      2.0             1.0000             36
6596      1.0             1.0000             36
6599      2.0             1.0000             36
12899     2.0             1.0000             36
No. of records with gender 0 in cluster 36 is 4
No. of records with gender 1 in cluster 36 is 5
No. of records with gender 2 in cluster 36 is 15

Records found in cluster 37 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
6722     1.0             1.0000             37
6726     0.0             1.0000             37
6728     2.0             0.6634             37
6730     2.0             0.6681             37
6732     1.0             0.6882             37
6742     2.0             0.6625             37
6758     0.0             0.3469             37
6759     1.0             0.6543             37
6772     2.0             1.0000             37
6786     2.0             0.6694             37
6787     2.0             1.0000             37
6788     2.0             1.0000             37
6789     2.0             1.0000             37
6793     1.0             0.6699             37
6795     2.0             0.6741             37
No. of records with gender 0 in cluster 37 is 2
No. of records with gender 1 in cluster 37 is 4
No. of records with gender 2 in cluster 37 is 9

Records found in cluster 38 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
7210     0.0             0.6617             38
7215     2.0             1.0000             38
7216     2.0             0.6921             38
7228     2.0             0.6766             38
7230     1.0             1.0000             38
7234     0.0             1.0000             38
7250     2.0             1.0000             38
7258     1.0             0.6902             38
7259     0.0             1.0000             38
7260     2.0             1.0000             38
7266     2.0             1.0000             38
7273     1.0             1.0000             38
7277     0.0             0.3487             38
7284     0.0             0.6661             38
7288     2.0             1.0000             38
7297     2.0             0.6853             38
No. of records with gender 0 in cluster 38 is 5
No. of records with gender 1 in cluster 38 is 3
No. of records with gender 2 in cluster 38 is 8

Records found in cluster 39 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
7289      0.0             1.0000             39
12796     1.0             1.0000             39
13303     1.0             1.0000             39
13417     1.0             1.0000             39
13502     1.0             1.0000             39
13716     1.0             0.6830             39
13901     2.0             0.6611             39
14140     0.0             0.6645             39
14214     2.0             1.0000             39
14269     2.0             0.6868             39
14337     1.0             1.0000             39
14412     1.0             1.0000             39
14483     0.0             1.0000             39
14645     1.0             1.0000             39
15443     2.0             1.0000             39
15534     0.0             1.0000             39
15807     0.0             1.0000             39
15916     1.0             1.0000             39
16188     1.0             1.0000             39
16418     2.0             1.0000             39
16672     1.0             1.0000             39
16725     1.0             1.0000             39
17269     0.0             1.0000             39
17351     1.0             0.6556             39
17442     1.0             1.0000             39
17842     0.0             1.0000             39
18412     2.0             0.6690             39
18510     1.0             1.0000             39
18731     1.0             1.0000             39
18738     2.0             1.0000             39
No. of records with gender 0 in cluster 39 is 7
No. of records with gender 1 in cluster 39 is 16
No. of records with gender 2 in cluster 39 is 7

Records found in cluster 40 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
7381      2.0             1.0000             40
7470      1.0             0.6810             40
7542      0.0             1.0000             40
7616      2.0             0.6675             40
7675      2.0             1.0000             40
...       ...                ...            ...
15207     1.0             1.0000             40
15391     2.0             1.0000             40
15439     2.0             1.0000             40
15622     2.0             1.0000             40
17122     2.0             0.6583             40

[98 rows x 3 columns]
No. of records with gender 0 in cluster 40 is 25
No. of records with gender 1 in cluster 40 is 38
No. of records with gender 2 in cluster 40 is 35

Records found in cluster 41 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
7416     2.0             1.0000             41
7417     1.0             1.0000             41
7418     2.0             1.0000             41
7421     2.0             0.6802             41
7429     2.0             1.0000             41
7430     2.0             0.6812             41
7434     2.0             1.0000             41
7440     0.0             1.0000             41
7441     1.0             1.0000             41
7442     2.0             1.0000             41
7448     0.0             1.0000             41
7458     2.0             1.0000             41
7459     2.0             1.0000             41
7496     2.0             0.6703             41
7497     0.0             0.6799             41
No. of records with gender 0 in cluster 41 is 3
No. of records with gender 1 in cluster 41 is 2
No. of records with gender 2 in cluster 41 is 10

Records found in cluster 42 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
7900     2.0             1.0000             42
7908     2.0             1.0000             42
7910     2.0             1.0000             42
7914     2.0             1.0000             42
7933     1.0             1.0000             42
7953     0.0             1.0000             42
7956     1.0             1.0000             42
7958     1.0             1.0000             42
7959     0.0             0.6823             42
7963     2.0             1.0000             42
7964     2.0             1.0000             42
7966     0.0             0.6607             42
7967     2.0             0.6737             42
7968     2.0             1.0000             42
7973     0.0             1.0000             42
7975     0.0             1.0000             42
7976     0.0             1.0000             42
7977     2.0             0.6739             42
7980     2.0             1.0000             42
7987     0.0             1.0000             42
7991     1.0             1.0000             42
7999     2.0             0.6726             42
No. of records with gender 0 in cluster 42 is 7
No. of records with gender 1 in cluster 42 is 4
No. of records with gender 2 in cluster 42 is 11

Records found in cluster 43 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
8024      2.0             1.0000             43
8033      0.0             0.6701             43
8039      1.0             1.0000             43
8046      2.0             1.0000             43
8050      2.0             1.0000             43
8052      0.0             0.7050             43
8055      0.0             1.0000             43
8057      1.0             1.0000             43
8058      2.0             1.0000             43
8059      1.0             1.0000             43
8062      1.0             1.0000             43
8063      1.0             1.0000             43
8065      1.0             0.6688             43
8067      2.0             0.3442             43
8068      1.0             1.0000             43
8070      1.0             0.6698             43
8078      0.0             1.0000             43
8081      2.0             1.0000             43
8085      0.0             1.0000             43
8097      0.0             1.0000             43
16604     1.0             1.0000             43
No. of records with gender 0 in cluster 43 is 6
No. of records with gender 1 in cluster 43 is 9
No. of records with gender 2 in cluster 43 is 6

Records found in cluster 44 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
8109     1.0             1.0000             44
8112     0.0             1.0000             44
8113     2.0             0.6675             44
8116     2.0             0.6611             44
8118     1.0             1.0000             44
8122     2.0             0.6623             44
8123     2.0             0.6605             44
8128     0.0             1.0000             44
8132     2.0             0.6665             44
8146     1.0             1.0000             44
8159     2.0             1.0000             44
8165     0.0             1.0000             44
8176     1.0             1.0000             44
8177     2.0             1.0000             44
8178     2.0             1.0000             44
8185     2.0             1.0000             44
8190     2.0             0.6735             44
8191     1.0             0.3568             44
8192     2.0             0.6726             44
8199     2.0             1.0000             44
No. of records with gender 0 in cluster 44 is 3
No. of records with gender 1 in cluster 44 is 5
No. of records with gender 2 in cluster 44 is 12

Records found in cluster 45 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
8313     0.0             1.0000             45
8322     1.0             1.0000             45
8327     0.0             0.6763             45
8331     2.0             0.6716             45
8333     2.0             1.0000             45
8337     1.0             1.0000             45
8338     0.0             1.0000             45
8339     0.0             1.0000             45
8340     2.0             0.6707             45
8341     1.0             0.6699             45
8353     2.0             0.6650             45
8356     1.0             0.6517             45
8358     2.0             0.6965             45
8384     0.0             1.0000             45
8385     1.0             1.0000             45
8391     0.0             1.0000             45
No. of records with gender 0 in cluster 45 is 6
No. of records with gender 1 in cluster 45 is 5
No. of records with gender 2 in cluster 45 is 5

Records found in cluster 46 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
8401     0.0             0.6732             46
8402     2.0             0.6767             46
8403     2.0             0.6575             46
8407     0.0             0.6763             46
8411     1.0             1.0000             46
8412     1.0             0.6900             46
8429     1.0             1.0000             46
8460     2.0             0.6828             46
8466     0.0             1.0000             46
8470     1.0             1.0000             46
8478     0.0             1.0000             46
8479     2.0             0.3625             46
8487     0.0             0.6806             46
8489     0.0             1.0000             46
8496     0.0             1.0000             46
No. of records with gender 0 in cluster 46 is 7
No. of records with gender 1 in cluster 46 is 4
No. of records with gender 2 in cluster 46 is 4

Records found in cluster 47 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
8607     2.0             0.6659             47
8613     2.0             1.0000             47
8616     2.0             1.0000             47
8617     2.0             0.6774             47
8619     0.0             0.6647             47
8620     2.0             0.6975             47
8622     0.0             0.6634             47
8623     2.0             0.6778             47
8624     1.0             1.0000             47
8627     2.0             0.6829             47
8632     2.0             1.0000             47
8638     0.0             1.0000             47
8642     2.0             0.6688             47
8645     2.0             0.6778             47
8647     2.0             1.0000             47
8675     2.0             1.0000             47
8676     1.0             0.6602             47
8677     0.0             0.6772             47
8679     2.0             1.0000             47
8680     2.0             1.0000             47
8681     0.0             0.6507             47
8688     2.0             0.3354             47
8690     2.0             1.0000             47
8691     2.0             0.3595             47
8694     2.0             0.6736             47
8699     0.0             1.0000             47
8749     0.0             0.6548             47
No. of records with gender 0 in cluster 47 is 7
No. of records with gender 1 in cluster 47 is 2
No. of records with gender 2 in cluster 47 is 18

Records found in cluster 48 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
8701      1.0             1.0000             48
8711      2.0             1.0000             48
8728      0.0             1.0000             48
8732      2.0             0.6946             48
8739      0.0             1.0000             48
8744      2.0             1.0000             48
8746      2.0             0.6916             48
8764      2.0             0.6674             48
8765      1.0             0.6611             48
8767      0.0             1.0000             48
8769      2.0             1.0000             48
8772      0.0             0.6732             48
8777      0.0             1.0000             48
8779      2.0             1.0000             48
8782      1.0             1.0000             48
8783      2.0             1.0000             48
8784      2.0             1.0000             48
9648      0.0             1.0000             48
10111     2.0             1.0000             48
10551     2.0             0.6362             48
10903     1.0             1.0000             48
11265     1.0             1.0000             48
11650     0.0             1.0000             48
12295     0.0             1.0000             48
12731     2.0             1.0000             48
15770     0.0             0.6808             48
16201     2.0             1.0000             48
No. of records with gender 0 in cluster 48 is 9
No. of records with gender 1 in cluster 48 is 5
No. of records with gender 2 in cluster 48 is 13

Records found in cluster 49 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
8804      2.0             0.6561             49
8834      2.0             1.0000             49
8843      0.0             0.3571             49
8844      2.0             1.0000             49
8849      0.0             0.6906             49
8852      0.0             1.0000             49
8854      0.0             1.0000             49
8855      1.0             0.6440             49
8859      2.0             1.0000             49
8864      0.0             0.3421             49
8865      1.0             1.0000             49
8873      0.0             1.0000             49
8874      1.0             1.0000             49
8878      2.0             0.6640             49
8881      0.0             1.0000             49
8884      1.0             0.6612             49
8886      2.0             0.3536             49
17100     1.0             1.0000             49
No. of records with gender 0 in cluster 49 is 7
No. of records with gender 1 in cluster 49 is 5
No. of records with gender 2 in cluster 49 is 6

Records found in cluster 50 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
9105     2.0             0.6468             50
9109     0.0             0.6553             50
9112     1.0             1.0000             50
9113     0.0             1.0000             50
9115     2.0             0.6771             50
9118     2.0             0.6712             50
9123     2.0             1.0000             50
9125     2.0             1.0000             50
9130     2.0             0.6741             50
9136     2.0             1.0000             50
9144     2.0             1.0000             50
9150     1.0             1.0000             50
9151     1.0             0.6453             50
9152     0.0             1.0000             50
9165     0.0             1.0000             50
9166     2.0             1.0000             50
9178     2.0             0.6698             50
9190     1.0             1.0000             50
9194     2.0             1.0000             50
9195     1.0             1.0000             50
No. of records with gender 0 in cluster 50 is 4
No. of records with gender 1 in cluster 50 is 5
No. of records with gender 2 in cluster 50 is 11

Records found in cluster 51 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
9206      2.0             0.3398             51
9207      2.0             1.0000             51
9212      0.0             1.0000             51
9215      1.0             0.6818             51
9216      2.0             0.6519             51
9217      2.0             0.3376             51
9220      2.0             1.0000             51
9221      2.0             1.0000             51
9225      2.0             1.0000             51
9228      0.0             1.0000             51
9243      0.0             0.3506             51
9249      1.0             0.3542             51
9253      2.0             1.0000             51
9278      1.0             1.0000             51
9280      1.0             1.0000             51
9283      2.0             0.6659             51
9289      2.0             1.0000             51
9293      0.0             1.0000             51
9294      0.0             1.0000             51
11308     2.0             0.6412             51
No. of records with gender 0 in cluster 51 is 5
No. of records with gender 1 in cluster 51 is 4
No. of records with gender 2 in cluster 51 is 11

Records found in cluster 52 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
9515      0.0             0.6648             52
10396     1.0             1.0000             52
10608     1.0             1.0000             52
10796     0.0             0.6912             52
10981     0.0             1.0000             52
11477     2.0             1.0000             52
11770     2.0             1.0000             52
12451     2.0             1.0000             52
12803     1.0             0.6667             52
12996     1.0             1.0000             52
13263     2.0             0.6743             52
13436     0.0             1.0000             52
14141     0.0             1.0000             52
14290     0.0             1.0000             52
14473     0.0             1.0000             52
14878     2.0             0.6502             52
15088     0.0             0.6581             52
15727     2.0             1.0000             52
16605     0.0             0.6578             52
16973     0.0             1.0000             52
17197     1.0             1.0000             52
17330     0.0             1.0000             52
17728     1.0             0.6702             52
18071     2.0             1.0000             52
18531     2.0             1.0000             52
No. of records with gender 0 in cluster 52 is 11
No. of records with gender 1 in cluster 52 is 6
No. of records with gender 2 in cluster 52 is 8

Records found in cluster 53 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
9856      2.0             1.0000             53
10150     1.0             1.0000             53
10237     2.0             1.0000             53
10471     1.0             1.0000             53
10633     2.0             0.6545             53
10849     2.0             1.0000             53
10964     2.0             1.0000             53
11050     0.0             1.0000             53
11251     1.0             0.6715             53
11356     2.0             1.0000             53
11429     2.0             1.0000             53
11653     0.0             1.0000             53
11767     2.0             1.0000             53
11842     1.0             1.0000             53
11930     1.0             1.0000             53
12045     1.0             1.0000             53
12284     1.0             1.0000             53
12397     0.0             1.0000             53
12507     2.0             1.0000             53
12659     2.0             1.0000             53
12754     2.0             0.6615             53
No. of records with gender 0 in cluster 53 is 3
No. of records with gender 1 in cluster 53 is 7
No. of records with gender 2 in cluster 53 is 11

Records found in cluster 54 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
10812     1.0             0.6827             54
12073     1.0             1.0000             54
13106     1.0             0.6574             54
14855     2.0             1.0000             54
15950     2.0             1.0000             54
16388     2.0             1.0000             54
16854     2.0             1.0000             54
17041     1.0             1.0000             54
17154     1.0             1.0000             54
17297     0.0             1.0000             54
17565     1.0             1.0000             54
17677     1.0             1.0000             54
17868     2.0             0.3354             54
18092     0.0             1.0000             54
18246     1.0             1.0000             54
18302     1.0             1.0000             54
18399     0.0             1.0000             54
18527     1.0             1.0000             54
18646     0.0             1.0000             54
18759     0.0             0.6386             54
No. of records with gender 0 in cluster 54 is 5
No. of records with gender 1 in cluster 54 is 10
No. of records with gender 2 in cluster 54 is 5
Records classified as noise
       gender  gender:confidence  Cluster_Label
599       1.0             1.0000             -1
635       1.0             1.0000             -1
1268      2.0             1.0000             -1
1367      1.0             1.0000             -1
1544      0.0             1.0000             -1
2154      1.0             0.6561             -1
2243      2.0             1.0000             -1
2382      1.0             1.0000             -1
2682      1.0             0.6473             -1
2897      2.0             1.0000             -1
3341      1.0             1.0000             -1
3360      1.0             1.0000             -1
3526      1.0             1.0000             -1
3938      2.0             0.6545             -1
4051      2.0             1.0000             -1
4650      2.0             0.3571             -1
5424      0.0             1.0000             -1
5548      2.0             1.0000             -1
6140      2.0             0.6679             -1
6313      1.0             1.0000             -1
6616      1.0             1.0000             -1
6620      2.0             1.0000             -1
7107      2.0             0.6865             -1
7610      2.0             0.6578             -1
7651      0.0             0.6637             -1
8509      2.0             0.6731             -1
8579      2.0             1.0000             -1
8798      1.0             1.0000             -1
8836      0.0             0.6645             -1
9305      2.0             0.6606             -1
11119     1.0             1.0000             -1
11727     2.0             1.0000             -1
12333     1.0             1.0000             -1
12992     0.0             1.0000             -1
13486     2.0             1.0000             -1
14046     0.0             1.0000             -1
14958     2.0             1.0000             -1
15597     1.0             0.3362             -1
16706     0.0             1.0000             -1
17186     1.0             1.0000             -1
17599     0.0             0.6654             -1
18270     0.0             1.0000             -1

---- VISUALIZE THE METRIC EVALUATION ----
No description has been provided for this image
No description has been provided for this image

REGRESSION¶

In [4]:
# =============================== REGRESSION ======================================
print()
print()
df_preprocessed_reg = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed_reg = df_preprocessed_reg.drop(['gender', "gender:confidence"], axis=1)

print()
print("=" * 50)
print('Boosted Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed_reg, y, test_size=0.6, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model
boosted_reg.fit(X_train, y_train)

# Make predictions
y_pred = boosted_reg.predict(X_test)
y_pred_train = boosted_reg.predict(X_train)
y_tot_pred = boosted_reg.predict(df_preprocessed_reg)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)

print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()

# FEATURE IMPORTANCE
print()
print("Performing feature importance analysis...")
# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('text_')]
# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]
# Output the results
# print("desc_ column indices:", desc_columns)
# print("text_ column indices:", text_columns)
# print("desc_ array:\n", desc_array)
# print("text_ array:\n", text_array)
# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])
# Create a new DataFrame
new_data = {}
# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
boosted_reg.feature_importances_
# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed_reg.columns)) if i not in desc_columns and i not in text_columns]
for i in other_columns:
    col_name = df_preprocessed_reg.columns[i]
    new_data[col_name] = [boosted_reg.feature_importances_[i]]
# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)
# Output the results
print(feature_importance)

# Plot feature importance
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')
df_melted = df_melted.sort_values(ascending=False, by="Importance in percentage")
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()


# preprocess dataset for plots with regression results
df_preprocessed_diff = df_preprocessed_reg.copy()
df_preprocessed_diff['difference'] = (y.to_numpy() - y_tot_pred)
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset

# filtering out coloumns that might be false mistaken
misclassified_df_reg = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
misclassified_df = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]

# plotting these columns

def scatterplot_mistaken_points(misclassified_df, X_train, model):
    # Edit misclassified_df to include 'in X_train'
    misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
    # Create subsets for the two plots
    df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
    df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
    # Set up the matplotlib figure with subplots
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    # Set the main title
    fig.suptitle(f'{model}\nGender Confidence of "Mistaken" Records', fontsize=16)
    # Plot 1: Points in X_train
    sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
    axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
                 [df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
    axes[0].set_xlabel('Dataset')
    axes[0].set_ylabel('Predicted')
    axes[0].set_title(f'Training Set\nSample Size: {len(df_in_X_train)}')
    # Plot 2: Points not in X_train
    sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
    axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
                 [df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
    axes[1].set_xlabel('Dataset')
    axes[1].set_ylabel('Predicted')
    axes[1].set_title(f'Not Training Set\nSample Size: {len(df_not_in_X_train)}')
    plt.tight_layout()
    plt.show()

def scatter_plot(y, y_tot_pred, model):
    # Plotting more results results
    plt.figure(figsize=(10, 8))
    plt.scatter(y, y_tot_pred, alpha=0.5)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
    plt.xlabel('Dataset', fontsize=12)
    plt.ylabel('Predicted', fontsize=12)
    plt.suptitle(model, fontsize=16)
    plt.title('Gender Confidence Comparison', fontsize=14)
    plt.show()

scatterplot_mistaken_points(misclassified_df, X_train, "Boosted Regression Tree with Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree with Vectorised Text/Desc Features")

# ==============================analyze without text features=============================================
columns_to_drop = [col for col in df_preprocessed_reg.columns if col.startswith(('desc_', 'text_'))]
df_preprocessed_non_text = df_preprocessed_reg.drop(columns=columns_to_drop)
df_preprocessed_non_text2 = df_preprocessed_non_text.copy()
print(df_preprocessed_non_text)

print()
print("=" * 50)
print('Boosted Regression Tree without Vectorised Text/Desc Features')
print("=" * 50)

boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.6, random_state=42)
# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)
y_pred_train = boosted_reg_non_text.predict(X_train_non_text)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test_non_text, y_pred)
mse_train = mean_squared_error(y_train_non_text, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)

print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()

# Get feature importances and plot from the model
print()
print("Performing feature importance analysis...")
feature_importances = boosted_reg_non_text.feature_importances_
column_names = X_train_non_text.columns
feature_importance_df = pd.DataFrame({
    'Feature': column_names,
    'Importance in percentage': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()

# adding the dataset gender confidence
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset

# Inspecting coulumns that could be suspicous
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred
misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]
scatterplot_mistaken_points(misclassified_df, X_train_non_text, "Boosted Regression Tree without Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree without Vectorised Text/Desc Features")

# ====================================Analyzing with a linear regression (Least Squares Implementation)====================

print()
print("=" * 50)
print('Linear Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)

X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed_reg)
model = sm.OLS(y_train, X_train_lin)  # Ordinary least squares (unregularized)
results = model.fit()

# run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)

print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Linear Regression Tree with Vectorised Textual Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()

# final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred


# identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]

misclassified_df_lin_reg = misclassified_df.copy()

scatter_plot(y, y_lin_tot_pred, "Linear Regression Tree with Vectorised Text/Desc Features")
scatterplot_mistaken_points(misclassified_df, X_train_lin, "Linear Regression Tree with Vectorised Text/Desc Features")

#================================Lin reg without text=======================================================
#================================Linear regression without text features============================
print()
print("=" * 50)
print('Linear Regression Tree without Vectorised Text/Desc Features')
print("=" * 50)

X_train_lin = sm.add_constant(X_train_non_text)
X_test_lin = sm.add_constant(X_test_non_text)
df_preprocessed_lin = sm.add_constant(df_preprocessed_non_text2)
model = sm.OLS(y_train, X_train_lin)  # Ordinary least squares (unregularized)
results = model.fit()

#run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)

print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Linear Regression Tree without Vectorised Textual Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()

#final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred


#identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]

scatter_plot(y, y_lin_tot_pred, "Linear Regression Tree without Vectorised Text/Desc Features")
scatterplot_mistaken_points(misclassified_df, X_train_lin, "Linear Regression Tree without Vectorised Text/Desc Features")



# ================================Identity final mistaken samples====================================
common_samples = misclassified_df_reg.index.intersection(misclassified_df.index)
common_df = misclassified_df.loc[common_samples]

scatterplot_mistaken_points(common_df, X_train_lin, "Boosted and Linear Regression Trees (Intersection) with Vectorised Text/Desc Features")


==================================================
Boosted Regression Tree with Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0266
Mean Squared Error (Test): 0.0290
Mean Squared Error (Total): 0.0280
No description has been provided for this image
Performing feature importance analysis...
       desc      text  favorites_per_day  retweets_per_day  tweets_per_day  \
0  0.308771  0.364314           0.021232               0.0        0.121167   

   profile_created_year  tweet_created_year    link_R    link_G    link_B  \
0              0.155415                 0.0  0.000336  0.011339  0.000434   

   sidebar_R  sidebar_G  sidebar_B  
0   0.005375   0.006886    0.00473  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
       favorites_per_day  retweets_per_day  tweets_per_day  \
0               0.000000          0.000000       28.149163   
1               0.015554          0.000000        1.708829   
2               2.147321          0.000279        1.567243   
3               0.036207          0.000000        0.303459   
4               9.794751          0.000000        8.257743   
...                  ...               ...             ...   
18831           0.090609          0.000000        0.234923   
18832           0.568809          0.000000        3.060887   
18833           0.011364          0.000000        6.004318   
18834          16.333103          0.000000       12.934948   
18835           0.878510          0.000000        0.766728   

       profile_created_year  tweet_created_year  link_R  link_G  link_B  \
0                      2013                2015       8     194     194   
1                      2012                2015       0     132     180   
2                      2014                2015     171     184     194   
3                      2009                2015       0     132     180   
4                      2014                2015      59     148     217   
...                     ...                 ...     ...     ...     ...   
18831                  2015                2015       0     132     180   
18832                  2012                2015     207     185      41   
18833                  2012                2015       0     132     180   
18834                  2012                2015     146     102     204   
18835                  2014                2015       0     132     180   

       sidebar_R  sidebar_G  sidebar_B  
0            255        255        255  
1            192        222        237  
2            192        222        237  
3            192        222        237  
4              0          0          0  
...          ...        ...        ...  
18831        192        222        237  
18832          0          0          0  
18833        192        222        237  
18834          0          0          0  
18835        192        222        237  

[18836 rows x 11 columns]

==================================================
Boosted Regression Tree without Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0275
Mean Squared Error (Test): 0.0292
Mean Squared Error (Total): 0.0280
No description has been provided for this image
Performing feature importance analysis...
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
==================================================
Linear Regression Tree with Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0166
Mean Squared Error (Test): 0.0499
Mean Squared Error (Total): 0.0366
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
==================================================
Linear Regression Tree without Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0292
Mean Squared Error (Test): 0.0305
Mean Squared Error (Total): 0.0300
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

CLASSIFICATION¶

In [5]:
# ============================== CLASSIFICATION ==============================

print()
print()
print('---- CLASSIFICATION ----')
# Features and target
X = df_preprocessed.drop(columns=['gender'])  # Assuming 'gender' is the target variable
y = df_preprocessed['gender']

# Standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the performance
print("Accuracy Score: ", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print("\nXGBoost Classifier Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))

# Initialize LightGBM classifier
lgb_clf = lgb.LGBMClassifier(n_estimators=100, random_state=42)

# Fit the model
lgb_clf.fit(X_train, y_train)

# Predict
y_pred_lgb = lgb_clf.predict(X_test)

# Evaluation
print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb))

# Helper function to plot confusion matrix
def plot_confusion_matrix(y_test, y_pred, model_name):
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'{model_name} Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()


# Helper function to extract and display classification report with model name
def get_classification_report(y_test, y_pred, model_name):
    report = classification_report(y_test, y_pred, output_dict=True)
    df = pd.DataFrame(report).transpose()
    df['model'] = model_name
    return df

# Random Forest Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_rf, "Random Forest")
rf_report = get_classification_report(y_test, y_pred_rf, "Random Forest")

# XGBoost Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_xgb, "XGBoost")
xgb_report = get_classification_report(y_test, y_pred_xgb, "XGBoost")

# LightGBM Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_lgb, "LightGBM")
lgb_report = get_classification_report(y_test, y_pred_lgb, "LightGBM")

# Combine all reports
combined_report = pd.concat([rf_report, xgb_report, lgb_report])

# Debugging Step: Check the combined report structure
print("Combined Classification Report:\n", combined_report.head())

# Filter out rows for precision, recall, and f1-score
combined_report_filtered = combined_report[
    combined_report.index.isin(['0', '1'])  # Filter for the classes
].reset_index()

# Debugging Step: Check the filtered report structure
print("Filtered Report for Precision, Recall, and F1-Score:\n", combined_report_filtered.head())

# Plot Precision, Recall, and F1-Score for each model
metrics = ['precision', 'recall', 'f1-score']

for metric in metrics:
    # Debugging Step: Filter for specific metric
    print(f"Data for {metric}:")
    print(combined_report_filtered[['index', metric, 'model']])

    plt.figure(figsize=(10, 6))
    sns.barplot(
        x="index",
        y=metric,
        hue="model",
        data=combined_report_filtered[['index', metric, 'model']]
    )
    plt.title(f'{metric.capitalize()} Comparison')
    plt.ylabel(metric.capitalize())
    plt.xlabel('Class (0 = Human, 1 = Non-Human)')
    plt.show()

# Accuracy comparison
accuracies = {
    'Random Forest': accuracy_score(y_test, y_pred_rf),
    'XGBoost': accuracy_score(y_test, y_pred_xgb),
    'LightGBM': accuracy_score(y_test, y_pred_lgb)
}

plt.figure(figsize=(6, 4))
plt.bar(accuracies.keys(), accuracies.values(), color=['blue', 'green', 'red'])
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.show()

---- CLASSIFICATION ----
Accuracy Score:  0.6242038216560509
Confusion Matrix:
 [[661 470 136]
 [284 932 102]
 [250 174 759]]
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.52      0.54      1267
           1       0.59      0.71      0.64      1318
           2       0.76      0.64      0.70      1183

    accuracy                           0.62      3768
   macro avg       0.64      0.62      0.63      3768
weighted avg       0.63      0.62      0.62      3768


XGBoost Classifier Report:
              precision    recall  f1-score   support

           0       0.56      0.54      0.55      1267
           1       0.61      0.65      0.63      1318
           2       0.72      0.67      0.69      1183

    accuracy                           0.62      3768
   macro avg       0.63      0.62      0.62      3768
weighted avg       0.62      0.62      0.62      3768

Accuracy: 0.6220806794055201
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36890
[LightGBM] [Info] Number of data points in the train set: 15068, number of used features: 1766
[LightGBM] [Info] Start training from score -1.117843
[LightGBM] [Info] Start training from score -1.029513
[LightGBM] [Info] Start training from score -1.152536
LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.55      0.56      1267
           1       0.61      0.65      0.63      1318
           2       0.72      0.69      0.70      1183

    accuracy                           0.63      3768
   macro avg       0.63      0.63      0.63      3768
weighted avg       0.63      0.63      0.63      3768

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Combined Classification Report:
            precision    recall  f1-score      support          model
0           0.553138  0.521705  0.536962  1267.000000  Random Forest
1           0.591371  0.707132  0.644091  1318.000000  Random Forest
2           0.761284  0.641589  0.696330  1183.000000  Random Forest
accuracy    0.624204  0.624204  0.624204     0.624204  Random Forest
macro avg   0.635264  0.623475  0.625794  3768.000000  Random Forest
Filtered Report for Precision, Recall, and F1-Score:
   index  precision    recall  f1-score  support          model
0     0   0.553138  0.521705  0.536962   1267.0  Random Forest
1     1   0.591371  0.707132  0.644091   1318.0  Random Forest
2     0   0.556275  0.542226  0.549161   1267.0        XGBoost
3     1   0.605356  0.651745  0.627695   1318.0        XGBoost
4     0   0.573061  0.554065  0.563403   1267.0       LightGBM
Data for precision:
  index  precision          model
0     0   0.553138  Random Forest
1     1   0.591371  Random Forest
2     0   0.556275        XGBoost
3     1   0.605356        XGBoost
4     0   0.573061       LightGBM
5     1   0.609497       LightGBM
No description has been provided for this image
Data for recall:
  index    recall          model
0     0  0.521705  Random Forest
1     1  0.707132  Random Forest
2     0  0.542226        XGBoost
3     1  0.651745        XGBoost
4     0  0.554065       LightGBM
5     1  0.652504       LightGBM
No description has been provided for this image
Data for f1-score:
  index  f1-score          model
0     0  0.536962  Random Forest
1     1  0.644091  Random Forest
2     0  0.549161        XGBoost
3     1  0.627695        XGBoost
4     0  0.563403       LightGBM
5     1  0.630267       LightGBM
No description has been provided for this image
No description has been provided for this image

ASSOCIATION RULES¶

In [6]:
# ============================== ASSOCIATION RULES ==============================
print()
print()
print('---- ASSOCIATION RULES ----')
# Binarize numeric columns
df_asso['high_favorites'] = df_asso['favorites_per_day'] > df_asso['favorites_per_day'].mean()
df_asso['high_retweets'] = df_asso['retweets_per_day'] > df_asso['retweets_per_day'].mean()
df_asso['high_tweets'] = df_asso['tweets_per_day'] > df_asso['tweets_per_day'].mean()

# Binarize year columns (profile_created_year and tweet_created_year)
# Example: Set threshold year as 2015
df_asso['profile_recent'] = df_asso['profile_created_year'] >= 2015
df_asso['tweet_recent'] = df_asso['tweet_created_year'] >= 2015

# Select only the binary columns
df_apriori = df_asso[['high_favorites', 'high_retweets', 'high_tweets',
                              'profile_recent', 'tweet_recent',
                              'tweet_location_encoded', 'user_timezone_encoded']]

# Convert all columns to int (0 or 1)
df_apriori = df_apriori.astype(int)

# Apply Apriori
frequent_itemsets = apriori(df_apriori, min_support=0.05, use_colnames=True)

# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display the rules
print(rules)


top_frequent_itemsets = frequent_itemsets.nlargest(10, 'support')

plt.figure(figsize=(10, 6))
sns.barplot(x='support', y='itemsets', data=top_frequent_itemsets)
plt.title('Top 10 Frequent Itemsets by Support')
plt.xlabel('Support')
plt.ylabel('Itemsets')
plt.show()

# ---------------------------
# Visualization 2: Scatter Plot of Association Rules by Confidence and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.scatterplot(x='confidence', y='lift', size='support', data=rules, hue='antecedents', palette='viridis', sizes=(40, 200))
plt.title('Association Rules: Confidence vs Lift')
plt.xlabel('Confidence')
plt.ylabel('Lift')
plt.legend(title='Antecedents', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# ---------------------------
# Visualization 3: Heatmap of Support, Confidence, and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.heatmap(rules[['support', 'confidence', 'lift']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between Support, Confidence, and Lift')
plt.show()

---- ASSOCIATION RULES ----
                       antecedents                     consequents  \
0                    (high_tweets)                (high_favorites)   
1                 (high_favorites)                   (high_tweets)   
2                   (tweet_recent)                (high_favorites)   
3                 (high_favorites)                  (tweet_recent)   
4                   (tweet_recent)                   (high_tweets)   
5                    (high_tweets)                  (tweet_recent)   
6                   (tweet_recent)                (profile_recent)   
7                 (profile_recent)                  (tweet_recent)   
8      (tweet_recent, high_tweets)                (high_favorites)   
9    (high_favorites, high_tweets)                  (tweet_recent)   
10  (tweet_recent, high_favorites)                   (high_tweets)   
11                   (high_tweets)  (tweet_recent, high_favorites)   
12                  (tweet_recent)   (high_favorites, high_tweets)   
13                (high_favorites)     (tweet_recent, high_tweets)   

    antecedent support  consequent support   support  confidence     lift  \
0             0.271767            0.210607  0.066097    0.243212  1.15481   
1             0.210607            0.271767  0.066097    0.313839  1.15481   
2             1.000000            0.210607  0.210607    0.210607  1.00000   
3             0.210607            1.000000  0.210607    1.000000  1.00000   
4             1.000000            0.271767  0.271767    0.271767  1.00000   
5             0.271767            1.000000  0.271767    1.000000  1.00000   
6             1.000000            0.175568  0.175568    0.175568  1.00000   
7             0.175568            1.000000  0.175568    1.000000  1.00000   
8             0.271767            0.210607  0.066097    0.243212  1.15481   
9             0.066097            1.000000  0.066097    1.000000  1.00000   
10            0.210607            0.271767  0.066097    0.313839  1.15481   
11            0.271767            0.210607  0.066097    0.243212  1.15481   
12            1.000000            0.066097  0.066097    0.066097  1.00000   
13            0.210607            0.271767  0.066097    0.313839  1.15481   

    leverage  conviction  zhangs_metric  
0   0.008861    1.043082       0.184085  
1   0.008861    1.061316       0.169823  
2   0.000000    1.000000       0.000000  
3   0.000000         inf       0.000000  
4   0.000000    1.000000       0.000000  
5   0.000000         inf       0.000000  
6   0.000000    1.000000       0.000000  
7   0.000000         inf       0.000000  
8   0.008861    1.043082       0.184085  
9   0.000000         inf       0.000000  
10  0.008861    1.061316       0.169823  
11  0.008861    1.043082       0.184085  
12  0.000000    1.000000       0.000000  
13  0.008861    1.061316       0.169823  
C:\Users\Owner\uowMaster\subject\946\venv_bda\lib\site-packages\mlxtend\frequent_patterns\fpcommon.py:109: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type
  warnings.warn(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

AMENDMENT¶

In [9]:
# ============================== AMENDMENT ==============================
print()
print()
print('---- AMENDMENT ----')

mistaken_index = misclassified_df_reg.index.union(misclassified_df_lin_reg.index)
df_truth = df_preprocessed.copy()
df_mistaken = df_preprocessed.loc[mistaken_index].copy()
df_amended = df_mistaken.copy()
vectorized_features = [col for col in df_truth.columns if col.startswith('desc_') or col.startswith('text_')]
df_truth_vectors = df_truth[vectorized_features]
df_mistaken_vectors = df_mistaken[vectorized_features]

similarities = cosine_similarity(df_mistaken_vectors, df_truth_vectors)
best_matches_indices = similarities.argmax(axis=1)
df_amended['gender'] = df_truth.loc[best_matches_indices, 'gender'].values

## Comparative Analysis

# Calculate the number of changes made
num_changes = (df_amended['gender'] != df_mistaken['gender']).sum()

# Calculate the percentage of records amended
percent_amended = (num_changes / len(df_amended)) * 100

## Impact on Statistics

# Function to calculate gender distribution
def gender_distribution(df):
    return df['gender'].value_counts(normalize=True) * 100

# Calculate gender distributions
original_dist = gender_distribution(df_mistaken)
amended_dist = gender_distribution(df_amended)

# Calculate the difference in distributions
dist_difference = amended_dist - original_dist

## Summary Report

print("Amendment Summary Report")
print("=======================")
print(f"Total records processed: {len(df_amended)}")
print(f"Number of records amended: {num_changes}")
print(f"Percentage of records amended: {percent_amended:.2f}%")
print("\nGender Distribution (%):")
print("------------------------")
print("Category    Mistaken    Amended")
for category in original_dist.index:
    print(f"{category:<12} {original_dist.get(category, 0):.2f}        {amended_dist.get(category, 0):.2f}")

print("\nDistribution Changes:")
print("---------------------")
for category in dist_difference.index:
    print(f"{category}: {dist_difference[category]:+.2f}%")

## Create a figure with subplots
fig, axs = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle("Comparative Analysis of Gender Amendment", fontsize=20)

## 1. Bar plot: Gender Distribution Comparison
axs[0, 0].bar(original_dist.index, original_dist.values, alpha=0.5, label='Original')
axs[0, 0].bar(amended_dist.index, amended_dist.values, alpha=0.5, label='Amended')
axs[0, 0].set_title("Gender Distribution Comparison")
axs[0, 0].set_ylabel("Percentage")
axs[0, 0].legend()

## 2. Pie charts: Before and After Amendment
def plot_pie(ax, data, title):
    ax.pie(data.values, labels=data.index, autopct='%1.1f%%', startangle=90)
    ax.set_title(title)

plot_pie(axs[0, 1], original_dist, "Gender Distribution Before Amendment")
plot_pie(axs[1, 0], amended_dist, "Gender Distribution After Amendment")

## 3. Heatmap: Confusion Matrix
cm = confusion_matrix(df_mistaken['gender'], df_amended['gender'], labels=df_mistaken['gender'].unique())
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=df_mistaken['gender'].unique(), 
            yticklabels=df_mistaken['gender'].unique(), ax=axs[1, 1])
axs[1, 1].set_title("Confusion Matrix: After vs Before")
axs[1, 1].set_xlabel("After")
axs[1, 1].set_ylabel("Before")

## Adjust layout and save
plt.tight_layout()
plt.savefig('gender_amendment_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

print("Visualizations have been saved as 'gender_amendment_analysis.png'")

---- AMENDMENT ----
Amendment Summary Report
=======================
Total records processed: 3682
Number of records amended: 92
Percentage of records amended: 2.50%

Gender Distribution (%):
------------------------
Category    Mistaken    Amended
1            35.99        34.95
2            33.49        34.71
0            30.53        30.34

Distribution Changes:
---------------------
1: -1.03%
2: +1.22%
0: -0.19%
No description has been provided for this image
Visualizations have been saved as 'gender_amendment_analysis.png'