Installs and Imports¶
In [8]:
import re
import string
import subprocess
import sys
import warnings
warnings.filterwarnings('ignore')
REQS = [
('pip', 'pip==24.2'),
('lightgbm', 'lightgbm==4.5.0'),
('matplotlib', 'matplotlib==3.9.2'),
('mlxtend', 'mlxtend==0.23.1'),
('nltk', 'nltk==3.9.1'),
('numpy', 'numpy==2.0.2'),
('optuna', 'optuna==4.0.0'),
('pandas', 'pandas==2.2.2'),
('seaborn', 'seaborn==0.13.2'),
('sklearn', 'scikit-learn==1.5.2'),
('statsmodels', 'statsmodels==0.14.3'),
('umap-learn', 'umap-learn==0.5.6'),
('xgboost', 'xgboost==2.1.1'),
]
try:
subprocess.check_call([sys.executable, '-m', 'ensurepip'])
except Exception as e:
print(e, file=sys.stderr)
def ensure_installed(module_info):
_, install_str = module_info
try:
subprocess.check_call([sys.executable, '-m',
'pip', 'install', '--quiet',
install_str])
print(f'Installed "{install_str}".')
except Exception as e:
print(e, file=sys.stderr)
for m in REQS:
ensure_installed(m)
# Standard libraries
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# Machine learning and data processing
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
accuracy_score,
calinski_harabasz_score,
classification_report,
confusion_matrix,
mean_squared_error,
silhouette_score
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Statistical modeling
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Dimensionality reduction
import umap
# Hyperparameter optimization
import optuna
# Other machine learning libraries
import lightgbm as lgb
from xgboost import XGBClassifier
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
def find_columns_with_missing(data, columns):
"""Finding features that have a lot of missing data"""
print()
print('Finding columns with missing data...')
data_cleaned = data
missing = []
i = 0
for col in columns:
missing.append(data[col].isnull().sum())
if missing[i] > 0:
print()
print(f'Column {col} is missing {missing[i]} values.')
print(f'Proportion of missing data is {missing[i]/len(data)}.')
if missing[i]/len(data) >= 0.9:
print(f'Dropping column {col}...')
data_cleaned = data_cleaned.drop(columns=col)
i += 1
return missing, data_cleaned
def hex_to_rgb(hex_color):
"""Function to convert hex to RGB"""
# Remove the '#' if it exists
hex_color = hex_color.lstrip('#')
# Convert hex to integer and split into RGB components
return [int(hex_color[i:i+2], 16) for i in (0, 2, 4)]
def preprocess_text(text):
"""Preprocessing function"""
text = text.lower()
# Remove punctuation and special characters
text = text.translate(str.maketrans('', '', string.punctuation)) # Removes punctuation
text = re.sub(r'[^A-Za-z\s]', '', text)
# Tokenize the text
tokens = word_tokenize(text)
# Remove stopwords
tokens = [word for word in tokens if word not in stop_words]
# Lemmatize the tokens
tokens = [lemmatizer.lemmatize(word) for word in tokens]
# Join tokens back into a string
return ' '.join(tokens)
def plot_silhouette_bar_across_experiments(model_names, silhouette_scores):
n_experiments = len(silhouette_scores)
n_models = len(model_names)
bar_width = 0.2
index = np.arange(n_experiments)
plt.figure(figsize=(12, 6))
for i, model_name in enumerate(model_names):
sil_scores = [exp_scores[i] for exp_scores in silhouette_scores]
plt.bar(index + i * bar_width,sil_scores, bar_width, label=model_name)
plt.xlabel('Experiments')
plt.ylabel('Silhouette scores')
plt.title('Silhouette scores Across Models and Experiments')
plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
plt.legend()
plt.tight_layout()
plt.show()
def visualize_ch_index_across_experiments(model_names, ch_scores):
n_experiments = len(ch_scores)
n_models = len(model_names)
bar_width = 0.2
index = np.arange(n_experiments)
plt.figure(figsize=(12, 6))
for i, model_name in enumerate(model_names):
ch_score = [exp_scores[i] for exp_scores in ch_scores]
plt.bar(index + i * bar_width, ch_score, bar_width, label=model_name)
plt.xlabel('Experiments')
plt.ylabel('Calinski-Harabasz Index')
plt.title('Calinski-Harabasz Index Across Models and Experiments')
plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
plt.legend()
plt.tight_layout()
plt.show()
class KMeansClustering:
def __init__(self, data):
self.data = data
self.best_params = None
self.kmeans_model = None
def tune_hyperparameters(self, n_trials=15):
def objective_kmeans(trial):
n_clusters = trial.suggest_int('n_clusters', 2, 10)
init_method = trial.suggest_categorical('init', ['k-means++', 'random'])
kmeans = KMeans(n_clusters=n_clusters, init=init_method, random_state=42)
kmeans.fit(self.data)
labels = kmeans.labels_
score = silhouette_score(self.data, labels)
return score
study = optuna.create_study(direction="maximize")
study.optimize(objective_kmeans, n_trials=n_trials)
self.best_params = study.best_params
print("Best params:", self.best_params)
def fit_model(self):
self.kmeans_model = KMeans(n_clusters=self.best_params['n_clusters'],
init=self.best_params['init'],
random_state=42)
self.kmeans_model.fit(self.data)
def visualize_clusters(self, umap_embedding, feature):
labels = self.kmeans_model.labels_
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# Scatter plot in 3D
scatter = ax.scatter(
umap_embedding[:, 0],
umap_embedding[:, 1],
umap_embedding[:, 2],
c=labels,
cmap='viridis',
s=30
)
# Add labels and title
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.set_zlabel('UMAP Dimension 3')
plt.title(f'3D UMAP of K-Means Clusters on {feature}')
# Add a color bar for better visual distinction of clusters
plt.colorbar(scatter)
# Show the plot
plt.show()
def plot_elbow_method(self, k_range=(2, 10)):
"""
Plot the Elbow Method for choosing the optimal number of clusters
Args:
- k_range: tuple, range of cluster numbers to evaluate
"""
inertia = []
K = range(k_range[0], k_range[1] + 1)
for k in K:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(self.data)
inertia.append(kmeans.inertia_) # Sum of squared distances to closest cluster center
plt.figure(figsize=(8, 6))
plt.plot(K, inertia, 'bo-', markersize=8)
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (Sum of squared distances)')
plt.grid(True)
plt.show()
def output_label(self):
return self.kmeans_model.labels_
def silhoutte(self):
score = silhouette_score(self.data, self.kmeans_model.labels_)
print(f'The Silhouette score is {score}')
return score
def calinski(self):
if len(np.unique(self.kmeans_model.labels_)) > 1: # Only calculate if there are clusters
score = calinski_harabasz_score(self.data, self.kmeans_model.labels_)
else:
score = np.nan # If only one cluster (or all noise), set to NaN
print(f'The Callinski index is {score}')
return score
class DBSCANClustering:
def __init__(self, data):
self.data = data
self.best_params = None
self.dbscan_model = None
def tune_hyperparameters(self, n_trials=15):
def objective_dbscan(trial):
eps = trial.suggest_float('eps', 0.1, 2.0)
min_samples = trial.suggest_int('min_samples', 3, 20)
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(self.data)
labels = dbscan.labels_
if len(set(labels)) > 1:
score = silhouette_score(self.data, labels)
else:
score = -1
return score
study = optuna.create_study(direction="maximize")
study.optimize(objective_dbscan, n_trials=n_trials)
self.best_params = study.best_params
print("Found best params:", self.best_params)
def fit_model(self):
self.dbscan_model = DBSCAN(eps=self.best_params['eps'], min_samples=self.best_params['min_samples'])
self.dbscan_model.fit(self.data)
def visualize_clusters_and_outliers_3D(self, umap_embedding, feature):
labels = self.dbscan_model.labels_
# Separate clustered points and noise points
clustered_points = umap_embedding[labels >= 0] # Points part of a cluster
clustered_labels = labels[labels >= 0]
outliers = umap_embedding[labels == -1] # Noise points
# Create a 3D plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
# Plot the clustered points in different colors
scatter = ax.scatter(clustered_points[:, 0], clustered_points[:, 1], clustered_points[:, 2],
c=clustered_labels, cmap='viridis', s=30)
# Plot the outliers (noise points) in red with 'x' markers
ax.scatter(outliers[:, 0], outliers[:, 1], outliers[:, 2], c='red', marker='x', s=80, label='Outliers')
# Add labels and title
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.set_zlabel('UMAP Dimension 3')
ax.set_title(f'DBSCAN 3D Clusters with Outliers on {feature}')
# Add a legend and color bar for clusters
plt.legend()
plt.colorbar(scatter, ax=ax)
plt.show()
def output_label(self):
return self.dbscan_model.labels_
def silhoutte(self):
score = silhouette_score(self.data, self.dbscan_model.labels_)
print(f'The Silhouette score is {score}')
return score
def calinski(self):
if len(np.unique(self.dbscan_model.labels_)) > 1: # Only calculate if there are clusters
score = calinski_harabasz_score(self.data, self.dbscan_model.labels_)
else:
score = np.nan # If only one cluster (or all noise), set to NaN
print(f'The Callinski index is {score}')
return score
class ClusteringDataRetriever:
def __init__(self, data, labels):
self.data = data
self.labels = labels
def get_data_with_labels(self):
# If Data is in a numpy array, convert it to a pandas DataFrame
if isinstance(self.data, np.ndarray):
df = pd.DataFrame(self.data)
else:
df = self.data.copy() # If already a DataFrame
# Add a new column for the cluster labels
df['Cluster_Label'] = self.labels
return df[['gender', 'gender:confidence', 'Cluster_Label']]
def get_cluster_data(self, cluster_label):
# Retrieve data points belonging to a specific cluster.
df = self.get_data_with_labels()
return df[df['Cluster_Label'] == cluster_label]
def get_noise_data(self):
# Retrieve Records classified as noise (-1 label) in DBSCAN.
return self.get_cluster_data(-1)
Installed "pip==24.2". Installed "lightgbm==4.5.0". Installed "matplotlib==3.9.2". Installed "mlxtend==0.23.1". Installed "nltk==3.9.1". Installed "numpy==2.0.2". Installed "optuna==4.0.0". Installed "pandas==2.2.2". Installed "seaborn==0.13.2". Installed "scikit-learn==1.5.2". Installed "statsmodels==0.14.3". Installed "umap-learn==0.5.6". Installed "xgboost==2.1.1".
EDA¶
In [2]:
# Main starts here
# Load the dataset
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')
# Quick view of the dataset
print()
print('Dataset Overview')
print(df.info())
print(df.head())
all_features = df.columns
missing_col, df_cleaned = find_columns_with_missing(df, all_features)
# Dropping rows where 'gender' is missing
df_cleaned = df_cleaned.dropna(subset=['gender'])
# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])
# Now that we have handled the missing data, you can proceed with further analysis
print()
print('Dataset Overview')
print(df_cleaned.info())
print(df_cleaned.head())
print()
print('---- EXPLORATORY DATA ANALYSIS (EDA) ----')
current_num_features = df.select_dtypes(include=[np.number])
# Plot distribution of each numerical feature with gender as hue using seaborn
for feature in current_num_features:
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned, x=feature, hue='gender', bins=30, kde=True)
plt.title(f'Distribution of {feature} by Gender')
plt.show()
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()
# Plot distribution of 'tweet_count' and 'retweet_count'
for column in ['tweet_count', 'retweet_count']:
plt.figure(figsize=(8, 6))
sns.histplot(data=df_cleaned, x=column, kde=True, bins=30)
plt.title(f'Distribution of {column.replace("_", " ").capitalize()}')
plt.show()
# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
# Ensure 'created' and tweet_created are in datetime format
df_cleaned['created'] = pd.to_datetime(df_cleaned['created'], errors='coerce')
df_cleaned['tweet_created'] = pd.to_datetime(df_cleaned['tweet_created'], errors='coerce')
# assuming Data was up-to-date
df_cleaned['account_age'] = (pd.Timestamp.now() - df_cleaned['created']).dt.days
df_cleaned['tweets_per_day'] = df_cleaned['tweet_count'] / df_cleaned['account_age']
df_cleaned['retweets_per_day'] = df_cleaned['retweet_count'] / df_cleaned['account_age']
df_cleaned['favorites_per_day'] = df_cleaned['fav_number'] / df_cleaned['account_age']
# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('count')
plt.show()
# Plotting the histogram of tweets per day
plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['tweets_per_day'], bins=50, kde=True)
plt.title('Distribution of Tweets Per Day')
plt.xlabel('Tweets Per Day')
plt.ylabel('Frequency')
plt.show()
# show the relationship between account age and tweets per day
plt.figure(figsize=(10, 6))
sns.scatterplot(x='account_age', y='tweets_per_day', data=df_cleaned)
plt.title('Account Age vs. Tweets Per Day')
plt.xlabel('Account Age (Days)')
plt.ylabel('Tweets Per Day')
plt.show()
# Exploring 'link_color' and 'sidebar_color' features
# Check number of NaN value in 'link_color' and 'sidebar_color' features
link_color_nan_count = df_cleaned['link_color'].isnull().sum()
sidebar_color_nan_count = df_cleaned['sidebar_color'].isnull().sum()
print()
print(f"Number of NaN values in 'link_color': {link_color_nan_count}.")
print(f"Number of NaN values in 'sidebar_color': {sidebar_color_nan_count}.")
# Check how many available colors in 'link_color' and 'sidebar_color' features
link_color_count = len(df_cleaned['link_color'].unique())
sidebar_color_count = len(df_cleaned['sidebar_color'].unique())
print(f'Number of link color is {link_color_count}.')
print(f'Number of side bar color is {sidebar_color_count}.')
# Apply the function to 'link_color' and 'sidebar_color'
df_cleaned['link_color'] = df_cleaned['link_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
df_cleaned['sidebar_color'] = df_cleaned['sidebar_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
# Drop rows where 'sidebar_color' is still NaN
df_cleaned = df_cleaned.dropna(subset=['link_color'])
df_cleaned = df_cleaned.dropna(subset=['sidebar_color'])
print(f"Number of NaN values in 'link_color': {df_cleaned['link_color'].isnull().sum()}")
print(f"Number of NaN values in 'sidebar_color': {df_cleaned['sidebar_color'].isnull().sum()}")
# top 15 colors
top_sidebar_colors = df_cleaned['sidebar_color'].value_counts().iloc[:15].index.tolist()
top_link_colors = df_cleaned['link_color'].value_counts().iloc[:15].index.tolist()
# print(top_sidebar_colors)
# Extract top 10 most common sidebar colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index, palette=top_sidebar_colors)
plt.title('Top 15 Most Common Profile sidebar_color')
plt.ylabel('Sidebar Color')
plt.xlabel('count')
plt.grid()
plt.show()
# Extract top 10 most common link colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:15].index, palette=top_link_colors)
plt.title('Top 15 Most Common Profile link_color')
plt.ylabel('Link Color')
plt.xlabel('count')
plt.grid()
plt.show()
# count plot for sidebar_color vs. gender
plt.figure(figsize=(10, 6))
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.countplot(x='sidebar_color', hue='gender', data=df_cleaned,
order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common Sidebar Colors by Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()
# count plot for link_color vs. gender
plt.figure(figsize=(10, 6))
sns.countplot(x='link_color', hue='gender', data=df_cleaned,
order=df_cleaned['link_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common link Colors by Gender')
plt.xlabel('Link Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()
# Scatter plot for link_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='link_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['link_color'].isin(top_link_colors)],
palette='Set2', s=100, alpha=0.7)
plt.title('Link Colors vs. Tweet count with Gender')
plt.xlabel('Link Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()
# Scatter plot for sidebar_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sidebar_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['sidebar_color'].isin(top_sidebar_colors)],
palette='Set2', s=100, alpha=0.7)
plt.title('Sidebar Colors vs. Tweet count with Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()
# Select columns to be used
col = ['gender', 'gender:confidence', 'description', 'favorites_per_day','link_color',
'retweets_per_day', 'sidebar_color', 'text', 'tweets_per_day','user_timezone', 'tweet_location', 'profile_created_year', 'tweet_created_year'
]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Plot correlation matrix
corr_matrix = df_preprocessed.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
# Drop one feature from highly correlated pairs (correlation > 0.9)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_preprocessed = df_preprocessed.drop(columns=to_drop)
# Filling missing values for important features
df_preprocessed['user_timezone'].fillna('Unknown', inplace=True)
df_preprocessed['tweet_location'].fillna('Unknown', inplace=True)
categorical_features = ['user_timezone', 'tweet_location']
# categorise types of features
# numerical features
df_num = df_preprocessed[['retweets_per_day', 'favorites_per_day', 'tweets_per_day', 'profile_created_year', 'tweet_created_year']].copy()
# categorical features with frequency encoding
freq_encoding_location = df_preprocessed['tweet_location'].value_counts(normalize=True)
df_preprocessed['tweet_location_encoded'] = df_preprocessed['tweet_location'].map(freq_encoding_location)
freq_encoding_timezone = df_preprocessed['user_timezone'].value_counts(normalize=True)
df_preprocessed['user_timezone_encoded'] = df_preprocessed['user_timezone'].map(freq_encoding_timezone)
# gender features
# encode the 'gender' column to numeric values
df_preprocessed['gender'] = df_preprocessed['gender'].replace({'male': 0, 'female': 1, 'brand': 2})
# Check for unique values in the 'gender' column after replacement
print()
print("Unique Values in 'gender'")
print(df_preprocessed['gender'].unique())
print(df_preprocessed.info())
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()
df_gender = df_preprocessed[['gender', 'gender:confidence']].copy()
# Drop the original categorical columns
df_preprocessed = df_preprocessed.drop(columns=categorical_features)
# Convert 'link_color' values
df_preprocessed['link_color_rgb'] = df_preprocessed['link_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
# Convert 'sidebar_color' values
df_preprocessed['sidebar_color_rgb'] = df_preprocessed['sidebar_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
rgb_df = pd.DataFrame(df_preprocessed['link_color_rgb'].to_list(), columns=['link_R', 'link_G', 'link_B'])
rgb_df = pd.concat([rgb_df, pd.DataFrame(df_preprocessed['sidebar_color_rgb'].to_list(), columns=['sidebar_R', 'sidebar_G', 'sidebar_B'])], axis=1)
# Drop the original color features
df_preprocessed = df_preprocessed.drop(columns=['link_color', 'sidebar_color', 'link_color_rgb', 'sidebar_color_rgb'])
# Check if all required features are there
print()
print('All Remaining Features')
print(df_preprocessed.columns.tolist())
# Define the numerical features to scale (filtering for int64 and float64 columns)
numerical_features = df_preprocessed.select_dtypes(include=[np.number])
# print(f'All current numerical features are {numerical_features.columns.tolist()}')
print()
print('Dataset Overview After PreProcessing')
print(df_preprocessed.info())
print()
print('---- NLP Processing ----')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
df_preprocessed['description'].fillna('', inplace=True)
df_preprocessed['text'].fillna('', inplace=True)
# df_preprocessed['name'].fillna('', inplace=True)
# Check the text features if they still contain NaN
print()
print(df_preprocessed.select_dtypes(include=[object]))
# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Apply preprocessing to the 'description', 'text', and 'name' columns
df_preprocessed['cleaned_description'] = df_preprocessed['description'].apply(lambda x: preprocess_text(str(x)))
df_preprocessed['cleaned_text'] = df_preprocessed['text'].apply(lambda x: preprocess_text(str(x)))
# df_preprocessed['cleaned_name'] = df_preprocessed['name'].apply(lambda x: preprocess_text(str(x)))
# Check the preprocessed data with preprocessed text features
print(df_preprocessed[['description', 'cleaned_description', 'text', 'cleaned_text']].head())
# Drop the original text features
df_preprocessed = df_preprocessed.drop(columns=['description','text'])
# Initialize TFIDF vectorizer for text features
print()
print('Applying TF-IDF Vectorisation...')
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
# Apply TF-IDF on 'description', 'text', 'name' columns
tfidf_description = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_description']).toarray()
tfidf_text = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_text']).toarray()
# tfidf_name = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_name']).toarray()
# Convert TF-IDF into DataFrames and add to df_preprocessed
tfidf_desc_df = pd.DataFrame(tfidf_description, columns=[f'desc_{i}' for i in range(tfidf_description.shape[1])])
tfidf_text_df = pd.DataFrame(tfidf_text, columns=[f'text_{i}' for i in range(tfidf_text.shape[1])])
# tfidf_name_df = pd.DataFrame(tfidf_name, columns=[f'name_{i}' for i in range(tfidf_name.shape[1])])
# Merge with main dataframe
df_preprocessed = pd.concat([df_preprocessed.reset_index(drop=True), tfidf_desc_df, tfidf_text_df], axis=1)
# Drop the cleaned text features
df_preprocessed = df_preprocessed.drop(columns=['cleaned_description', 'cleaned_text'])
df_preprocessed = pd.concat([df_preprocessed, rgb_df], axis=1)
df_asso = df_preprocessed.copy()
df_cate = df_preprocessed[['tweet_location_encoded', 'user_timezone_encoded']].copy()
Dataset Overview <class 'pandas.core.frame.DataFrame'> RangeIndex: 20050 entries, 0 to 20049 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 _unit_id 20050 non-null int64 1 _golden 20050 non-null bool 2 _unit_state 20050 non-null object 3 _trusted_judgments 20050 non-null int64 4 _last_judgment_at 20000 non-null object 5 gender 19953 non-null object 6 gender:confidence 20024 non-null float64 7 profile_yn 20050 non-null object 8 profile_yn:confidence 20050 non-null float64 9 created 20050 non-null object 10 description 16306 non-null object 11 fav_number 20050 non-null int64 12 gender_gold 50 non-null object 13 link_color 20050 non-null object 14 name 20050 non-null object 15 profile_yn_gold 50 non-null object 16 profileimage 20050 non-null object 17 retweet_count 20050 non-null int64 18 sidebar_color 20050 non-null object 19 text 20050 non-null object 20 tweet_coord 159 non-null object 21 tweet_count 20050 non-null int64 22 tweet_created 20050 non-null object 23 tweet_id 20050 non-null float64 24 tweet_location 12565 non-null object 25 user_timezone 12252 non-null object dtypes: bool(1), float64(3), int64(5), object(17) memory usage: 3.8+ MB None _unit_id _golden _unit_state _trusted_judgments _last_judgment_at \ 0 815719226 False finalized 3 10/26/15 23:24 1 815719227 False finalized 3 10/26/15 23:30 2 815719228 False finalized 3 10/26/15 23:33 3 815719229 False finalized 3 10/26/15 23:10 4 815719230 False finalized 3 10/27/15 1:15 gender gender:confidence profile_yn profile_yn:confidence \ 0 male 1.0000 yes 1.0 1 male 1.0000 yes 1.0 2 male 0.6625 yes 1.0 3 male 1.0000 yes 1.0 4 female 1.0000 yes 1.0 created ... profileimage \ 0 12/5/13 1:48 ... https://pbs.twimg.com/profile_images/414342229... 1 10/1/12 13:51 ... https://pbs.twimg.com/profile_images/539604221... 2 11/28/14 11:30 ... https://pbs.twimg.com/profile_images/657330418... 3 6/11/09 22:39 ... https://pbs.twimg.com/profile_images/259703936... 4 4/16/14 13:23 ... https://pbs.twimg.com/profile_images/564094871... retweet_count sidebar_color \ 0 0 FFFFFF 1 0 C0DEED 2 1 C0DEED 3 0 C0DEED 4 0 0 text tweet_coord tweet_count \ 0 Robbie E Responds To Critics After Win Against... NaN 110964 1 ÛÏIt felt like they were my friends and I was... NaN 7471 2 i absolutely adore when louis starts the songs... NaN 5617 3 Hi @JordanSpieth - Looking at the url - do you... NaN 1693 4 Watching Neighbours on Sky+ catching up with t... NaN 31462 tweet_created tweet_id tweet_location user_timezone 0 10/26/15 12:40 6.587300e+17 main; @Kan1shk3 Chennai 1 10/26/15 12:40 6.587300e+17 NaN Eastern Time (US & Canada) 2 10/26/15 12:40 6.587300e+17 clcncl Belgrade 3 10/26/15 12:40 6.587300e+17 Palo Alto, CA Pacific Time (US & Canada) 4 10/26/15 12:40 6.587300e+17 NaN NaN [5 rows x 26 columns] Finding columns with missing data... Column _last_judgment_at is missing 50 values. Proportion of missing data is 0.0024937655860349127. Column gender is missing 97 values. Proportion of missing data is 0.00483790523690773. Column gender:confidence is missing 26 values. Proportion of missing data is 0.0012967581047381546. Column description is missing 3744 values. Proportion of missing data is 0.18673316708229426. Column gender_gold is missing 20000 values. Proportion of missing data is 0.9975062344139651. Dropping column gender_gold... Column profile_yn_gold is missing 20000 values. Proportion of missing data is 0.9975062344139651. Dropping column profile_yn_gold... Column tweet_coord is missing 19891 values. Proportion of missing data is 0.992069825436409. Dropping column tweet_coord... Column tweet_location is missing 7485 values. Proportion of missing data is 0.3733167082294264. Column user_timezone is missing 7798 values. Proportion of missing data is 0.388927680798005. Dataset Overview <class 'pandas.core.frame.DataFrame'> Index: 19953 entries, 0 to 20049 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 _unit_id 19953 non-null int64 1 _golden 19953 non-null bool 2 _unit_state 19953 non-null object 3 _trusted_judgments 19953 non-null int64 4 _last_judgment_at 19903 non-null object 5 gender 19953 non-null object 6 gender:confidence 19953 non-null float64 7 profile_yn:confidence 19953 non-null float64 8 created 19953 non-null object 9 description 16224 non-null object 10 fav_number 19953 non-null int64 11 link_color 19953 non-null object 12 name 19953 non-null object 13 profileimage 19953 non-null object 14 retweet_count 19953 non-null int64 15 sidebar_color 19953 non-null object 16 text 19953 non-null object 17 tweet_count 19953 non-null int64 18 tweet_created 19953 non-null object 19 tweet_id 19953 non-null float64 20 tweet_location 12510 non-null object 21 user_timezone 12185 non-null object dtypes: bool(1), float64(3), int64(5), object(13) memory usage: 3.4+ MB None _unit_id _golden _unit_state _trusted_judgments _last_judgment_at \ 0 815719226 False finalized 3 10/26/15 23:24 1 815719227 False finalized 3 10/26/15 23:30 2 815719228 False finalized 3 10/26/15 23:33 3 815719229 False finalized 3 10/26/15 23:10 4 815719230 False finalized 3 10/27/15 1:15 gender gender:confidence profile_yn:confidence created \ 0 male 1.0000 1.0 12/5/13 1:48 1 male 1.0000 1.0 10/1/12 13:51 2 male 0.6625 1.0 11/28/14 11:30 3 male 1.0000 1.0 6/11/09 22:39 4 female 1.0000 1.0 4/16/14 13:23 description ... name \ 0 i sing my own rhythm. ... sheezy0 1 I'm the author of novels filled with family dr... ... DavdBurnett 2 louis whining and squealing and all ... lwtprettylaugh 3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe... ... douggarland 4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... ... WilfordGemma profileimage retweet_count \ 0 https://pbs.twimg.com/profile_images/414342229... 0 1 https://pbs.twimg.com/profile_images/539604221... 0 2 https://pbs.twimg.com/profile_images/657330418... 1 3 https://pbs.twimg.com/profile_images/259703936... 0 4 https://pbs.twimg.com/profile_images/564094871... 0 sidebar_color text \ 0 FFFFFF Robbie E Responds To Critics After Win Against... 1 C0DEED ÛÏIt felt like they were my friends and I was... 2 C0DEED i absolutely adore when louis starts the songs... 3 C0DEED Hi @JordanSpieth - Looking at the url - do you... 4 0 Watching Neighbours on Sky+ catching up with t... tweet_count tweet_created tweet_id tweet_location \ 0 110964 10/26/15 12:40 6.587300e+17 main; @Kan1shk3 1 7471 10/26/15 12:40 6.587300e+17 NaN 2 5617 10/26/15 12:40 6.587300e+17 clcncl 3 1693 10/26/15 12:40 6.587300e+17 Palo Alto, CA 4 31462 10/26/15 12:40 6.587300e+17 NaN user_timezone 0 Chennai 1 Eastern Time (US & Canada) 2 Belgrade 3 Pacific Time (US & Canada) 4 NaN [5 rows x 22 columns] ---- EXPLORATORY DATA ANALYSIS (EDA) ----
Number of NaN values in 'link_color': 0. Number of NaN values in 'sidebar_color': 0. Number of link color is 2986. Number of side bar color is 559. Number of NaN values in 'link_color': 0 Number of NaN values in 'sidebar_color': 0
Unique Values in 'gender' [0 1 2] <class 'pandas.core.frame.DataFrame'> Index: 18836 entries, 0 to 20049 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 18836 non-null int64 1 gender:confidence 18836 non-null float64 2 description 15522 non-null object 3 favorites_per_day 18836 non-null float64 4 link_color 18836 non-null object 5 retweets_per_day 18836 non-null float64 6 sidebar_color 18836 non-null object 7 text 18836 non-null object 8 tweets_per_day 18836 non-null float64 9 user_timezone 18836 non-null object 10 tweet_location 18836 non-null object 11 profile_created_year 18836 non-null int32 12 tweet_created_year 18836 non-null int32 13 tweet_location_encoded 18836 non-null float64 14 user_timezone_encoded 18836 non-null float64 dtypes: float64(6), int32(2), int64(1), object(6) memory usage: 2.2+ MB None
All Remaining Features ['gender', 'gender:confidence', 'description', 'favorites_per_day', 'retweets_per_day', 'text', 'tweets_per_day', 'profile_created_year', 'tweet_created_year', 'tweet_location_encoded', 'user_timezone_encoded'] Dataset Overview After PreProcessing <class 'pandas.core.frame.DataFrame'> Index: 18836 entries, 0 to 20049 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 18836 non-null int64 1 gender:confidence 18836 non-null float64 2 description 15522 non-null object 3 favorites_per_day 18836 non-null float64 4 retweets_per_day 18836 non-null float64 5 text 18836 non-null object 6 tweets_per_day 18836 non-null float64 7 profile_created_year 18836 non-null int32 8 tweet_created_year 18836 non-null int32 9 tweet_location_encoded 18836 non-null float64 10 user_timezone_encoded 18836 non-null float64 dtypes: float64(6), int32(2), int64(1), object(2) memory usage: 1.6+ MB None ---- NLP Processing ----
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package punkt_tab to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package punkt_tab is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
description \ 0 i sing my own rhythm. 1 I'm the author of novels filled with family dr... 2 louis whining and squealing and all 3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe... 4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... ... ... 20045 (rp) 20046 Whatever you like, it's not a problem at all. ... 20047 #TeamBarcelona ..You look lost so you should f... 20048 Anti-statist; I homeschool my kids. Aspiring t... 20049 Teamwork makes the dream work. text 0 Robbie E Responds To Critics After Win Against... 1 ÛÏIt felt like they were my friends and I was... 2 i absolutely adore when louis starts the songs... 3 Hi @JordanSpieth - Looking at the url - do you... 4 Watching Neighbours on Sky+ catching up with t... ... ... 20045 @lookupondeath ...Fine, and I'll drink tea too... 20046 Greg Hardy you a good player and all but don't... 20047 You can miss people and still never want to se... 20048 @bitemyapp i had noticed your tendency to pee ... 20049 I think for my APUSH creative project I'm goin... [18836 rows x 2 columns] description \ 0 i sing my own rhythm. 1 I'm the author of novels filled with family dr... 2 louis whining and squealing and all 3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe... 4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... cleaned_description \ 0 sing rhythm 1 im author novel filled family drama romance 2 louis whining squealing 3 mobile guy er shazam google kleiner perkins ya... 4 ricky wilson best frontmankaiser chief best ba... text \ 0 Robbie E Responds To Critics After Win Against... 1 ÛÏIt felt like they were my friends and I was... 2 i absolutely adore when louis starts the songs... 3 Hi @JordanSpieth - Looking at the url - do you... 4 Watching Neighbours on Sky+ catching up with t... cleaned_text 0 robbie e responds critic win eddie edward worl... 1 felt like friend living story httpstcoarngeyhn... 2 absolutely adore louis start song hit hard fee... 3 hi jordanspieth looking url use ifttt dont typ... 4 watching neighbour sky catching neighbs xxx xxx Applying TF-IDF Vectorisation...
CLUSTERING¶
In [3]:
print()
print()
print('---- CLUSTERING MODELS ----')
print()
print("=" * 50)
print('EXP 1: USING ALL SELECTED FEATURES')
print("=" * 50)
sil_ex1 = []
cal_ex1 = []
# Drop the gender and categorical features before normalise
df_cat = df_cate.copy()
# Drop gender feature and categorical features
df_preprocessed = df_preprocessed.drop(columns=df_cat.columns)
df_finalised = df_preprocessed.drop(columns=['gender', 'gender:confidence'])
# Normalise every existing feature
scaler = StandardScaler()
df_finalised = pd.DataFrame(scaler.fit_transform(df_finalised), columns=df_finalised.columns)
df_finalised = pd.concat([df_finalised, df_cat, df_gender], axis=1)
# find the rows that contained NaN values and drop them
df_finalised = df_finalised.dropna()
data_exp1 = df_finalised
df_ex1 = df_finalised.drop(columns=['gender', 'gender:confidence'])
# Check the preprocessed dataset in the present
print()
print('Dataset for Exp 1')
print(df_ex1.info())
print()
# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_vis = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(df_ex1)
umap_plot = umap_vis.fit_transform(df_ex1)
print(umap_embedding.shape)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding)
kmeans_clustering.tune_hyperparameters()
kmeans_exp1 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_plot, 'All feature types')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex1.append(kmeans_clustering.silhoutte())
cal_ex1.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(data_exp1, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 1')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 1')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding)
dbscan_clustering.tune_hyperparameters()
dbscan_exp1 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_plot, 'All feature types')
db_labels = dbscan_clustering.output_label()
sil_ex1.append(dbscan_clustering.silhoutte())
cal_ex1.append(dbscan_clustering.calinski())
# Initialize the class to retrieve data
db_retriever = ClusteringDataRetriever(data_exp1, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 1')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 1')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print("=" * 50)
print('EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES')
print("=" * 50)
sil_ex2 = []
cal_ex2 = []
# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_num.shape[0], chunk_size):
df_num.iloc[i:i + chunk_size] = scaler.fit_transform(df_num.iloc[i:i + chunk_size])
df_no_text = pd.concat([df_num, df_cate, df_gender], axis=1)
print()
print("Data with Only Numerical and Categorical Features")
print(df_no_text.info())
print()
df_no_text = df_no_text.dropna()
df_no_text_wg = df_no_text.copy()
print('Removing NaN values...')
# Drop gender feature before clustering
data_exp2 = df_no_text.drop(columns=['gender', 'gender:confidence'])
print('Dropping gender and gender:confidence...')
# Check No. of records after drop NaN values
print()
print("Dataset for Exp 2")
print(data_exp2.info())
print()
print(data_exp2.head())
# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(data_exp2)
print(umap_embedding.shape)
# umap_embedding = umap_embedding.astype(np.float32)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(data_exp2)
kmeans_clustering.tune_hyperparameters()
kmeans_exp2 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Numerical and categorical features') # Visualize clusters
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex2.append(kmeans_clustering.silhoutte())
cal_ex2.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(df_no_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 2')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 2')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(data_exp2)
dbscan_clustering.tune_hyperparameters() # Tune DBSCAN hyperparameters
dbscan_exp2 = dbscan_clustering.fit_model() # Fit the DBSCAN model
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'numerical and categorical features') # Plot 3D noise points and valid clusters
db_labels = dbscan_clustering.output_label()
sil_ex2.append(dbscan_clustering.silhoutte())
cal_ex2.append(dbscan_clustering.calinski())
db_retriever = ClusteringDataRetriever(df_no_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 2')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 2')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print("=" * 50)
print('EXP 3: USING ONLY TEXT FEATURES')
print("=" * 50)
sil_ex3 = []
cal_ex3 = []
# Merge with main dataframe
df_with_text = pd.concat([tfidf_desc_df, tfidf_text_df], axis=1)
# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_with_text.shape[0], chunk_size):
df_with_text.iloc[i:i + chunk_size] = scaler.fit_transform(df_with_text.iloc[i:i + chunk_size])
df_with_text_wg = pd.concat([df_with_text, df_gender], axis=1)
# Drop NaN values before clustering
df_with_text_wg = df_with_text_wg.dropna()
data_exp3 = df_with_text_wg.drop(columns=['gender', 'gender:confidence'])
# Drop the gender features before clustering
print('Dataset for Exp 3')
print(data_exp3.info())
print()
print(data_exp3.head())
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_embedding_t = umap_model.fit_transform(data_exp3)
umap_embedding = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42).fit_transform(data_exp3)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding_t)
kmeans_clustering.tune_hyperparameters()
kmeans_exp3 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Text features')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex3.append(kmeans_clustering.silhoutte())
cal_ex3.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(df_with_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 3')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 3')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCANClustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding_t)
dbscan_clustering.tune_hyperparameters()
dbscan_exp3 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'Text features')
db_labels = dbscan_clustering.output_label()
sil_ex3.append(dbscan_clustering.silhoutte())
cal_ex3.append(dbscan_clustering.calinski())
db_retriever = ClusteringDataRetriever(df_with_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 3')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 3')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print('---- VISUALIZE THE METRIC EVALUATION ----')
# Metric functions
model_names = ['KMeans', 'DBSCAN']
sil_scores = [sil_ex1, sil_ex2, sil_ex3]
cal_scores = [cal_ex1, cal_ex2, cal_ex3]
plot_silhouette_bar_across_experiments(model_names, sil_scores)
visualize_ch_index_across_experiments(model_names, cal_scores)
---- CLUSTERING MODELS ---- ================================================== EXP 1: USING ALL SELECTED FEATURES ================================================== Dataset for Exp 1 <class 'pandas.core.frame.DataFrame'> Index: 17702 entries, 0 to 18835 Columns: 3013 entries, favorites_per_day to user_timezone_encoded dtypes: float64(3013) memory usage: 407.1 MB None Applying UMAP for dim reduction...
[I 2024-09-20 16:20:19,495] A new study created in memory with name: no-name-f656c2a4-43f7-454b-87f6-e1b8bbb5ba19
(17702, 2) Performing K-Means Clustering...
[I 2024-09-20 16:20:24,756] Trial 0 finished with value: 0.44721555709838867 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:20:29,547] Trial 1 finished with value: 0.40816256403923035 and parameters: {'n_clusters': 9, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:20:34,470] Trial 2 finished with value: 0.43370768427848816 and parameters: {'n_clusters': 10, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:20:39,242] Trial 3 finished with value: 0.4106582999229431 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:20:44,060] Trial 4 finished with value: 0.3901534974575043 and parameters: {'n_clusters': 8, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:20:48,864] Trial 5 finished with value: 0.4233592748641968 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:20:53,940] Trial 6 finished with value: 0.44721555709838867 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:20:58,788] Trial 7 finished with value: 0.3933861553668976 and parameters: {'n_clusters': 7, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:21:03,521] Trial 8 finished with value: 0.4233592748641968 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:21:08,516] Trial 9 finished with value: 0.43466559052467346 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867. [I 2024-09-20 16:21:13,893] Trial 10 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243. [I 2024-09-20 16:21:19,213] Trial 11 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243. [I 2024-09-20 16:21:24,493] Trial 12 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243. [I 2024-09-20 16:21:29,872] Trial 13 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243. [I 2024-09-20 16:21:35,112] Trial 14 finished with value: 0.43466559052467346 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
Best params: {'n_clusters': 2, 'init': 'random'}
The Silhouette score is 0.7726734280586243 The Callinski index is 20992.505859375 Dataset with Labels from KMeans in Exp 1 gender gender:confidence Cluster_Label 0 0.0 1.0000 0 1 0.0 1.0000 0 2 0.0 0.6625 0 3 0.0 1.0000 0 4 1.0 1.0000 0 Records found in cluster 0 from KMeans in Exp 1 gender gender:confidence Cluster_Label 0 0.0 1.0000 0 1 0.0 1.0000 0 2 0.0 0.6625 0 3 0.0 1.0000 0 4 1.0 1.0000 0 ... ... ... ... 18829 1.0 1.0000 0 18831 0.0 0.6466 0 18832 1.0 1.0000 0 18834 1.0 1.0000 0 18835 0.0 0.6772 0 [16379 rows x 3 columns] No. of records with gender 0 in cluster 0 is 5439 No. of records with gender 1 in cluster 0 is 5773 No. of records with gender 2 in cluster 0 is 5167 Records found in cluster 1 from KMeans in Exp 1
[I 2024-09-20 16:21:42,026] A new study created in memory with name: no-name-ad1593d8-66bc-4c0f-9d74-f56f96710d50
gender gender:confidence Cluster_Label 7 0.0 1.0000 1 33 0.0 1.0000 1 49 2.0 1.0000 1 56 1.0 0.6684 1 58 0.0 1.0000 1 ... ... ... ... 18738 2.0 1.0000 1 18753 0.0 0.6678 1 18759 0.0 0.6386 1 18789 0.0 1.0000 1 18803 1.0 1.0000 1 [1323 rows x 3 columns] No. of records with gender 0 in cluster 1 is 404 No. of records with gender 1 in cluster 1 is 428 No. of records with gender 2 in cluster 1 is 491 Performing DBSCAN Clustering...
[I 2024-09-20 16:21:48,312] Trial 0 finished with value: 0.3155621588230133 and parameters: {'eps': 1.5913067486466435, 'min_samples': 6}. Best is trial 0 with value: 0.3155621588230133. [I 2024-09-20 16:21:54,152] Trial 1 finished with value: 0.24721910059452057 and parameters: {'eps': 1.0376530894652887, 'min_samples': 18}. Best is trial 0 with value: 0.3155621588230133. [I 2024-09-20 16:22:00,118] Trial 2 finished with value: 0.2345193773508072 and parameters: {'eps': 1.08924832783019, 'min_samples': 7}. Best is trial 0 with value: 0.3155621588230133. [I 2024-09-20 16:22:06,672] Trial 3 finished with value: 0.3255881667137146 and parameters: {'eps': 1.9565357155432446, 'min_samples': 4}. Best is trial 3 with value: 0.3255881667137146. [I 2024-09-20 16:22:13,131] Trial 4 finished with value: 0.32468611001968384 and parameters: {'eps': 1.9655521749248066, 'min_samples': 17}. Best is trial 3 with value: 0.3255881667137146. [I 2024-09-20 16:22:19,013] Trial 5 finished with value: 0.26063308119773865 and parameters: {'eps': 0.9674339846692939, 'min_samples': 14}. Best is trial 3 with value: 0.3255881667137146. [I 2024-09-20 16:22:25,335] Trial 6 finished with value: 0.32788148522377014 and parameters: {'eps': 1.7693479090782473, 'min_samples': 9}. Best is trial 6 with value: 0.32788148522377014. [I 2024-09-20 16:22:31,052] Trial 7 finished with value: 0.24578818678855896 and parameters: {'eps': 0.7826789736238435, 'min_samples': 19}. Best is trial 6 with value: 0.32788148522377014. [I 2024-09-20 16:22:36,540] Trial 8 finished with value: -0.14658115804195404 and parameters: {'eps': 0.34017243144029763, 'min_samples': 4}. Best is trial 6 with value: 0.32788148522377014. [I 2024-09-20 16:22:42,106] Trial 9 finished with value: 0.0954396203160286 and parameters: {'eps': 0.490850883967341, 'min_samples': 20}. Best is trial 6 with value: 0.32788148522377014. [I 2024-09-20 16:22:48,410] Trial 10 finished with value: 0.24460361897945404 and parameters: {'eps': 1.4333032533727734, 'min_samples': 10}. Best is trial 6 with value: 0.32788148522377014. [I 2024-09-20 16:22:54,901] Trial 11 finished with value: 0.32556405663490295 and parameters: {'eps': 1.9767937461657843, 'min_samples': 10}. Best is trial 6 with value: 0.32788148522377014. [I 2024-09-20 16:23:01,305] Trial 12 finished with value: 0.33137843012809753 and parameters: {'eps': 1.6198251417047203, 'min_samples': 3}. Best is trial 12 with value: 0.33137843012809753. [I 2024-09-20 16:23:07,690] Trial 13 finished with value: 0.32246026396751404 and parameters: {'eps': 1.528098496701475, 'min_samples': 13}. Best is trial 12 with value: 0.33137843012809753. [I 2024-09-20 16:23:14,050] Trial 14 finished with value: 0.3302082121372223 and parameters: {'eps': 1.6778064207338765, 'min_samples': 8}. Best is trial 12 with value: 0.33137843012809753.
Found best params: {'eps': 1.6198251417047203, 'min_samples': 3}
The Silhouette score is 0.33137843012809753 The Callinski index is 1748.1387939453125 Dataset with Labels from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 0 0.0 1.0000 0 1 0.0 1.0000 0 2 0.0 0.6625 0 3 0.0 1.0000 0 4 1.0 1.0000 0 Records found in cluster 0 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 0 0.0 1.0000 0 1 0.0 1.0000 0 2 0.0 0.6625 0 3 0.0 1.0000 0 4 1.0 1.0000 0 ... ... ... ... 18829 1.0 1.0000 0 18831 0.0 0.6466 0 18832 1.0 1.0000 0 18834 1.0 1.0000 0 18835 0.0 0.6772 0 [15976 rows x 3 columns] No. of records with gender 0 in cluster 0 is 5308 No. of records with gender 1 in cluster 0 is 5667 No. of records with gender 2 in cluster 0 is 5001 Records found in cluster 1 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 7 0.0 1.0000 1 33 0.0 1.0000 1 49 2.0 1.0000 1 56 1.0 0.6684 1 58 0.0 1.0000 1 132 1.0 1.0000 1 153 2.0 1.0000 1 191 2.0 0.6804 1 192 0.0 1.0000 1 199 1.0 1.0000 1 231 1.0 1.0000 1 243 0.0 1.0000 1 250 2.0 1.0000 1 288 1.0 0.6494 1 308 1.0 0.6752 1 390 1.0 0.6786 1 460 2.0 0.6708 1 503 0.0 1.0000 1 No. of records with gender 0 in cluster 1 is 6 No. of records with gender 1 in cluster 1 is 7 No. of records with gender 2 in cluster 1 is 5 Records found in cluster 2 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 282 1.0 1.0000 2 2135 2.0 1.0000 2 2929 0.0 1.0000 2 3229 0.0 1.0000 2 3770 0.0 1.0000 2 ... ... ... ... 9194 2.0 1.0000 2 9195 1.0 1.0000 2 9220 2.0 1.0000 2 9283 2.0 0.6659 2 9293 0.0 1.0000 2 [180 rows x 3 columns] No. of records with gender 0 in cluster 2 is 55 No. of records with gender 1 in cluster 2 is 48 No. of records with gender 2 in cluster 2 is 77 Records found in cluster 3 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 302 1.0 1.0000 3 1402 0.0 0.3539 3 2347 2.0 0.6757 3 2964 1.0 1.0000 3 4898 0.0 1.0000 3 5276 2.0 0.6632 3 5379 0.0 1.0000 3 5536 2.0 0.6943 3 5949 1.0 0.6848 3 6017 1.0 0.3486 3 6245 2.0 1.0000 3 6298 0.0 1.0000 3 6374 2.0 1.0000 3 6466 2.0 1.0000 3 6882 0.0 0.6879 3 6904 2.0 0.6842 3 7434 2.0 1.0000 3 7625 0.0 1.0000 3 7662 0.0 1.0000 3 7745 1.0 1.0000 3 7811 2.0 0.6341 3 7910 2.0 1.0000 3 8159 2.0 1.0000 3 8331 2.0 0.6716 3 8340 2.0 0.6707 3 8401 0.0 0.6732 3 8487 0.0 0.6806 3 8489 0.0 1.0000 3 8505 1.0 1.0000 3 8535 2.0 1.0000 3 8583 0.0 1.0000 3 8622 0.0 0.6634 3 8623 2.0 0.6778 3 8647 2.0 1.0000 3 8690 2.0 1.0000 3 8764 2.0 0.6674 3 8784 2.0 1.0000 3 8859 2.0 1.0000 3 8925 0.0 1.0000 3 8930 2.0 1.0000 3 8971 1.0 1.0000 3 9001 1.0 1.0000 3 9055 1.0 1.0000 3 9076 2.0 1.0000 3 9089 1.0 1.0000 3 9118 2.0 0.6712 3 9166 2.0 1.0000 3 9280 1.0 1.0000 3 14662 2.0 1.0000 3 15096 2.0 0.3410 3 15533 1.0 0.6619 3 15979 0.0 1.0000 3 16380 0.0 1.0000 3 16802 2.0 0.3531 3 17226 1.0 1.0000 3 17617 1.0 1.0000 3 18272 0.0 0.6686 3 No. of records with gender 0 in cluster 3 is 16 No. of records with gender 1 in cluster 3 is 14 No. of records with gender 2 in cluster 3 is 27 Records found in cluster 4 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 426 2.0 1.0000 4 432 0.0 1.0000 4 1992 0.0 1.0000 4 2776 0.0 1.0000 4 3755 2.0 1.0000 4 3769 2.0 0.6497 4 3784 2.0 1.0000 4 4418 1.0 1.0000 4 5352 1.0 1.0000 4 9341 2.0 1.0000 4 9379 0.0 1.0000 4 10138 1.0 1.0000 4 10451 0.0 0.6824 4 13349 0.0 1.0000 4 14425 0.0 0.6628 4 14668 2.0 1.0000 4 16449 1.0 1.0000 4 16881 1.0 0.6733 4 No. of records with gender 0 in cluster 4 is 7 No. of records with gender 1 in cluster 4 is 5 No. of records with gender 2 in cluster 4 is 6 Records found in cluster 5 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 431 0.0 0.6631 5 4374 2.0 1.0000 5 4456 1.0 1.0000 5 4653 2.0 1.0000 5 5008 2.0 1.0000 5 5044 2.0 1.0000 5 5220 2.0 0.6650 5 5533 2.0 1.0000 5 5580 0.0 1.0000 5 5596 2.0 1.0000 5 5662 1.0 1.0000 5 5749 2.0 1.0000 5 5988 2.0 1.0000 5 6669 0.0 1.0000 5 7261 0.0 1.0000 5 7702 2.0 0.7012 5 7771 2.0 1.0000 5 7898 2.0 1.0000 5 8120 1.0 1.0000 5 8248 1.0 1.0000 5 8295 2.0 0.6579 5 8360 2.0 0.6854 5 8984 2.0 0.6890 5 9100 0.0 1.0000 5 No. of records with gender 0 in cluster 5 is 5 No. of records with gender 1 in cluster 5 is 4 No. of records with gender 2 in cluster 5 is 15 Records found in cluster 6 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 502 0.0 1.0000 6 578 1.0 1.0000 6 644 0.0 1.0000 6 771 0.0 1.0000 6 963 2.0 1.0000 6 1433 1.0 1.0000 6 1881 0.0 0.6691 6 2762 2.0 0.6670 6 2903 1.0 0.6763 6 3308 0.0 0.3364 6 3353 0.0 1.0000 6 3681 2.0 1.0000 6 3830 0.0 1.0000 6 4305 1.0 1.0000 6 5040 0.0 1.0000 6 5479 0.0 0.6857 6 5742 0.0 1.0000 6 6460 2.0 1.0000 6 6862 1.0 1.0000 6 8397 2.0 0.6634 6 8516 2.0 0.6839 6 8918 2.0 1.0000 6 No. of records with gender 0 in cluster 6 is 10 No. of records with gender 1 in cluster 6 is 5 No. of records with gender 2 in cluster 6 is 7 Records found in cluster 7 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 513 2.0 1.0000 7 514 0.0 1.0000 7 520 0.0 0.3458 7 553 0.0 1.0000 7 554 0.0 0.3431 7 555 0.0 1.0000 7 556 0.0 1.0000 7 557 0.0 1.0000 7 560 1.0 1.0000 7 564 1.0 1.0000 7 565 1.0 1.0000 7 566 2.0 0.6829 7 576 0.0 1.0000 7 577 2.0 1.0000 7 1102 1.0 0.6777 7 2660 0.0 0.3478 7 4100 2.0 1.0000 7 4344 2.0 1.0000 7 4370 0.0 1.0000 7 4426 2.0 0.6838 7 4444 0.0 0.6422 7 4489 1.0 1.0000 7 4643 0.0 1.0000 7 4781 2.0 0.6475 7 4896 2.0 1.0000 7 4950 1.0 1.0000 7 4967 0.0 1.0000 7 5030 0.0 1.0000 7 5176 1.0 1.0000 7 5256 2.0 0.6475 7 5355 0.0 1.0000 7 5356 0.0 1.0000 7 5427 1.0 1.0000 7 5448 2.0 0.6654 7 7995 2.0 1.0000 7 8037 0.0 0.6374 7 8233 0.0 1.0000 7 10824 0.0 1.0000 7 No. of records with gender 0 in cluster 7 is 19 No. of records with gender 1 in cluster 7 is 8 No. of records with gender 2 in cluster 7 is 11 Records found in cluster 8 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 548 2.0 0.6672 8 4512 0.0 1.0000 8 7351 2.0 0.6667 8 7473 1.0 1.0000 8 10589 0.0 0.6623 8 12139 0.0 1.0000 8 12845 0.0 1.0000 8 12988 2.0 0.6557 8 14702 2.0 1.0000 8 17727 0.0 1.0000 8 No. of records with gender 0 in cluster 8 is 5 No. of records with gender 1 in cluster 8 is 1 No. of records with gender 2 in cluster 8 is 4 Records found in cluster 9 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 570 2.0 0.6616 9 3168 1.0 1.0000 9 11317 2.0 1.0000 9 11909 1.0 1.0000 9 14448 0.0 1.0000 9 14613 0.0 1.0000 9 14791 1.0 1.0000 9 15015 1.0 1.0000 9 15216 0.0 1.0000 9 No. of records with gender 0 in cluster 9 is 3 No. of records with gender 1 in cluster 9 is 4 No. of records with gender 2 in cluster 9 is 2 Records found in cluster 10 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 575 0.0 1.0000 10 1308 0.0 0.6479 10 2033 1.0 1.0000 10 2308 1.0 0.6774 10 3898 0.0 1.0000 10 5454 2.0 0.6774 10 5539 1.0 1.0000 10 5628 2.0 1.0000 10 5825 1.0 1.0000 10 5847 2.0 0.6717 10 6012 0.0 1.0000 10 6048 2.0 0.6796 10 6108 0.0 1.0000 10 6114 1.0 0.6620 10 6335 2.0 1.0000 10 6382 2.0 0.6842 10 6417 2.0 1.0000 10 7843 2.0 1.0000 10 8181 0.0 1.0000 10 8355 2.0 0.6778 10 8738 0.0 1.0000 10 No. of records with gender 0 in cluster 10 is 7 No. of records with gender 1 in cluster 10 is 5 No. of records with gender 2 in cluster 10 is 9 Records found in cluster 11 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 599 1.0 1.0000 11 1268 2.0 1.0000 11 2138 1.0 1.0000 11 2145 0.0 1.0000 11 2146 1.0 1.0000 11 2147 1.0 1.0000 11 2148 1.0 0.3576 11 2156 0.0 1.0000 11 2166 1.0 1.0000 11 2168 0.0 0.6825 11 2169 1.0 1.0000 11 2171 1.0 1.0000 11 2172 0.0 1.0000 11 2182 2.0 1.0000 11 2185 0.0 1.0000 11 2186 0.0 0.3403 11 2187 1.0 1.0000 11 2188 2.0 0.6812 11 2189 0.0 0.6582 11 2191 0.0 1.0000 11 2194 1.0 1.0000 11 2196 1.0 1.0000 11 2204 1.0 0.6587 11 2205 0.0 0.6685 11 2206 1.0 0.6551 11 2207 1.0 1.0000 11 2210 1.0 1.0000 11 2216 1.0 0.6896 11 2217 1.0 0.6832 11 2220 1.0 1.0000 11 2223 2.0 1.0000 11 2682 1.0 0.6473 11 2860 0.0 1.0000 11 2862 0.0 1.0000 11 2863 0.0 0.3370 11 2866 2.0 0.6497 11 2870 2.0 0.6368 11 2872 0.0 0.6855 11 2873 1.0 0.6940 11 3360 1.0 1.0000 11 5548 2.0 1.0000 11 6616 1.0 1.0000 11 7610 2.0 0.6578 11 8509 2.0 0.6731 11 9305 2.0 0.6606 11 10714 0.0 1.0000 11 12324 1.0 1.0000 11 14170 1.0 1.0000 11 15223 0.0 1.0000 11 16735 0.0 0.6563 11 No. of records with gender 0 in cluster 11 is 16 No. of records with gender 1 in cluster 11 is 24 No. of records with gender 2 in cluster 11 is 10 Records found in cluster 12 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 822 0.0 0.6473 12 1536 2.0 0.6591 12 11119 1.0 1.0000 12 11627 2.0 0.6796 12 11727 2.0 1.0000 12 12333 1.0 1.0000 12 12992 0.0 1.0000 12 13486 2.0 1.0000 12 13980 0.0 1.0000 12 14046 0.0 1.0000 12 14958 2.0 1.0000 12 15597 1.0 0.3362 12 16706 0.0 1.0000 12 17090 0.0 1.0000 12 17186 1.0 1.0000 12 17599 0.0 0.6654 12 18270 0.0 1.0000 12 No. of records with gender 0 in cluster 12 is 8 No. of records with gender 1 in cluster 12 is 4 No. of records with gender 2 in cluster 12 is 5 Records found in cluster 13 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 941 2.0 0.6582 13 9955 2.0 1.0000 13 10078 0.0 1.0000 13 10115 0.0 1.0000 13 10194 1.0 1.0000 13 10234 2.0 0.3388 13 10298 0.0 0.3387 13 10354 2.0 0.6852 13 10391 1.0 1.0000 13 15703 1.0 1.0000 13 17106 0.0 1.0000 13 17709 0.0 1.0000 13 No. of records with gender 0 in cluster 13 is 5 No. of records with gender 1 in cluster 13 is 3 No. of records with gender 2 in cluster 13 is 4 Records found in cluster 14 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 1040 1.0 1.0000 14 1045 2.0 0.6789 14 1049 1.0 1.0000 14 1051 2.0 1.0000 14 1052 1.0 1.0000 14 1054 1.0 1.0000 14 1061 0.0 1.0000 14 1064 1.0 0.6498 14 1065 0.0 1.0000 14 No. of records with gender 0 in cluster 14 is 2 No. of records with gender 1 in cluster 14 is 5 No. of records with gender 2 in cluster 14 is 2 Records found in cluster 15 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 1108 1.0 0.6880 15 9382 2.0 1.0000 15 9398 1.0 1.0000 15 9475 0.0 1.0000 15 9496 0.0 1.0000 15 ... ... ... ... 15207 1.0 1.0000 15 15391 2.0 1.0000 15 15439 2.0 1.0000 15 15622 2.0 1.0000 15 18398 0.0 0.6709 15 [70 rows x 3 columns] No. of records with gender 0 in cluster 15 is 19 No. of records with gender 1 in cluster 15 is 25 No. of records with gender 2 in cluster 15 is 26 Records found in cluster 16 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 1203 1.0 1.0000 16 1240 1.0 0.6889 16 2115 0.0 1.0000 16 2381 0.0 1.0000 16 3988 2.0 1.0000 16 5994 2.0 0.6611 16 7988 1.0 0.6734 16 8071 1.0 1.0000 16 10735 0.0 1.0000 16 10738 0.0 1.0000 16 11076 2.0 1.0000 16 11179 2.0 1.0000 16 11484 1.0 1.0000 16 11648 1.0 1.0000 16 11746 0.0 1.0000 16 12054 1.0 1.0000 16 13078 0.0 1.0000 16 14056 2.0 1.0000 16 15064 0.0 0.6534 16 15751 1.0 1.0000 16 15757 1.0 1.0000 16 16465 0.0 1.0000 16 16868 1.0 1.0000 16 17448 0.0 1.0000 16 18208 0.0 1.0000 16 18753 0.0 0.6678 16 No. of records with gender 0 in cluster 16 is 11 No. of records with gender 1 in cluster 16 is 10 No. of records with gender 2 in cluster 16 is 5 Records found in cluster 17 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 1273 0.0 1.0000 17 1605 2.0 1.0000 17 1761 2.0 1.0000 17 1845 1.0 1.0000 17 1987 1.0 1.0000 17 2274 0.0 1.0000 17 3961 0.0 1.0000 17 4092 0.0 0.3411 17 4424 2.0 1.0000 17 5218 2.0 1.0000 17 5336 1.0 1.0000 17 5445 0.0 1.0000 17 5927 2.0 0.6721 17 5980 0.0 1.0000 17 6262 2.0 1.0000 17 6289 1.0 1.0000 17 7003 1.0 1.0000 17 7118 2.0 1.0000 17 7431 1.0 1.0000 17 7540 0.0 0.6859 17 7791 1.0 1.0000 17 8142 2.0 1.0000 17 8601 2.0 0.6700 17 8693 0.0 1.0000 17 9023 1.0 0.6654 17 9265 1.0 1.0000 17 No. of records with gender 0 in cluster 17 is 8 No. of records with gender 1 in cluster 17 is 9 No. of records with gender 2 in cluster 17 is 9 Records found in cluster 18 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 1367 1.0 1.0000 18 2382 1.0 1.0000 18 2897 2.0 1.0000 18 3526 1.0 1.0000 18 4051 2.0 1.0000 18 6140 2.0 0.6679 18 7107 2.0 0.6865 18 7913 2.0 1.0000 18 8836 0.0 0.6645 18 No. of records with gender 0 in cluster 18 is 1 No. of records with gender 1 in cluster 18 is 3 No. of records with gender 2 in cluster 18 is 5 Records found in cluster 19 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 1544 0.0 1.0000 19 2154 1.0 0.6561 19 3341 1.0 1.0000 19 3938 2.0 0.6545 19 4650 2.0 0.3571 19 5424 0.0 1.0000 19 6313 1.0 1.0000 19 8798 1.0 1.0000 19 No. of records with gender 0 in cluster 19 is 2 No. of records with gender 1 in cluster 19 is 4 No. of records with gender 2 in cluster 19 is 2 Records found in cluster 20 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 1844 2.0 1.0000 20 4712 0.0 1.0000 20 5611 2.0 0.6856 20 6066 2.0 0.6668 20 6133 0.0 0.6655 20 6204 0.0 1.0000 20 6291 2.0 1.0000 20 6299 0.0 0.3604 20 6478 2.0 0.6611 20 6668 0.0 1.0000 20 6786 2.0 0.6694 20 7058 1.0 1.0000 20 7102 0.0 1.0000 20 7130 2.0 1.0000 20 7158 1.0 1.0000 20 7176 1.0 1.0000 20 7210 0.0 0.6617 20 7228 2.0 0.6766 20 7259 0.0 1.0000 20 7300 1.0 1.0000 20 7304 1.0 1.0000 20 7332 2.0 0.6573 20 7417 1.0 1.0000 20 7441 1.0 1.0000 20 7502 1.0 0.6617 20 7507 0.0 0.6848 20 7629 2.0 1.0000 20 7697 1.0 1.0000 20 7738 2.0 1.0000 20 7751 2.0 1.0000 20 7759 2.0 1.0000 20 7830 1.0 1.0000 20 7908 2.0 1.0000 20 7975 0.0 1.0000 20 7977 2.0 0.6739 20 7980 2.0 1.0000 20 7987 0.0 1.0000 20 8165 0.0 1.0000 20 8236 0.0 1.0000 20 8264 0.0 1.0000 20 8333 2.0 1.0000 20 8884 1.0 0.6612 20 8947 2.0 1.0000 20 8951 0.0 0.6752 20 9028 1.0 0.6849 20 9225 2.0 1.0000 20 9249 1.0 0.3542 20 11400 1.0 1.0000 20 No. of records with gender 0 in cluster 20 is 15 No. of records with gender 1 in cluster 20 is 14 No. of records with gender 2 in cluster 20 is 19 Records found in cluster 21 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 2445 1.0 1.0000 21 4210 2.0 1.0000 21 4595 1.0 1.0000 21 4621 1.0 1.0000 21 4685 2.0 1.0000 21 ... ... ... ... 15313 2.0 1.0000 21 15316 2.0 1.0000 21 15322 0.0 1.0000 21 15324 2.0 0.6344 21 15338 1.0 0.6791 21 [124 rows x 3 columns] No. of records with gender 0 in cluster 21 is 28 No. of records with gender 1 in cluster 21 is 34 No. of records with gender 2 in cluster 21 is 62 Records found in cluster 22 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 3385 1.0 1.0000 22 3386 1.0 0.6628 22 3388 2.0 1.0000 22 3391 0.0 0.6612 22 3393 1.0 1.0000 22 3394 1.0 1.0000 22 3396 1.0 1.0000 22 3397 0.0 1.0000 22 3398 2.0 1.0000 22 3400 1.0 0.6727 22 3401 2.0 1.0000 22 3402 0.0 1.0000 22 3406 0.0 0.6819 22 3407 1.0 1.0000 22 3411 0.0 1.0000 22 3412 1.0 1.0000 22 3413 1.0 0.7023 22 No. of records with gender 0 in cluster 22 is 5 No. of records with gender 1 in cluster 22 is 9 No. of records with gender 2 in cluster 22 is 3 Records found in cluster 23 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 3581 0.0 1.0000 23 3705 2.0 0.6581 23 3809 2.0 1.0000 23 3906 1.0 0.6422 23 4041 0.0 1.0000 23 4108 2.0 1.0000 23 4111 2.0 1.0000 23 4113 1.0 1.0000 23 4114 1.0 1.0000 23 4116 1.0 1.0000 23 4117 0.0 1.0000 23 4121 0.0 1.0000 23 4134 0.0 0.6692 23 4135 0.0 0.3619 23 4136 2.0 1.0000 23 4137 1.0 1.0000 23 4138 0.0 1.0000 23 4152 2.0 1.0000 23 4153 0.0 1.0000 23 4154 1.0 1.0000 23 4156 1.0 1.0000 23 4272 2.0 1.0000 23 4341 0.0 1.0000 23 4410 2.0 1.0000 23 4508 1.0 1.0000 23 4631 2.0 1.0000 23 4736 2.0 1.0000 23 4840 2.0 1.0000 23 5305 1.0 1.0000 23 No. of records with gender 0 in cluster 23 is 9 No. of records with gender 1 in cluster 23 is 9 No. of records with gender 2 in cluster 23 is 11 Records found in cluster 24 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 3744 0.0 0.6440 24 3927 0.0 1.0000 24 3994 1.0 1.0000 24 4057 2.0 0.3516 24 4300 2.0 0.6736 24 4398 1.0 1.0000 24 4470 2.0 0.6602 24 4544 0.0 1.0000 24 4640 2.0 1.0000 24 4800 2.0 0.6575 24 4883 2.0 1.0000 24 5043 1.0 1.0000 24 5238 1.0 1.0000 24 5325 1.0 0.6645 24 5515 2.0 1.0000 24 5659 1.0 1.0000 24 5978 2.0 1.0000 24 6188 2.0 0.6748 24 6440 2.0 1.0000 24 6562 0.0 1.0000 24 6671 2.0 1.0000 24 6749 1.0 1.0000 24 6826 2.0 0.6933 24 7050 0.0 0.6736 24 No. of records with gender 0 in cluster 24 is 5 No. of records with gender 1 in cluster 24 is 7 No. of records with gender 2 in cluster 24 is 12 Records found in cluster 25 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 4012 1.0 1.0000 25 4097 0.0 0.6706 25 4177 0.0 0.6729 25 4219 0.0 1.0000 25 4226 2.0 1.0000 25 ... ... ... ... 5777 2.0 0.6638 25 5809 0.0 1.0000 25 5849 0.0 0.6792 25 5881 2.0 1.0000 25 5910 0.0 0.6787 25 [94 rows x 3 columns] No. of records with gender 0 in cluster 25 is 33 No. of records with gender 1 in cluster 25 is 23 No. of records with gender 2 in cluster 25 is 38 Records found in cluster 26 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 4498 0.0 1.0000 26 6783 2.0 1.0000 26 10814 0.0 1.0000 26 14468 1.0 1.0000 26 14630 1.0 1.0000 26 14664 2.0 1.0000 26 14804 1.0 1.0000 26 15040 1.0 1.0000 26 15267 1.0 0.6608 26 16204 1.0 1.0000 26 No. of records with gender 0 in cluster 26 is 2 No. of records with gender 1 in cluster 26 is 6 No. of records with gender 2 in cluster 26 is 2 Records found in cluster 27 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 4572 1.0 1.0000 27 4606 0.0 1.0000 27 4627 2.0 1.0000 27 4690 0.0 0.6763 27 4746 1.0 1.0000 27 ... ... ... ... 8052 0.0 0.7050 27 8391 0.0 1.0000 27 8411 1.0 1.0000 27 18789 0.0 1.0000 27 18803 1.0 1.0000 27 [148 rows x 3 columns] No. of records with gender 0 in cluster 27 is 46 No. of records with gender 1 in cluster 27 is 36 No. of records with gender 2 in cluster 27 is 66 Records found in cluster 28 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 4772 2.0 1.0000 28 4789 2.0 1.0000 28 4853 0.0 1.0000 28 4917 1.0 0.6571 28 4949 2.0 1.0000 28 ... ... ... ... 9206 2.0 0.3398 28 9215 1.0 0.6818 28 9253 2.0 1.0000 28 9278 1.0 1.0000 28 9294 0.0 1.0000 28 [127 rows x 3 columns] No. of records with gender 0 in cluster 28 is 31 No. of records with gender 1 in cluster 28 is 31 No. of records with gender 2 in cluster 28 is 65 Records found in cluster 29 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 4965 2.0 0.6695 29 5384 2.0 1.0000 29 5485 2.0 1.0000 29 5683 2.0 1.0000 29 5800 1.0 1.0000 29 7510 0.0 1.0000 29 8081 2.0 1.0000 29 8479 2.0 0.3625 29 8557 0.0 1.0000 29 8655 1.0 1.0000 29 8987 2.0 1.0000 29 9070 0.0 1.0000 29 9289 2.0 1.0000 29 9313 2.0 0.6841 29 10058 2.0 1.0000 29 10070 1.0 1.0000 29 10084 0.0 1.0000 29 10092 1.0 1.0000 29 10102 2.0 1.0000 29 10116 2.0 1.0000 29 10131 0.0 1.0000 29 10143 2.0 1.0000 29 10167 1.0 0.3495 29 11175 0.0 1.0000 29 No. of records with gender 0 in cluster 29 is 6 No. of records with gender 1 in cluster 29 is 5 No. of records with gender 2 in cluster 29 is 13 Records found in cluster 30 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 4995 2.0 1.0000 30 5372 2.0 1.0000 30 5627 2.0 0.6559 30 5919 2.0 1.0000 30 6208 1.0 0.6543 30 6496 2.0 0.6716 30 7060 1.0 0.6890 30 7439 0.0 1.0000 30 7683 1.0 0.6699 30 7894 0.0 1.0000 30 7902 0.0 1.0000 30 8408 0.0 1.0000 30 8933 1.0 1.0000 30 10448 2.0 0.6544 30 No. of records with gender 0 in cluster 30 is 4 No. of records with gender 1 in cluster 30 is 4 No. of records with gender 2 in cluster 30 is 6 Records found in cluster 31 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 5147 1.0 1.0000 31 17729 0.0 1.0000 31 17730 0.0 1.0000 31 17731 2.0 1.0000 31 17733 0.0 1.0000 31 17734 1.0 1.0000 31 17737 0.0 1.0000 31 17739 1.0 1.0000 31 17741 0.0 1.0000 31 17761 0.0 1.0000 31 17766 1.0 1.0000 31 17767 2.0 0.6637 31 17768 1.0 1.0000 31 17823 2.0 1.0000 31 17827 1.0 0.6773 31 17874 2.0 1.0000 31 17875 1.0 1.0000 31 17898 1.0 1.0000 31 17901 0.0 0.6676 31 17928 0.0 1.0000 31 17931 0.0 1.0000 31 17932 1.0 1.0000 31 17933 1.0 0.6807 31 17956 0.0 1.0000 31 17962 2.0 1.0000 31 17969 0.0 1.0000 31 17974 0.0 1.0000 31 17975 1.0 1.0000 31 17991 1.0 1.0000 31 18042 1.0 1.0000 31 18047 1.0 1.0000 31 18049 2.0 1.0000 31 18052 0.0 0.6660 31 18055 1.0 1.0000 31 18057 1.0 0.6557 31 18062 1.0 1.0000 31 18117 0.0 0.6664 31 18118 0.0 1.0000 31 18124 0.0 1.0000 31 18170 0.0 1.0000 31 18175 1.0 1.0000 31 18215 2.0 0.6545 31 18218 1.0 1.0000 31 18228 1.0 1.0000 31 18229 0.0 0.6827 31 18230 2.0 1.0000 31 18231 0.0 1.0000 31 18233 1.0 0.3352 31 18236 0.0 1.0000 31 18354 0.0 1.0000 31 18368 1.0 1.0000 31 18371 2.0 1.0000 31 18373 0.0 1.0000 31 18374 1.0 1.0000 31 No. of records with gender 0 in cluster 31 is 22 No. of records with gender 1 in cluster 31 is 23 No. of records with gender 2 in cluster 31 is 9 Records found in cluster 32 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 5206 1.0 1.0000 32 5629 2.0 1.0000 32 5640 0.0 1.0000 32 5944 1.0 1.0000 32 6093 1.0 0.6653 32 6157 2.0 0.6567 32 6174 2.0 0.6619 32 6409 0.0 1.0000 32 6514 1.0 1.0000 32 13356 1.0 1.0000 32 No. of records with gender 0 in cluster 32 is 2 No. of records with gender 1 in cluster 32 is 5 No. of records with gender 2 in cluster 32 is 3 Records found in cluster 33 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 5665 1.0 1.0000 33 6109 0.0 1.0000 33 6206 1.0 1.0000 33 6381 1.0 1.0000 33 6390 1.0 1.0000 33 6502 0.0 1.0000 33 6576 2.0 1.0000 33 6580 2.0 1.0000 33 6664 2.0 1.0000 33 6685 2.0 1.0000 33 6789 2.0 1.0000 33 6858 1.0 1.0000 33 6876 0.0 1.0000 33 6992 2.0 1.0000 33 7040 1.0 1.0000 33 7043 0.0 1.0000 33 7065 2.0 1.0000 33 7109 1.0 1.0000 33 7148 0.0 0.6750 33 7273 1.0 1.0000 33 7399 0.0 0.3272 33 7421 2.0 0.6802 33 7430 2.0 0.6812 33 7440 0.0 1.0000 33 7581 1.0 1.0000 33 7586 0.0 1.0000 33 7611 0.0 0.6666 33 7614 2.0 0.6866 33 7622 2.0 1.0000 33 7626 0.0 1.0000 33 7655 1.0 1.0000 33 7669 2.0 1.0000 33 7679 1.0 1.0000 33 7705 1.0 1.0000 33 7757 2.0 1.0000 33 7793 0.0 0.6691 33 7817 0.0 1.0000 33 7820 0.0 1.0000 33 7827 2.0 0.3472 33 7888 2.0 0.6506 33 7897 0.0 0.6803 33 7959 0.0 0.6823 33 8033 0.0 0.6701 33 8055 0.0 1.0000 33 8062 1.0 1.0000 33 8118 1.0 1.0000 33 8177 2.0 1.0000 33 8251 0.0 0.6624 33 8358 2.0 0.6965 33 8385 1.0 1.0000 33 8466 0.0 1.0000 33 8470 1.0 1.0000 33 No. of records with gender 0 in cluster 33 is 19 No. of records with gender 1 in cluster 33 is 16 No. of records with gender 2 in cluster 33 is 17 Records found in cluster 34 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 5853 2.0 0.6619 34 6244 2.0 1.0000 34 8255 2.0 0.6672 34 9773 0.0 0.6607 34 10211 1.0 1.0000 34 10698 1.0 0.6795 34 12736 1.0 0.6619 34 14216 1.0 1.0000 34 14307 2.0 0.6617 34 15333 1.0 1.0000 34 15424 0.0 0.6608 34 15800 1.0 1.0000 34 16873 1.0 1.0000 34 17596 1.0 1.0000 34 18337 1.0 1.0000 34 No. of records with gender 0 in cluster 34 is 2 No. of records with gender 1 in cluster 34 is 9 No. of records with gender 2 in cluster 34 is 4 Records found in cluster 35 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 6080 1.0 1.0000 35 7002 2.0 1.0000 35 7016 0.0 1.0000 35 7091 1.0 0.6642 35 7095 2.0 1.0000 35 ... ... ... ... 9150 1.0 1.0000 35 9165 0.0 1.0000 35 9216 2.0 0.6519 35 9221 2.0 1.0000 35 9243 0.0 0.3506 35 [62 rows x 3 columns] No. of records with gender 0 in cluster 35 is 13 No. of records with gender 1 in cluster 35 is 20 No. of records with gender 2 in cluster 35 is 29 Records found in cluster 36 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 7289 0.0 1.0000 36 12796 1.0 1.0000 36 13303 1.0 1.0000 36 13417 1.0 1.0000 36 13502 1.0 1.0000 36 13716 1.0 0.6830 36 13901 2.0 0.6611 36 14140 0.0 0.6645 36 14214 2.0 1.0000 36 14269 2.0 0.6868 36 14337 1.0 1.0000 36 14412 1.0 1.0000 36 14483 0.0 1.0000 36 14645 1.0 1.0000 36 15443 2.0 1.0000 36 15534 0.0 1.0000 36 15807 0.0 1.0000 36 15916 1.0 1.0000 36 16188 1.0 1.0000 36 16418 2.0 1.0000 36 16672 1.0 1.0000 36 16725 1.0 1.0000 36 17269 0.0 1.0000 36 17351 1.0 0.6556 36 17442 1.0 1.0000 36 17842 0.0 1.0000 36 18412 2.0 0.6690 36 18510 1.0 1.0000 36 18731 1.0 1.0000 36 18738 2.0 1.0000 36 No. of records with gender 0 in cluster 36 is 7 No. of records with gender 1 in cluster 36 is 16 No. of records with gender 2 in cluster 36 is 7 Records found in cluster 37 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 7381 2.0 1.0000 37 7470 1.0 0.6810 37 7542 0.0 1.0000 37 7616 2.0 0.6675 37 7675 2.0 1.0000 37 7744 2.0 0.6761 37 7795 1.0 0.6602 37 7871 2.0 1.0000 37 7946 1.0 1.0000 37 8010 1.0 1.0000 37 8069 1.0 1.0000 37 8125 1.0 1.0000 37 8180 1.0 0.6850 37 8253 2.0 1.0000 37 8395 1.0 1.0000 37 8477 1.0 1.0000 37 8532 1.0 1.0000 37 8587 2.0 1.0000 37 8657 1.0 1.0000 37 8755 0.0 0.6707 37 8810 0.0 1.0000 37 8906 1.0 0.7047 37 8977 1.0 1.0000 37 9039 1.0 1.0000 37 9101 0.0 0.3496 37 9172 0.0 1.0000 37 9247 2.0 0.6622 37 9317 0.0 1.0000 37 17122 2.0 0.6583 37 No. of records with gender 0 in cluster 37 is 6 No. of records with gender 1 in cluster 37 is 14 No. of records with gender 2 in cluster 37 is 9 Records found in cluster 38 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 9515 0.0 0.6648 38 10396 1.0 1.0000 38 10608 1.0 1.0000 38 10796 0.0 0.6912 38 10981 0.0 1.0000 38 11477 2.0 1.0000 38 11770 2.0 1.0000 38 12451 2.0 1.0000 38 12803 1.0 0.6667 38 12996 1.0 1.0000 38 13263 2.0 0.6743 38 13436 0.0 1.0000 38 14141 0.0 1.0000 38 14290 0.0 1.0000 38 14473 0.0 1.0000 38 14878 2.0 0.6502 38 15088 0.0 0.6581 38 15727 2.0 1.0000 38 16605 0.0 0.6578 38 16973 0.0 1.0000 38 17197 1.0 1.0000 38 17330 0.0 1.0000 38 17728 1.0 0.6702 38 18071 2.0 1.0000 38 18531 2.0 1.0000 38 No. of records with gender 0 in cluster 38 is 11 No. of records with gender 1 in cluster 38 is 6 No. of records with gender 2 in cluster 38 is 8 Records found in cluster 39 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 9856 2.0 1.0000 39 10008 0.0 1.0000 39 10075 1.0 1.0000 39 10150 1.0 1.0000 39 10237 2.0 1.0000 39 10318 2.0 1.0000 39 10385 1.0 0.3592 39 10471 1.0 1.0000 39 10633 2.0 0.6545 39 10716 0.0 0.6794 39 10776 0.0 1.0000 39 10849 2.0 1.0000 39 10964 2.0 1.0000 39 11050 0.0 1.0000 39 11118 2.0 0.6666 39 11190 1.0 1.0000 39 11251 1.0 0.6715 39 11356 2.0 1.0000 39 11429 2.0 1.0000 39 11502 2.0 1.0000 39 11590 0.0 1.0000 39 11653 0.0 1.0000 39 11767 2.0 1.0000 39 11842 1.0 1.0000 39 11930 1.0 1.0000 39 12045 1.0 1.0000 39 12132 1.0 0.6858 39 12195 0.0 0.6564 39 12284 1.0 1.0000 39 12397 0.0 1.0000 39 12507 2.0 1.0000 39 12659 2.0 1.0000 39 12754 2.0 0.6615 39 No. of records with gender 0 in cluster 39 is 8 No. of records with gender 1 in cluster 39 is 11 No. of records with gender 2 in cluster 39 is 14 Records found in cluster 40 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 10710 1.0 1.0000 40 11127 2.0 1.0000 40 11929 0.0 1.0000 40 12857 0.0 1.0000 40 12921 1.0 1.0000 40 12962 1.0 1.0000 40 13047 2.0 0.6509 40 13110 0.0 0.6414 40 13132 0.0 0.6527 40 13159 2.0 1.0000 40 13221 1.0 1.0000 40 13254 0.0 0.6618 40 13289 0.0 0.6711 40 17886 2.0 1.0000 40 No. of records with gender 0 in cluster 40 is 6 No. of records with gender 1 in cluster 40 is 4 No. of records with gender 2 in cluster 40 is 4 Records found in cluster 41 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 10888 1.0 1.0000 41 12233 2.0 0.3374 41 13608 1.0 1.0000 41 14053 0.0 1.0000 41 14500 2.0 0.3449 41 15128 1.0 1.0000 41 15717 2.0 1.0000 41 16776 2.0 1.0000 41 No. of records with gender 0 in cluster 41 is 1 No. of records with gender 1 in cluster 41 is 3 No. of records with gender 2 in cluster 41 is 4 Records found in cluster 42 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 16388 2.0 1.0000 42 17041 1.0 1.0000 42 17154 1.0 1.0000 42 17297 0.0 1.0000 42 17565 1.0 1.0000 42 17677 1.0 1.0000 42 17868 2.0 0.3354 42 18092 0.0 1.0000 42 18246 1.0 1.0000 42 18399 0.0 1.0000 42 18527 1.0 1.0000 42 18646 0.0 1.0000 42 18759 0.0 0.6386 42 No. of records with gender 0 in cluster 42 is 5 No. of records with gender 1 in cluster 42 is 6 No. of records with gender 2 in cluster 42 is 2 Records found in cluster 43 from DBSCAN in Exp 1 gender gender:confidence Cluster_Label 17732 2.0 0.3417 43 17735 0.0 1.0000 43 17736 0.0 1.0000 43 17738 1.0 1.0000 43 17740 1.0 1.0000 43 ... ... ... ... 18367 2.0 1.0000 43 18369 2.0 1.0000 43 18370 0.0 0.6591 43 18372 2.0 1.0000 43 18375 0.0 1.0000 43 [98 rows x 3 columns] No. of records with gender 0 in cluster 43 is 44 No. of records with gender 1 in cluster 43 is 35 No. of records with gender 2 in cluster 43 is 19 Records classified as noise Empty DataFrame Columns: [gender, gender:confidence, Cluster_Label] Index: [] ================================================== EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES ================================================== Data with Only Numerical and Categorical Features <class 'pandas.core.frame.DataFrame'> Index: 19970 entries, 0 to 18833 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 retweets_per_day 18836 non-null float64 1 favorites_per_day 18836 non-null float64 2 tweets_per_day 18836 non-null float64 3 profile_created_year 18836 non-null float64 4 tweet_created_year 18836 non-null float64 5 tweet_location_encoded 18836 non-null float64 6 user_timezone_encoded 18836 non-null float64 7 gender 18836 non-null float64 8 gender:confidence 18836 non-null float64 dtypes: float64(9) memory usage: 1.5 MB None Removing NaN values... Dropping gender and gender:confidence... Dataset for Exp 2 <class 'pandas.core.frame.DataFrame'> Index: 17702 entries, 0 to 18835 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 retweets_per_day 17702 non-null float64 1 favorites_per_day 17702 non-null float64 2 tweets_per_day 17702 non-null float64 3 profile_created_year 17702 non-null float64 4 tweet_created_year 17702 non-null float64 5 tweet_location_encoded 17702 non-null float64 6 user_timezone_encoded 17702 non-null float64 dtypes: float64(7) memory usage: 1.1 MB None retweets_per_day favorites_per_day tweets_per_day profile_created_year \ 0 -0.100504 -0.318861 1.467429 0.497680 1 -0.100504 -0.313379 -0.582882 0.028171 2 9.949874 0.437997 -0.593862 0.967189 3 -0.100504 -0.306100 -0.691862 -1.380358 4 -0.100504 3.133457 -0.075048 0.967189 tweet_created_year tweet_location_encoded user_timezone_encoded 0 0.0 0.000053 0.001699 1 0.0 0.363294 0.127309 2 0.0 0.000053 0.002071 3 0.0 0.000159 0.105755 4 0.0 0.363294 0.381344 Applying UMAP for dim reduction...
[I 2024-09-20 16:23:48,010] A new study created in memory with name: no-name-f9550dca-d1e6-48ba-a676-9e841b5672d2
(17702, 3) Performing K-Means Clustering...
[I 2024-09-20 16:23:51,598] Trial 0 finished with value: 0.3289636639097889 and parameters: {'n_clusters': 7, 'init': 'k-means++'}. Best is trial 0 with value: 0.3289636639097889. [I 2024-09-20 16:23:55,202] Trial 1 finished with value: 0.37002461668723785 and parameters: {'n_clusters': 3, 'init': 'k-means++'}. Best is trial 1 with value: 0.37002461668723785. [I 2024-09-20 16:23:58,694] Trial 2 finished with value: 0.34236426949671833 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 1 with value: 0.37002461668723785. [I 2024-09-20 16:24:02,334] Trial 3 finished with value: 0.42740275911790543 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 3 with value: 0.42740275911790543. [I 2024-09-20 16:24:05,980] Trial 4 finished with value: 0.3327138746593811 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 3 with value: 0.42740275911790543. [I 2024-09-20 16:24:09,807] Trial 5 finished with value: 0.43672661408383706 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 5 with value: 0.43672661408383706. [I 2024-09-20 16:24:13,417] Trial 6 finished with value: 0.39882111670484854 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 5 with value: 0.43672661408383706. [I 2024-09-20 16:24:17,670] Trial 7 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645. [I 2024-09-20 16:24:21,464] Trial 8 finished with value: 0.4278636679375091 and parameters: {'n_clusters': 5, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645. [I 2024-09-20 16:24:25,371] Trial 9 finished with value: 0.39882111670484854 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 7 with value: 0.7076370412066645. [I 2024-09-20 16:24:29,537] Trial 10 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645. [I 2024-09-20 16:24:33,153] Trial 11 finished with value: 0.3527991920640622 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645. [I 2024-09-20 16:24:37,183] Trial 12 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645. [I 2024-09-20 16:24:41,210] Trial 13 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645. [I 2024-09-20 16:24:44,823] Trial 14 finished with value: 0.35844394490438636 and parameters: {'n_clusters': 10, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
Best params: {'n_clusters': 2, 'init': 'random'}
[I 2024-09-20 16:24:49,788] A new study created in memory with name: no-name-9347cc24-2c6f-4ad3-b792-5f023401ccdb
The Silhouette score is 0.7076370412066645 The Callinski index is 4482.755124226919 Dataset with Labels from KMeans in Exp 2 gender gender:confidence Cluster_Label 0 0.0 1.0000 1 1 0.0 1.0000 1 2 0.0 0.6625 0 3 0.0 1.0000 1 4 1.0 1.0000 1 Records found in cluster 0 from KMeans in Exp 2 gender gender:confidence Cluster_Label 2 0.0 0.6625 0 257 1.0 1.0000 0 286 2.0 1.0000 0 392 2.0 0.6576 0 429 1.0 1.0000 0 ... ... ... ... 18649 0.0 1.0000 0 18720 0.0 1.0000 0 18765 1.0 1.0000 0 18784 2.0 1.0000 0 18796 0.0 0.6760 0 [371 rows x 3 columns] No. of records with gender 0 in cluster 0 is 135 No. of records with gender 1 in cluster 0 is 103 No. of records with gender 2 in cluster 0 is 133 Records found in cluster 1 from KMeans in Exp 2 gender gender:confidence Cluster_Label 0 0.0 1.0000 1 1 0.0 1.0000 1 3 0.0 1.0000 1 4 1.0 1.0000 1 5 1.0 1.0000 1 ... ... ... ... 18829 1.0 1.0000 1 18831 0.0 0.6466 1 18832 1.0 1.0000 1 18834 1.0 1.0000 1 18835 0.0 0.6772 1 [17331 rows x 3 columns] No. of records with gender 0 in cluster 1 is 5708 No. of records with gender 1 in cluster 1 is 6098 No. of records with gender 2 in cluster 1 is 5525 Performing DBSCAN Clustering...
[I 2024-09-20 16:24:59,626] Trial 0 finished with value: 0.7395551461504506 and parameters: {'eps': 1.8612774559273246, 'min_samples': 17}. Best is trial 0 with value: 0.7395551461504506. [I 2024-09-20 16:25:09,593] Trial 1 finished with value: 0.7395551461504506 and parameters: {'eps': 1.8222959595129316, 'min_samples': 16}. Best is trial 0 with value: 0.7395551461504506. [I 2024-09-20 16:25:19,610] Trial 2 finished with value: 0.7286394402690954 and parameters: {'eps': 1.4969908447691442, 'min_samples': 15}. Best is trial 0 with value: 0.7395551461504506. [I 2024-09-20 16:25:27,226] Trial 3 finished with value: 0.5245068941307643 and parameters: {'eps': 0.572252884608168, 'min_samples': 14}. Best is trial 0 with value: 0.7395551461504506. [I 2024-09-20 16:25:37,404] Trial 4 finished with value: 0.7535863974003295 and parameters: {'eps': 1.9348801547784897, 'min_samples': 14}. Best is trial 4 with value: 0.7535863974003295. [I 2024-09-20 16:25:47,748] Trial 5 finished with value: 0.7378543022519933 and parameters: {'eps': 1.6937724424520773, 'min_samples': 17}. Best is trial 4 with value: 0.7535863974003295. [I 2024-09-20 16:25:57,605] Trial 6 finished with value: 0.7283183555413514 and parameters: {'eps': 1.8007992794733476, 'min_samples': 5}. Best is trial 4 with value: 0.7535863974003295. [I 2024-09-20 16:26:05,851] Trial 7 finished with value: 0.41821414951083047 and parameters: {'eps': 0.7293348370290341, 'min_samples': 7}. Best is trial 4 with value: 0.7535863974003295. [I 2024-09-20 16:26:14,792] Trial 8 finished with value: 0.7014382428001366 and parameters: {'eps': 1.0030585304662154, 'min_samples': 9}. Best is trial 4 with value: 0.7535863974003295. [I 2024-09-20 16:26:24,590] Trial 9 finished with value: 0.7561451190368602 and parameters: {'eps': 1.871623880808503, 'min_samples': 8}. Best is trial 9 with value: 0.7561451190368602. [I 2024-09-20 16:26:33,948] Trial 10 finished with value: 0.5090875645314461 and parameters: {'eps': 1.2834097282566614, 'min_samples': 3}. Best is trial 9 with value: 0.7561451190368602. [I 2024-09-20 16:26:43,291] Trial 11 finished with value: 0.7175191411766557 and parameters: {'eps': 1.2828100669180271, 'min_samples': 11}. Best is trial 9 with value: 0.7561451190368602. [I 2024-09-20 16:26:53,530] Trial 12 finished with value: 0.7582461095915987 and parameters: {'eps': 1.9873182476645224, 'min_samples': 12}. Best is trial 12 with value: 0.7582461095915987. [I 2024-09-20 16:26:58,050] Trial 13 finished with value: -0.4771117854083832 and parameters: {'eps': 0.133931940094827, 'min_samples': 11}. Best is trial 12 with value: 0.7582461095915987. [I 2024-09-20 16:27:07,751] Trial 14 finished with value: 0.727389258590315 and parameters: {'eps': 1.5756102900269209, 'min_samples': 20}. Best is trial 12 with value: 0.7582461095915987.
Found best params: {'eps': 1.9873182476645224, 'min_samples': 12}
The Silhouette score is 0.7582461095915987 The Callinski index is 336.17121436944564 Dataset with Labels from DBSCAN in Exp 2 gender gender:confidence Cluster_Label 0 0.0 1.0000 0 1 0.0 1.0000 0 2 0.0 0.6625 0 3 0.0 1.0000 0 4 1.0 1.0000 0 Records found in cluster 0 from DBSCAN in Exp 2 gender gender:confidence Cluster_Label 0 0.0 1.0000 0 1 0.0 1.0000 0 2 0.0 0.6625 0 3 0.0 1.0000 0 4 1.0 1.0000 0 ... ... ... ... 18829 1.0 1.0000 0 18831 0.0 0.6466 0 18832 1.0 1.0000 0 18834 1.0 1.0000 0 18835 0.0 0.6772 0 [17677 rows x 3 columns] No. of records with gender 0 in cluster 0 is 5832 No. of records with gender 1 in cluster 0 is 6195 No. of records with gender 2 in cluster 0 is 5650 Records classified as noise gender gender:confidence Cluster_Label 1116 2.0 1.0000 -1 2115 0.0 1.0000 -1 2502 0.0 0.6785 -1 2869 2.0 0.6489 -1 3301 0.0 1.0000 -1 4127 2.0 1.0000 -1 4150 1.0 1.0000 -1 5613 1.0 1.0000 -1 6722 1.0 1.0000 -1 7666 2.0 1.0000 -1 9210 0.0 1.0000 -1 10926 0.0 0.6513 -1 12010 0.0 1.0000 -1 12504 0.0 1.0000 -1 12668 0.0 1.0000 -1 13204 1.0 1.0000 -1 13331 1.0 1.0000 -1 13788 1.0 1.0000 -1 14567 2.0 1.0000 -1 15940 0.0 1.0000 -1 16326 2.0 0.3515 -1 17960 0.0 1.0000 -1 18012 0.0 1.0000 -1 18585 2.0 1.0000 -1 18763 2.0 1.0000 -1 ================================================== EXP 3: USING ONLY TEXT FEATURES ================================================== Dataset for Exp 3 <class 'pandas.core.frame.DataFrame'> Index: 17702 entries, 0 to 18835 Columns: 3000 entries, desc_0 to text_1499 dtypes: float64(3000) memory usage: 405.3 MB None desc_0 desc_1 desc_2 desc_3 desc_4 desc_5 desc_6 desc_7 desc_8 \ 0 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0 desc_9 ... text_1490 text_1491 text_1492 text_1493 text_1494 \ 0 0.0 ... -0.142855 0.0 0.0 0.0 0.0 1 0.0 ... -0.142855 0.0 0.0 0.0 0.0 2 0.0 ... -0.142855 0.0 0.0 0.0 0.0 3 0.0 ... -0.142855 0.0 0.0 0.0 0.0 4 0.0 ... -0.142855 0.0 0.0 0.0 0.0 text_1495 text_1496 text_1497 text_1498 text_1499 0 -0.142733 -0.100504 0.0 0.0 0.0 1 -0.142733 -0.100504 0.0 0.0 0.0 2 -0.142733 -0.100504 0.0 0.0 0.0 3 -0.142733 -0.100504 0.0 0.0 0.0 4 -0.142733 -0.100504 0.0 0.0 0.0 [5 rows x 3000 columns] Applying UMAP for dim reduction...
[I 2024-09-20 16:30:24,130] A new study created in memory with name: no-name-36a03b0d-863a-4fa2-a02e-19e458c477ae
Performing K-Means Clustering...
[I 2024-09-20 16:30:29,152] Trial 0 finished with value: 0.3791390061378479 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 0 with value: 0.3791390061378479. [I 2024-09-20 16:30:34,447] Trial 1 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396. [I 2024-09-20 16:30:39,434] Trial 2 finished with value: 0.37424296140670776 and parameters: {'n_clusters': 9, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396. [I 2024-09-20 16:30:44,536] Trial 3 finished with value: 0.3678858280181885 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396. [I 2024-09-20 16:30:49,831] Trial 4 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:30:54,927] Trial 5 finished with value: 0.4103359282016754 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:00,242] Trial 6 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:05,516] Trial 7 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:10,792] Trial 8 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:15,863] Trial 9 finished with value: 0.3678858280181885 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:21,412] Trial 10 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:26,834] Trial 11 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:32,286] Trial 12 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:37,423] Trial 13 finished with value: 0.35118553042411804 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605. [I 2024-09-20 16:31:42,827] Trial 14 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
Best params: {'n_clusters': 4, 'init': 'random'}
The Silhouette score is 0.718830943107605 The Callinski index is 10019.619140625 Dataset with Labels from KMeans in Exp 3 gender gender:confidence Cluster_Label 0 0.0 1.0000 2 1 0.0 1.0000 2 2 0.0 0.6625 2 3 0.0 1.0000 2 4 1.0 1.0000 2 Records found in cluster 0 from KMeans in Exp 3 gender gender:confidence Cluster_Label 42 2.0 1.0000 0 62 1.0 1.0000 0 166 0.0 1.0000 0 173 2.0 1.0000 0 190 2.0 0.6780 0 ... ... ... ... 18624 1.0 1.0000 0 18654 0.0 1.0000 0 18656 1.0 1.0000 0 18673 0.0 1.0000 0 18722 1.0 0.3371 0 [865 rows x 3 columns] No. of records with gender 0 in cluster 0 is 307 No. of records with gender 1 in cluster 0 is 272 No. of records with gender 2 in cluster 0 is 286 Records found in cluster 1 from KMeans in Exp 3 gender gender:confidence Cluster_Label 113 1.0 1.0000 1 230 1.0 0.6755 1 502 0.0 1.0000 1 578 1.0 1.0000 1 644 0.0 1.0000 1 ... ... ... ... 17448 0.0 1.0000 1 18208 0.0 1.0000 1 18679 1.0 1.0000 1 18753 0.0 0.6678 1 18824 2.0 1.0000 1 [469 rows x 3 columns] No. of records with gender 0 in cluster 1 is 122 No. of records with gender 1 in cluster 1 is 146 No. of records with gender 2 in cluster 1 is 201 Records found in cluster 2 from KMeans in Exp 3 gender gender:confidence Cluster_Label 0 0.0 1.0000 2 1 0.0 1.0000 2 2 0.0 0.6625 2 3 0.0 1.0000 2 4 1.0 1.0000 2 ... ... ... ... 18829 1.0 1.0000 2 18831 0.0 0.6466 2 18832 1.0 1.0000 2 18834 1.0 1.0000 2 18835 0.0 0.6772 2 [15810 rows x 3 columns] No. of records with gender 0 in cluster 2 is 5260 No. of records with gender 1 in cluster 2 is 5607 No. of records with gender 2 in cluster 2 is 4943 Records found in cluster 3 from KMeans in Exp 3
[I 2024-09-20 16:31:50,732] A new study created in memory with name: no-name-3c57886b-d112-4b57-9312-be650d49f11f
gender gender:confidence Cluster_Label 261 1.0 1.0 3 336 0.0 1.0 3 575 0.0 1.0 3 929 1.0 1.0 3 1172 0.0 1.0 3 ... ... ... ... 18510 1.0 1.0 3 18609 1.0 1.0 3 18731 1.0 1.0 3 18738 2.0 1.0 3 18764 1.0 1.0 3 [558 rows x 3 columns] No. of records with gender 0 in cluster 3 is 154 No. of records with gender 1 in cluster 3 is 176 No. of records with gender 2 in cluster 3 is 228 Performing DBSCAN Clustering...
[I 2024-09-20 16:31:56,283] Trial 0 finished with value: 0.07267794013023376 and parameters: {'eps': 0.224350065881816, 'min_samples': 12}. Best is trial 0 with value: 0.07267794013023376. [I 2024-09-20 16:32:03,308] Trial 1 finished with value: 0.5760640501976013 and parameters: {'eps': 1.981346472478528, 'min_samples': 13}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:32:09,091] Trial 2 finished with value: 0.456826776266098 and parameters: {'eps': 0.5994244424252012, 'min_samples': 10}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:32:15,888] Trial 3 finished with value: 0.5654299259185791 and parameters: {'eps': 1.6306995770185833, 'min_samples': 12}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:32:21,989] Trial 4 finished with value: 0.40619421005249023 and parameters: {'eps': 0.9888580285943404, 'min_samples': 7}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:32:27,401] Trial 5 finished with value: 0.0672125294804573 and parameters: {'eps': 0.2212632309988171, 'min_samples': 15}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:32:33,226] Trial 6 finished with value: 0.3273886442184448 and parameters: {'eps': 0.8245746726811352, 'min_samples': 5}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:32:39,816] Trial 7 finished with value: 0.5654299259185791 and parameters: {'eps': 1.5980627491197157, 'min_samples': 11}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:32:46,496] Trial 8 finished with value: 0.5481722950935364 and parameters: {'eps': 1.6967788919940698, 'min_samples': 10}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:32:53,073] Trial 9 finished with value: 0.45212018489837646 and parameters: {'eps': 1.5229068478428347, 'min_samples': 9}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:33:00,076] Trial 10 finished with value: 0.5555833578109741 and parameters: {'eps': 1.9667767292648453, 'min_samples': 20}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:33:06,982] Trial 11 finished with value: 0.5760640501976013 and parameters: {'eps': 1.9662951357354617, 'min_samples': 14}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:33:13,834] Trial 12 finished with value: 0.5699435472488403 and parameters: {'eps': 1.964896277269623, 'min_samples': 16}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:33:20,152] Trial 13 finished with value: 0.563732922077179 and parameters: {'eps': 1.3014195484895565, 'min_samples': 15}. Best is trial 1 with value: 0.5760640501976013. [I 2024-09-20 16:33:26,409] Trial 14 finished with value: 0.5307610034942627 and parameters: {'eps': 1.271279456944655, 'min_samples': 18}. Best is trial 1 with value: 0.5760640501976013.
Found best params: {'eps': 1.981346472478528, 'min_samples': 13}
The Silhouette score is 0.5760640501976013 The Callinski index is 1357.848876953125 Dataset with Labels from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 0 0.0 1.0000 0 1 0.0 1.0000 0 2 0.0 0.6625 0 3 0.0 1.0000 0 4 1.0 1.0000 0 Records found in cluster 0 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 0 0.0 1.0000 0 1 0.0 1.0000 0 2 0.0 0.6625 0 3 0.0 1.0000 0 4 1.0 1.0000 0 ... ... ... ... 18829 1.0 1.0000 0 18831 0.0 0.6466 0 18832 1.0 1.0000 0 18834 1.0 1.0000 0 18835 0.0 0.6772 0 [15963 rows x 3 columns] No. of records with gender 0 in cluster 0 is 5340 No. of records with gender 1 in cluster 0 is 5665 No. of records with gender 2 in cluster 0 is 4958 Records found in cluster 1 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 42 2.0 1.000 1 190 2.0 0.678 1 211 2.0 1.000 1 252 2.0 1.000 1 255 1.0 1.000 1 ... ... ... ... 18546 1.0 1.000 1 18573 0.0 1.000 1 18584 1.0 1.000 1 18624 1.0 1.000 1 18656 1.0 1.000 1 [148 rows x 3 columns] No. of records with gender 0 in cluster 1 is 44 No. of records with gender 1 in cluster 1 is 52 No. of records with gender 2 in cluster 1 is 52 Records found in cluster 2 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 113 1.0 1.0000 2 6301 2.0 1.0000 2 6302 2.0 1.0000 2 6309 1.0 0.3750 2 6311 2.0 1.0000 2 6318 1.0 1.0000 2 6319 0.0 0.6471 2 6327 2.0 0.6733 2 6332 0.0 1.0000 2 6358 2.0 0.6692 2 6366 2.0 0.6662 2 6373 2.0 1.0000 2 6374 2.0 1.0000 2 6378 0.0 1.0000 2 6381 1.0 1.0000 2 6383 2.0 0.6754 2 6389 0.0 1.0000 2 6390 1.0 1.0000 2 6391 1.0 1.0000 2 6393 2.0 1.0000 2 6397 1.0 1.0000 2 6398 2.0 1.0000 2 6399 1.0 1.0000 2 8850 0.0 1.0000 2 11402 0.0 1.0000 2 12450 1.0 1.0000 2 13813 0.0 1.0000 2 No. of records with gender 0 in cluster 2 is 7 No. of records with gender 1 in cluster 2 is 9 No. of records with gender 2 in cluster 2 is 11 Records found in cluster 3 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 230 1.0 0.6755 3 7500 1.0 1.0000 3 7502 1.0 0.6617 3 7505 0.0 1.0000 3 7507 0.0 0.6848 3 7508 0.0 1.0000 3 7509 1.0 1.0000 3 7510 0.0 1.0000 3 7511 2.0 1.0000 3 7512 1.0 0.6739 3 7513 0.0 1.0000 3 7524 1.0 1.0000 3 7531 2.0 1.0000 3 7532 2.0 1.0000 3 7534 2.0 1.0000 3 7581 1.0 1.0000 3 7586 0.0 1.0000 3 7593 2.0 1.0000 3 7596 0.0 1.0000 3 7598 2.0 1.0000 3 12002 0.0 1.0000 3 No. of records with gender 0 in cluster 3 is 8 No. of records with gender 1 in cluster 3 is 7 No. of records with gender 2 in cluster 3 is 6 Records found in cluster 4 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 261 1.0 1.0000 4 336 0.0 1.0000 4 929 1.0 1.0000 4 1172 0.0 1.0000 4 1455 1.0 0.6678 4 1686 0.0 1.0000 4 3378 2.0 0.6688 4 3521 2.0 1.0000 4 3544 2.0 1.0000 4 5605 1.0 1.0000 4 5611 2.0 0.6856 4 5616 2.0 1.0000 4 5625 0.0 1.0000 4 5626 2.0 0.6589 4 5632 2.0 0.6651 4 5643 0.0 1.0000 4 5644 1.0 0.6725 4 5661 2.0 1.0000 4 5665 1.0 1.0000 4 5669 2.0 1.0000 4 5670 1.0 0.6752 4 5671 2.0 0.3424 4 5672 2.0 1.0000 4 5673 0.0 0.6761 4 5674 1.0 1.0000 4 5675 2.0 1.0000 4 5679 2.0 0.6816 4 5681 1.0 1.0000 4 5683 2.0 1.0000 4 5685 0.0 1.0000 4 5686 2.0 1.0000 4 5687 2.0 0.6799 4 5689 2.0 0.6805 4 5696 0.0 1.0000 4 5697 0.0 0.6892 4 9387 2.0 1.0000 4 10074 1.0 0.6741 4 10109 1.0 1.0000 4 10453 2.0 1.0000 4 10729 2.0 1.0000 4 10792 0.0 1.0000 4 10928 2.0 0.6605 4 11125 0.0 1.0000 4 11755 1.0 1.0000 4 11820 1.0 1.0000 4 13497 2.0 1.0000 4 14339 1.0 1.0000 4 14581 1.0 1.0000 4 14835 1.0 1.0000 4 15275 1.0 1.0000 4 15841 1.0 1.0000 4 15900 1.0 1.0000 4 15985 2.0 1.0000 4 16529 1.0 1.0000 4 17107 0.0 0.6597 4 17846 1.0 1.0000 4 18764 1.0 1.0000 4 No. of records with gender 0 in cluster 4 is 12 No. of records with gender 1 in cluster 4 is 22 No. of records with gender 2 in cluster 4 is 23 Records found in cluster 5 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 264 0.0 1.0000 5 2740 0.0 1.0000 5 4012 1.0 1.0000 5 4097 0.0 0.6706 5 4100 2.0 1.0000 5 ... ... ... ... 16986 0.0 1.0000 5 17182 0.0 1.0000 5 18083 0.0 1.0000 5 18789 0.0 1.0000 5 18803 1.0 1.0000 5 [131 rows x 3 columns] No. of records with gender 0 in cluster 5 is 48 No. of records with gender 1 in cluster 5 is 33 No. of records with gender 2 in cluster 5 is 50 Records found in cluster 6 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 348 1.0 1.0000 6 3814 0.0 1.0000 6 8923 2.0 1.0000 6 8925 0.0 1.0000 6 8927 1.0 1.0000 6 8930 2.0 1.0000 6 8940 2.0 0.6815 6 8943 2.0 1.0000 6 8944 2.0 0.6641 6 8945 0.0 1.0000 6 8947 2.0 1.0000 6 8948 2.0 1.0000 6 8951 0.0 0.6752 6 8952 1.0 0.6734 6 8953 1.0 1.0000 6 8954 2.0 1.0000 6 8965 2.0 1.0000 6 8971 1.0 1.0000 6 8981 1.0 1.0000 6 8987 2.0 1.0000 6 8988 0.0 1.0000 6 8989 1.0 1.0000 6 8990 2.0 1.0000 6 8991 2.0 0.6728 6 8995 2.0 0.6761 6 8997 0.0 1.0000 6 15702 1.0 0.6739 6 16019 0.0 1.0000 6 16293 0.0 1.0000 6 16469 2.0 0.6755 6 No. of records with gender 0 in cluster 6 is 8 No. of records with gender 1 in cluster 6 is 8 No. of records with gender 2 in cluster 6 is 14 Records found in cluster 7 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 431 0.0 0.6631 7 4374 2.0 1.0000 7 4456 1.0 1.0000 7 4653 2.0 1.0000 7 4995 2.0 1.0000 7 5220 2.0 0.6650 7 5372 2.0 1.0000 7 5749 2.0 1.0000 7 6043 2.0 0.6787 7 6172 2.0 1.0000 7 6208 1.0 0.6543 7 6496 2.0 0.6716 7 6669 0.0 1.0000 7 7060 1.0 0.6890 7 7261 0.0 1.0000 7 7439 0.0 1.0000 7 7683 1.0 0.6699 7 7902 0.0 1.0000 7 8120 1.0 1.0000 7 8360 2.0 0.6854 7 8408 0.0 1.0000 7 9100 0.0 1.0000 7 9333 1.0 1.0000 7 10448 2.0 0.6544 7 10820 0.0 0.6635 7 12961 1.0 1.0000 7 13252 1.0 1.0000 7 13603 1.0 1.0000 7 14102 0.0 1.0000 7 14844 0.0 1.0000 7 15017 1.0 1.0000 7 No. of records with gender 0 in cluster 7 is 10 No. of records with gender 1 in cluster 7 is 10 No. of records with gender 2 in cluster 7 is 11 Records found in cluster 8 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 502 0.0 1.0000 8 578 1.0 1.0000 8 644 0.0 1.0000 8 771 0.0 1.0000 8 963 2.0 1.0000 8 1433 1.0 1.0000 8 1881 0.0 0.6691 8 2762 2.0 0.6670 8 2903 1.0 0.6763 8 3308 0.0 0.3364 8 3353 0.0 1.0000 8 3681 2.0 1.0000 8 3830 0.0 1.0000 8 4305 1.0 1.0000 8 5040 0.0 1.0000 8 5479 0.0 0.6857 8 5742 0.0 1.0000 8 6460 2.0 1.0000 8 6862 1.0 1.0000 8 8397 2.0 0.6634 8 8516 2.0 0.6839 8 8918 2.0 1.0000 8 No. of records with gender 0 in cluster 8 is 10 No. of records with gender 1 in cluster 8 is 5 No. of records with gender 2 in cluster 8 is 7 Records found in cluster 9 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 575 0.0 1.0000 9 1308 0.0 0.6479 9 2033 1.0 1.0000 9 2308 1.0 0.6774 9 3898 0.0 1.0000 9 5454 2.0 0.6774 9 5539 1.0 1.0000 9 5628 2.0 1.0000 9 5825 1.0 1.0000 9 5847 2.0 0.6717 9 6012 0.0 1.0000 9 6048 2.0 0.6796 9 6114 1.0 0.6620 9 6335 2.0 1.0000 9 6382 2.0 0.6842 9 6417 2.0 1.0000 9 7843 2.0 1.0000 9 8181 0.0 1.0000 9 8355 2.0 0.6778 9 8738 0.0 1.0000 9 No. of records with gender 0 in cluster 9 is 6 No. of records with gender 1 in cluster 9 is 5 No. of records with gender 2 in cluster 9 is 9 Records found in cluster 10 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 625 1.0 1.0000 10 7101 2.0 1.0000 10 7102 0.0 1.0000 10 7105 0.0 1.0000 10 7109 1.0 1.0000 10 7113 2.0 0.6718 10 7115 0.0 0.3451 10 7123 0.0 1.0000 10 7128 2.0 0.6585 10 7130 2.0 1.0000 10 7136 1.0 0.6835 10 7148 0.0 0.6750 10 7153 1.0 1.0000 10 7158 1.0 1.0000 10 7162 1.0 1.0000 10 7166 2.0 0.6635 10 7176 1.0 1.0000 10 7184 2.0 1.0000 10 No. of records with gender 0 in cluster 10 is 5 No. of records with gender 1 in cluster 10 is 7 No. of records with gender 2 in cluster 10 is 6 Records found in cluster 11 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 780 1.0 1.0000 11 2335 0.0 1.0000 11 4874 0.0 1.0000 11 5800 1.0 1.0000 11 5807 1.0 1.0000 11 5809 0.0 1.0000 11 5810 1.0 1.0000 11 5819 2.0 0.6667 11 5835 2.0 1.0000 11 5838 2.0 1.0000 11 5841 2.0 0.6645 11 5843 0.0 0.6658 11 5846 2.0 1.0000 11 5849 0.0 0.6792 11 5861 2.0 0.6808 11 5862 0.0 1.0000 11 5868 1.0 1.0000 11 5869 1.0 1.0000 11 5870 0.0 0.3441 11 5877 1.0 1.0000 11 5881 2.0 1.0000 11 5883 2.0 0.6725 11 5885 2.0 0.6640 11 5894 1.0 1.0000 11 5898 2.0 0.6675 11 8449 2.0 1.0000 11 10879 0.0 1.0000 11 18679 1.0 1.0000 11 No. of records with gender 0 in cluster 11 is 8 No. of records with gender 1 in cluster 11 is 9 No. of records with gender 2 in cluster 11 is 11 Records found in cluster 12 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 792 2.0 1.0000 12 7804 1.0 1.0000 12 7810 2.0 1.0000 12 7811 2.0 0.6341 12 7817 0.0 1.0000 12 7819 0.0 1.0000 12 7820 0.0 1.0000 12 7821 2.0 1.0000 12 7822 2.0 1.0000 12 7824 2.0 1.0000 12 7825 0.0 1.0000 12 7827 2.0 0.3472 12 7830 1.0 1.0000 12 7882 0.0 1.0000 12 7888 2.0 0.6506 12 7890 2.0 1.0000 12 7892 2.0 1.0000 12 7897 0.0 0.6803 12 7899 1.0 1.0000 12 8203 2.0 1.0000 12 8204 2.0 0.6746 12 8208 2.0 0.6844 12 8236 0.0 1.0000 12 8246 2.0 0.6598 12 8247 1.0 1.0000 12 8250 1.0 1.0000 12 8251 0.0 0.6624 12 8261 1.0 1.0000 12 8264 0.0 1.0000 12 8269 0.0 0.6774 12 8272 2.0 1.0000 12 8284 2.0 0.6691 12 8488 1.0 1.0000 12 13379 0.0 1.0000 12 No. of records with gender 0 in cluster 12 is 11 No. of records with gender 1 in cluster 12 is 7 No. of records with gender 2 in cluster 12 is 16 Records found in cluster 13 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 1203 1.0 1.0000 13 1240 1.0 0.6889 13 2115 0.0 1.0000 13 2381 0.0 1.0000 13 3988 2.0 1.0000 13 5994 2.0 0.6611 13 7988 1.0 0.6734 13 8071 1.0 1.0000 13 10735 0.0 1.0000 13 10738 0.0 1.0000 13 11076 2.0 1.0000 13 11179 2.0 1.0000 13 11484 1.0 1.0000 13 11648 1.0 1.0000 13 11746 0.0 1.0000 13 12054 1.0 1.0000 13 13078 0.0 1.0000 13 14056 2.0 1.0000 13 15064 0.0 0.6534 13 15751 1.0 1.0000 13 15757 1.0 1.0000 13 16465 0.0 1.0000 13 16868 1.0 1.0000 13 17448 0.0 1.0000 13 18208 0.0 1.0000 13 18753 0.0 0.6678 13 No. of records with gender 0 in cluster 13 is 11 No. of records with gender 1 in cluster 13 is 10 No. of records with gender 2 in cluster 13 is 5 Records found in cluster 14 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 1273 0.0 1.0000 14 1605 2.0 1.0000 14 1761 2.0 1.0000 14 1845 1.0 1.0000 14 1987 1.0 1.0000 14 2274 0.0 1.0000 14 3961 0.0 1.0000 14 4092 0.0 0.3411 14 4424 2.0 1.0000 14 5218 2.0 1.0000 14 5336 1.0 1.0000 14 5445 0.0 1.0000 14 6262 2.0 1.0000 14 6289 1.0 1.0000 14 7003 1.0 1.0000 14 7118 2.0 1.0000 14 7431 1.0 1.0000 14 7540 0.0 0.6859 14 7791 1.0 1.0000 14 8142 2.0 1.0000 14 8601 2.0 0.6700 14 8693 0.0 1.0000 14 9023 1.0 0.6654 14 9265 1.0 1.0000 14 No. of records with gender 0 in cluster 14 is 7 No. of records with gender 1 in cluster 14 is 9 No. of records with gender 2 in cluster 14 is 8 Records found in cluster 15 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 1474 1.0 0.3390 15 1582 1.0 1.0000 15 1940 2.0 0.6675 15 3133 0.0 1.0000 15 3252 0.0 1.0000 15 ... ... ... ... 14750 1.0 1.0000 15 15816 1.0 1.0000 15 17319 1.0 1.0000 15 17504 0.0 0.6567 15 18609 1.0 1.0000 15 [103 rows x 3 columns] No. of records with gender 0 in cluster 15 is 34 No. of records with gender 1 in cluster 15 is 31 No. of records with gender 2 in cluster 15 is 38 Records found in cluster 16 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 1646 0.0 0.6576 16 1868 0.0 1.0000 16 3803 0.0 1.0000 16 3962 2.0 1.0000 16 5400 0.0 1.0000 16 5401 2.0 0.6836 16 5407 2.0 0.6785 16 5408 2.0 1.0000 16 5409 0.0 1.0000 16 5412 2.0 1.0000 16 5427 1.0 1.0000 16 5429 0.0 1.0000 16 5433 2.0 0.6736 16 5434 1.0 1.0000 16 5436 2.0 0.6602 16 5442 1.0 0.3409 16 5443 2.0 0.6483 16 5447 1.0 1.0000 16 5448 2.0 0.6654 16 5449 1.0 1.0000 16 5456 0.0 1.0000 16 5457 2.0 0.6468 16 5466 2.0 1.0000 16 5470 2.0 1.0000 16 5471 0.0 1.0000 16 5472 0.0 1.0000 16 5480 1.0 1.0000 16 5485 2.0 1.0000 16 5486 1.0 1.0000 16 5487 1.0 0.6669 16 5490 2.0 1.0000 16 5491 2.0 1.0000 16 7364 1.0 1.0000 16 9547 2.0 1.0000 16 17851 2.0 0.6495 16 No. of records with gender 0 in cluster 16 is 9 No. of records with gender 1 in cluster 16 is 9 No. of records with gender 2 in cluster 16 is 17 Records found in cluster 17 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 1673 1.0 1.0000 17 2702 1.0 1.0000 17 7600 0.0 1.0000 17 7601 2.0 0.6609 17 7611 0.0 0.6666 17 7613 2.0 1.0000 17 7614 2.0 0.6866 17 7615 2.0 1.0000 17 7620 1.0 0.6549 17 7621 1.0 1.0000 17 7622 2.0 1.0000 17 7626 0.0 1.0000 17 7627 0.0 0.7037 17 7629 2.0 1.0000 17 7652 0.0 0.6772 17 7655 1.0 1.0000 17 7662 0.0 1.0000 17 7665 2.0 0.6832 17 7667 0.0 1.0000 17 7669 2.0 1.0000 17 7670 1.0 1.0000 17 7672 2.0 1.0000 17 7679 1.0 1.0000 17 7680 1.0 1.0000 17 7681 1.0 1.0000 17 7686 2.0 1.0000 17 7694 2.0 1.0000 17 7697 1.0 1.0000 17 16509 1.0 1.0000 17 No. of records with gender 0 in cluster 17 is 7 No. of records with gender 1 in cluster 17 is 11 No. of records with gender 2 in cluster 17 is 11 Records found in cluster 18 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 2046 0.0 0.6531 18 3257 2.0 1.0000 18 7002 2.0 1.0000 18 7016 0.0 1.0000 18 7017 2.0 0.6646 18 7033 1.0 1.0000 18 7040 1.0 1.0000 18 7043 0.0 1.0000 18 7048 2.0 1.0000 18 7052 2.0 0.6595 18 7053 2.0 1.0000 18 7058 1.0 1.0000 18 7062 0.0 1.0000 18 7065 2.0 1.0000 18 7087 2.0 0.6671 18 7091 1.0 0.6642 18 7095 2.0 1.0000 18 7096 2.0 0.6782 18 7097 2.0 0.6788 18 8775 1.0 0.6609 18 No. of records with gender 0 in cluster 18 is 4 No. of records with gender 1 in cluster 18 is 5 No. of records with gender 2 in cluster 18 is 11 Records found in cluster 19 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 2135 2.0 1.0000 19 3978 1.0 1.0000 19 5034 0.0 1.0000 19 5208 0.0 1.0000 19 5364 2.0 1.0000 19 5513 0.0 1.0000 19 5677 1.0 1.0000 19 5817 0.0 1.0000 19 5929 1.0 1.0000 19 6085 0.0 1.0000 19 6257 2.0 0.6874 19 6679 1.0 1.0000 19 6819 2.0 0.6537 19 7029 0.0 1.0000 19 7121 0.0 1.0000 19 9044 1.0 1.0000 19 No. of records with gender 0 in cluster 19 is 7 No. of records with gender 1 in cluster 19 is 5 No. of records with gender 2 in cluster 19 is 4 Records found in cluster 20 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 2138 1.0 1.0000 20 2145 0.0 1.0000 20 2146 1.0 1.0000 20 2147 1.0 1.0000 20 2148 1.0 0.3576 20 2156 0.0 1.0000 20 2166 1.0 1.0000 20 2168 0.0 0.6825 20 2169 1.0 1.0000 20 2171 1.0 1.0000 20 2172 0.0 1.0000 20 2182 2.0 1.0000 20 2185 0.0 1.0000 20 2186 0.0 0.3403 20 2187 1.0 1.0000 20 2188 2.0 0.6812 20 2189 0.0 0.6582 20 2191 0.0 1.0000 20 2194 1.0 1.0000 20 2196 1.0 1.0000 20 2204 1.0 0.6587 20 2205 0.0 0.6685 20 2206 1.0 0.6551 20 2207 1.0 1.0000 20 2210 1.0 1.0000 20 2216 1.0 0.6896 20 2217 1.0 0.6832 20 2220 1.0 1.0000 20 2223 2.0 1.0000 20 14626 0.0 1.0000 20 No. of records with gender 0 in cluster 20 is 10 No. of records with gender 1 in cluster 20 is 17 No. of records with gender 2 in cluster 20 is 3 Records found in cluster 21 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 2240 0.0 1.0000 21 3269 1.0 0.3394 21 9001 1.0 1.0000 21 9020 0.0 1.0000 21 9028 1.0 0.6849 21 9033 0.0 1.0000 21 9038 1.0 0.6667 21 9043 1.0 1.0000 21 9046 2.0 0.6745 21 9050 1.0 0.6658 21 9052 2.0 0.6826 21 9054 1.0 1.0000 21 9055 1.0 1.0000 21 9056 2.0 1.0000 21 9061 0.0 1.0000 21 9064 2.0 1.0000 21 9069 2.0 0.6595 21 9070 0.0 1.0000 21 9072 1.0 0.6774 21 9076 2.0 1.0000 21 9079 0.0 1.0000 21 9080 1.0 0.6532 21 9081 0.0 1.0000 21 9082 0.0 1.0000 21 9083 0.0 1.0000 21 9089 1.0 1.0000 21 9952 2.0 0.3548 21 14813 1.0 0.6875 21 15564 0.0 1.0000 21 18157 1.0 1.0000 21 No. of records with gender 0 in cluster 21 is 10 No. of records with gender 1 in cluster 21 is 13 No. of records with gender 2 in cluster 21 is 7 Records found in cluster 22 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 2512 1.0 1.0000 22 8502 2.0 1.0000 22 8505 1.0 1.0000 22 8506 2.0 1.0000 22 8507 0.0 1.0000 22 8520 2.0 0.6820 22 8525 0.0 1.0000 22 8528 1.0 1.0000 22 8531 2.0 0.6681 22 8535 2.0 1.0000 22 8540 2.0 1.0000 22 8541 2.0 1.0000 22 8542 2.0 1.0000 22 8546 1.0 1.0000 22 8553 2.0 1.0000 22 8554 0.0 1.0000 22 8557 0.0 1.0000 22 8562 0.0 1.0000 22 8563 1.0 1.0000 22 8564 2.0 1.0000 22 8565 0.0 0.6862 22 8568 1.0 1.0000 22 8580 2.0 1.0000 22 8583 0.0 1.0000 22 8586 0.0 0.6453 22 13204 1.0 1.0000 22 16912 1.0 0.6483 22 16945 0.0 1.0000 22 No. of records with gender 0 in cluster 22 is 9 No. of records with gender 1 in cluster 22 is 8 No. of records with gender 2 in cluster 22 is 11 Records found in cluster 23 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 2730 1.0 1.0000 23 3086 0.0 1.0000 23 5506 2.0 0.6595 23 5511 1.0 1.0000 23 5524 0.0 0.6722 23 5541 0.0 1.0000 23 5542 2.0 1.0000 23 5544 1.0 0.3374 23 5546 1.0 1.0000 23 5552 2.0 1.0000 23 5558 2.0 1.0000 23 5559 2.0 0.6745 23 5560 1.0 1.0000 23 5561 0.0 1.0000 23 5563 2.0 1.0000 23 5564 2.0 1.0000 23 5566 1.0 0.6607 23 5570 2.0 1.0000 23 5572 1.0 1.0000 23 5579 1.0 1.0000 23 5583 2.0 1.0000 23 5588 0.0 0.6795 23 5597 0.0 1.0000 23 5598 0.0 1.0000 23 6067 1.0 1.0000 23 10803 0.0 1.0000 23 12037 0.0 1.0000 23 12202 0.0 1.0000 23 14307 2.0 0.6617 23 16093 0.0 0.3575 23 17031 1.0 1.0000 23 17498 1.0 1.0000 23 No. of records with gender 0 in cluster 23 is 11 No. of records with gender 1 in cluster 23 is 11 No. of records with gender 2 in cluster 23 is 10 Records found in cluster 24 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 2928 2.0 0.6734 24 7703 1.0 1.0000 24 7705 1.0 1.0000 24 7727 2.0 1.0000 24 7738 2.0 1.0000 24 7743 0.0 1.0000 24 7745 1.0 1.0000 24 7746 2.0 1.0000 24 7747 2.0 0.6745 24 7748 2.0 1.0000 24 7751 2.0 1.0000 24 7752 1.0 0.6649 24 7757 2.0 1.0000 24 7759 2.0 1.0000 24 7760 2.0 1.0000 24 7761 1.0 1.0000 24 7793 0.0 0.6691 24 7797 2.0 0.6600 24 10622 1.0 0.6692 24 No. of records with gender 0 in cluster 24 is 2 No. of records with gender 1 in cluster 24 is 6 No. of records with gender 2 in cluster 24 is 11 Records found in cluster 25 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 3581 0.0 1.0000 25 3705 2.0 0.6581 25 3809 2.0 1.0000 25 3906 1.0 0.6422 25 4041 0.0 1.0000 25 4156 1.0 1.0000 25 4272 2.0 1.0000 25 4341 0.0 1.0000 25 4410 2.0 1.0000 25 4508 1.0 1.0000 25 4631 2.0 1.0000 25 4736 2.0 1.0000 25 4840 2.0 1.0000 25 5305 1.0 1.0000 25 No. of records with gender 0 in cluster 25 is 3 No. of records with gender 1 in cluster 25 is 4 No. of records with gender 2 in cluster 25 is 7 Records found in cluster 26 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 3744 0.0 0.6440 26 3927 0.0 1.0000 26 3994 1.0 1.0000 26 4057 2.0 0.3516 26 4300 2.0 0.6736 26 4398 1.0 1.0000 26 4470 2.0 0.6602 26 4544 0.0 1.0000 26 4640 2.0 1.0000 26 4800 2.0 0.6575 26 4883 2.0 1.0000 26 5043 1.0 1.0000 26 5238 1.0 1.0000 26 5325 1.0 0.6645 26 5515 2.0 1.0000 26 5659 1.0 1.0000 26 5978 2.0 1.0000 26 6188 2.0 0.6748 26 6440 2.0 1.0000 26 6562 0.0 1.0000 26 6671 2.0 1.0000 26 6749 1.0 1.0000 26 6826 2.0 0.6933 26 7050 0.0 0.6736 26 No. of records with gender 0 in cluster 26 is 5 No. of records with gender 1 in cluster 26 is 7 No. of records with gender 2 in cluster 26 is 12 Records found in cluster 27 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 4093 2.0 1.0000 27 4485 2.0 1.0000 27 4893 2.0 1.0000 27 6095 2.0 0.6602 27 6412 2.0 1.0000 27 7079 2.0 1.0000 27 8501 0.0 1.0000 27 8968 2.0 1.0000 27 9965 0.0 1.0000 27 10058 2.0 1.0000 27 10070 1.0 1.0000 27 10084 0.0 1.0000 27 10092 1.0 1.0000 27 10102 2.0 1.0000 27 10116 2.0 1.0000 27 10131 0.0 1.0000 27 10143 2.0 1.0000 27 10167 1.0 0.3495 27 10256 0.0 1.0000 27 10658 1.0 1.0000 27 11280 0.0 1.0000 27 14155 0.0 1.0000 27 14888 1.0 1.0000 27 No. of records with gender 0 in cluster 27 is 7 No. of records with gender 1 in cluster 27 is 5 No. of records with gender 2 in cluster 27 is 11 Records found in cluster 28 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 4606 0.0 1.0000 28 4608 0.0 0.6618 28 4615 2.0 0.6590 28 4621 1.0 1.0000 28 4627 2.0 1.0000 28 4643 0.0 1.0000 28 4657 2.0 0.6751 28 4664 1.0 1.0000 28 4674 2.0 1.0000 28 4675 2.0 1.0000 28 4685 2.0 1.0000 28 4690 0.0 0.6763 28 4691 0.0 1.0000 28 4710 2.0 1.0000 28 4712 0.0 1.0000 28 4717 2.0 1.0000 28 4720 2.0 1.0000 28 4722 2.0 0.6686 28 4731 1.0 1.0000 28 4743 2.0 1.0000 28 4746 1.0 1.0000 28 4772 2.0 1.0000 28 4778 1.0 0.3592 28 4780 2.0 1.0000 28 4781 2.0 0.6475 28 4782 1.0 0.6697 28 4783 2.0 1.0000 28 4785 2.0 0.6811 28 4789 2.0 1.0000 28 4790 1.0 1.0000 28 4798 2.0 0.6736 28 4799 0.0 1.0000 28 6627 2.0 1.0000 28 6629 1.0 1.0000 28 6633 0.0 1.0000 28 6650 2.0 1.0000 28 6654 1.0 1.0000 28 6660 2.0 1.0000 28 6664 2.0 1.0000 28 6665 1.0 1.0000 28 6668 0.0 1.0000 28 6670 0.0 1.0000 28 6678 1.0 1.0000 28 6685 2.0 1.0000 28 6688 2.0 1.0000 28 11370 2.0 1.0000 28 No. of records with gender 0 in cluster 28 is 10 No. of records with gender 1 in cluster 28 is 11 No. of records with gender 2 in cluster 28 is 25 Records found in cluster 29 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 4906 2.0 0.6681 29 4908 0.0 1.0000 29 4909 2.0 1.0000 29 4910 0.0 1.0000 29 4912 1.0 1.0000 29 4917 1.0 0.6571 29 4918 0.0 1.0000 29 4923 2.0 1.0000 29 4924 2.0 0.6585 29 4929 1.0 1.0000 29 4934 1.0 0.6571 29 4937 2.0 1.0000 29 4944 1.0 0.6711 29 4949 2.0 1.0000 29 4950 1.0 1.0000 29 4951 1.0 1.0000 29 4961 0.0 1.0000 29 4962 1.0 1.0000 29 4965 2.0 0.6695 29 4967 0.0 1.0000 29 4968 1.0 1.0000 29 4970 0.0 1.0000 29 4973 1.0 1.0000 29 4990 1.0 1.0000 29 4997 2.0 0.6957 29 4999 2.0 0.6884 29 13476 2.0 0.6742 29 16183 0.0 1.0000 29 18336 1.0 1.0000 29 No. of records with gender 0 in cluster 29 is 7 No. of records with gender 1 in cluster 29 is 12 No. of records with gender 2 in cluster 29 is 10 Records found in cluster 30 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 5002 1.0 1.0000 30 5007 0.0 1.0000 30 5014 1.0 1.0000 30 5017 0.0 1.0000 30 5021 1.0 1.0000 30 5030 0.0 1.0000 30 5049 1.0 0.6787 30 5065 1.0 1.0000 30 5069 2.0 0.6832 30 5075 2.0 0.6692 30 5084 2.0 1.0000 30 5086 0.0 1.0000 30 5088 2.0 1.0000 30 5090 2.0 1.0000 30 5094 1.0 1.0000 30 5095 2.0 0.6848 30 11521 2.0 0.6792 30 17014 2.0 1.0000 30 No. of records with gender 0 in cluster 30 is 4 No. of records with gender 1 in cluster 30 is 6 No. of records with gender 2 in cluster 30 is 8 Records found in cluster 31 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 5100 0.0 1.0000 31 5120 2.0 1.0000 31 5123 2.0 0.6741 31 5136 0.0 1.0000 31 5149 2.0 1.0000 31 5153 2.0 0.6735 31 5156 2.0 0.6516 31 5161 0.0 1.0000 31 5170 2.0 0.6606 31 5175 2.0 1.0000 31 5176 1.0 1.0000 31 5180 1.0 1.0000 31 5181 2.0 1.0000 31 5182 0.0 0.6801 31 5185 2.0 0.6822 31 5187 0.0 1.0000 31 5192 2.0 0.6835 31 No. of records with gender 0 in cluster 31 is 5 No. of records with gender 1 in cluster 31 is 2 No. of records with gender 2 in cluster 31 is 10 Records found in cluster 32 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 5200 0.0 1.0000 32 5203 1.0 1.0000 32 5205 1.0 0.6748 32 5209 1.0 1.0000 32 5211 0.0 0.6738 32 5217 0.0 1.0000 32 5227 1.0 1.0000 32 5232 1.0 1.0000 32 5234 1.0 1.0000 32 5242 1.0 1.0000 32 5256 2.0 0.6475 32 5262 0.0 0.6457 32 5264 0.0 1.0000 32 5265 1.0 1.0000 32 5266 0.0 1.0000 32 5270 2.0 1.0000 32 5271 2.0 0.6812 32 5272 2.0 1.0000 32 5284 1.0 0.6815 32 5289 0.0 1.0000 32 5291 2.0 0.6333 32 5297 0.0 1.0000 32 10620 1.0 1.0000 32 13921 2.0 0.6771 32 18824 2.0 1.0000 32 No. of records with gender 0 in cluster 32 is 8 No. of records with gender 1 in cluster 32 is 10 No. of records with gender 2 in cluster 32 is 7 Records found in cluster 33 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 5705 1.0 1.0000 33 5709 2.0 0.6860 33 5711 2.0 1.0000 33 5712 1.0 1.0000 33 5726 2.0 0.6735 33 5746 2.0 0.3410 33 5752 2.0 0.6747 33 5754 1.0 1.0000 33 5757 1.0 1.0000 33 5766 2.0 1.0000 33 5767 2.0 1.0000 33 5768 1.0 0.3631 33 5770 2.0 1.0000 33 5773 2.0 0.6769 33 5777 2.0 0.6638 33 5782 1.0 1.0000 33 5786 2.0 1.0000 33 5790 0.0 1.0000 33 5792 2.0 1.0000 33 5793 2.0 0.6675 33 5794 2.0 1.0000 33 5798 2.0 1.0000 33 10582 2.0 0.6383 33 11935 2.0 1.0000 33 15021 0.0 1.0000 33 16688 0.0 1.0000 33 No. of records with gender 0 in cluster 33 is 3 No. of records with gender 1 in cluster 33 is 6 No. of records with gender 2 in cluster 33 is 17 Records found in cluster 34 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 5901 2.0 1.0000 34 5902 0.0 0.6462 34 5904 0.0 1.0000 34 5910 0.0 0.6787 34 5914 2.0 1.0000 34 5930 0.0 0.6512 34 5932 0.0 1.0000 34 5934 2.0 1.0000 34 5935 2.0 1.0000 34 5936 2.0 0.6836 34 5945 2.0 1.0000 34 5952 0.0 1.0000 34 5954 0.0 1.0000 34 5956 2.0 1.0000 34 5961 2.0 1.0000 34 5962 1.0 1.0000 34 5963 0.0 1.0000 34 5964 1.0 1.0000 34 5965 2.0 0.6764 34 5966 2.0 0.6842 34 5973 2.0 0.6509 34 5986 0.0 1.0000 34 5989 2.0 1.0000 34 5990 0.0 0.6713 34 16757 1.0 1.0000 34 No. of records with gender 0 in cluster 34 is 10 No. of records with gender 1 in cluster 34 is 3 No. of records with gender 2 in cluster 34 is 12 Records found in cluster 35 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 6101 1.0 0.6543 35 6102 0.0 0.6699 35 6103 0.0 1.0000 35 6109 0.0 1.0000 35 6129 2.0 0.6778 35 6131 0.0 1.0000 35 6133 0.0 0.6655 35 6134 0.0 1.0000 35 6147 2.0 0.6540 35 6149 0.0 1.0000 35 6151 2.0 0.6642 35 6156 2.0 1.0000 35 6158 1.0 1.0000 35 6164 1.0 1.0000 35 6167 2.0 0.6742 35 6169 2.0 0.6866 35 6178 1.0 1.0000 35 6180 1.0 1.0000 35 6190 0.0 1.0000 35 6192 2.0 0.6652 35 6197 1.0 0.6513 35 No. of records with gender 0 in cluster 35 is 8 No. of records with gender 1 in cluster 35 is 6 No. of records with gender 2 in cluster 35 is 7 Records found in cluster 36 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 6502 0.0 1.0000 36 6505 2.0 1.0000 36 6516 0.0 1.0000 36 6521 2.0 1.0000 36 6523 1.0 1.0000 36 6540 2.0 1.0000 36 6549 2.0 1.0000 36 6555 2.0 1.0000 36 6556 1.0 1.0000 36 6559 0.0 1.0000 36 6560 0.0 1.0000 36 6565 2.0 0.6534 36 6567 2.0 1.0000 36 6569 2.0 1.0000 36 6575 1.0 1.0000 36 6576 2.0 1.0000 36 6577 2.0 1.0000 36 6579 2.0 0.6762 36 6580 2.0 1.0000 36 6581 1.0 1.0000 36 6583 2.0 1.0000 36 6596 1.0 1.0000 36 6599 2.0 1.0000 36 12899 2.0 1.0000 36 No. of records with gender 0 in cluster 36 is 4 No. of records with gender 1 in cluster 36 is 5 No. of records with gender 2 in cluster 36 is 15 Records found in cluster 37 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 6722 1.0 1.0000 37 6726 0.0 1.0000 37 6728 2.0 0.6634 37 6730 2.0 0.6681 37 6732 1.0 0.6882 37 6742 2.0 0.6625 37 6758 0.0 0.3469 37 6759 1.0 0.6543 37 6772 2.0 1.0000 37 6786 2.0 0.6694 37 6787 2.0 1.0000 37 6788 2.0 1.0000 37 6789 2.0 1.0000 37 6793 1.0 0.6699 37 6795 2.0 0.6741 37 No. of records with gender 0 in cluster 37 is 2 No. of records with gender 1 in cluster 37 is 4 No. of records with gender 2 in cluster 37 is 9 Records found in cluster 38 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 7210 0.0 0.6617 38 7215 2.0 1.0000 38 7216 2.0 0.6921 38 7228 2.0 0.6766 38 7230 1.0 1.0000 38 7234 0.0 1.0000 38 7250 2.0 1.0000 38 7258 1.0 0.6902 38 7259 0.0 1.0000 38 7260 2.0 1.0000 38 7266 2.0 1.0000 38 7273 1.0 1.0000 38 7277 0.0 0.3487 38 7284 0.0 0.6661 38 7288 2.0 1.0000 38 7297 2.0 0.6853 38 No. of records with gender 0 in cluster 38 is 5 No. of records with gender 1 in cluster 38 is 3 No. of records with gender 2 in cluster 38 is 8 Records found in cluster 39 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 7289 0.0 1.0000 39 12796 1.0 1.0000 39 13303 1.0 1.0000 39 13417 1.0 1.0000 39 13502 1.0 1.0000 39 13716 1.0 0.6830 39 13901 2.0 0.6611 39 14140 0.0 0.6645 39 14214 2.0 1.0000 39 14269 2.0 0.6868 39 14337 1.0 1.0000 39 14412 1.0 1.0000 39 14483 0.0 1.0000 39 14645 1.0 1.0000 39 15443 2.0 1.0000 39 15534 0.0 1.0000 39 15807 0.0 1.0000 39 15916 1.0 1.0000 39 16188 1.0 1.0000 39 16418 2.0 1.0000 39 16672 1.0 1.0000 39 16725 1.0 1.0000 39 17269 0.0 1.0000 39 17351 1.0 0.6556 39 17442 1.0 1.0000 39 17842 0.0 1.0000 39 18412 2.0 0.6690 39 18510 1.0 1.0000 39 18731 1.0 1.0000 39 18738 2.0 1.0000 39 No. of records with gender 0 in cluster 39 is 7 No. of records with gender 1 in cluster 39 is 16 No. of records with gender 2 in cluster 39 is 7 Records found in cluster 40 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 7381 2.0 1.0000 40 7470 1.0 0.6810 40 7542 0.0 1.0000 40 7616 2.0 0.6675 40 7675 2.0 1.0000 40 ... ... ... ... 15207 1.0 1.0000 40 15391 2.0 1.0000 40 15439 2.0 1.0000 40 15622 2.0 1.0000 40 17122 2.0 0.6583 40 [98 rows x 3 columns] No. of records with gender 0 in cluster 40 is 25 No. of records with gender 1 in cluster 40 is 38 No. of records with gender 2 in cluster 40 is 35 Records found in cluster 41 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 7416 2.0 1.0000 41 7417 1.0 1.0000 41 7418 2.0 1.0000 41 7421 2.0 0.6802 41 7429 2.0 1.0000 41 7430 2.0 0.6812 41 7434 2.0 1.0000 41 7440 0.0 1.0000 41 7441 1.0 1.0000 41 7442 2.0 1.0000 41 7448 0.0 1.0000 41 7458 2.0 1.0000 41 7459 2.0 1.0000 41 7496 2.0 0.6703 41 7497 0.0 0.6799 41 No. of records with gender 0 in cluster 41 is 3 No. of records with gender 1 in cluster 41 is 2 No. of records with gender 2 in cluster 41 is 10 Records found in cluster 42 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 7900 2.0 1.0000 42 7908 2.0 1.0000 42 7910 2.0 1.0000 42 7914 2.0 1.0000 42 7933 1.0 1.0000 42 7953 0.0 1.0000 42 7956 1.0 1.0000 42 7958 1.0 1.0000 42 7959 0.0 0.6823 42 7963 2.0 1.0000 42 7964 2.0 1.0000 42 7966 0.0 0.6607 42 7967 2.0 0.6737 42 7968 2.0 1.0000 42 7973 0.0 1.0000 42 7975 0.0 1.0000 42 7976 0.0 1.0000 42 7977 2.0 0.6739 42 7980 2.0 1.0000 42 7987 0.0 1.0000 42 7991 1.0 1.0000 42 7999 2.0 0.6726 42 No. of records with gender 0 in cluster 42 is 7 No. of records with gender 1 in cluster 42 is 4 No. of records with gender 2 in cluster 42 is 11 Records found in cluster 43 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 8024 2.0 1.0000 43 8033 0.0 0.6701 43 8039 1.0 1.0000 43 8046 2.0 1.0000 43 8050 2.0 1.0000 43 8052 0.0 0.7050 43 8055 0.0 1.0000 43 8057 1.0 1.0000 43 8058 2.0 1.0000 43 8059 1.0 1.0000 43 8062 1.0 1.0000 43 8063 1.0 1.0000 43 8065 1.0 0.6688 43 8067 2.0 0.3442 43 8068 1.0 1.0000 43 8070 1.0 0.6698 43 8078 0.0 1.0000 43 8081 2.0 1.0000 43 8085 0.0 1.0000 43 8097 0.0 1.0000 43 16604 1.0 1.0000 43 No. of records with gender 0 in cluster 43 is 6 No. of records with gender 1 in cluster 43 is 9 No. of records with gender 2 in cluster 43 is 6 Records found in cluster 44 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 8109 1.0 1.0000 44 8112 0.0 1.0000 44 8113 2.0 0.6675 44 8116 2.0 0.6611 44 8118 1.0 1.0000 44 8122 2.0 0.6623 44 8123 2.0 0.6605 44 8128 0.0 1.0000 44 8132 2.0 0.6665 44 8146 1.0 1.0000 44 8159 2.0 1.0000 44 8165 0.0 1.0000 44 8176 1.0 1.0000 44 8177 2.0 1.0000 44 8178 2.0 1.0000 44 8185 2.0 1.0000 44 8190 2.0 0.6735 44 8191 1.0 0.3568 44 8192 2.0 0.6726 44 8199 2.0 1.0000 44 No. of records with gender 0 in cluster 44 is 3 No. of records with gender 1 in cluster 44 is 5 No. of records with gender 2 in cluster 44 is 12 Records found in cluster 45 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 8313 0.0 1.0000 45 8322 1.0 1.0000 45 8327 0.0 0.6763 45 8331 2.0 0.6716 45 8333 2.0 1.0000 45 8337 1.0 1.0000 45 8338 0.0 1.0000 45 8339 0.0 1.0000 45 8340 2.0 0.6707 45 8341 1.0 0.6699 45 8353 2.0 0.6650 45 8356 1.0 0.6517 45 8358 2.0 0.6965 45 8384 0.0 1.0000 45 8385 1.0 1.0000 45 8391 0.0 1.0000 45 No. of records with gender 0 in cluster 45 is 6 No. of records with gender 1 in cluster 45 is 5 No. of records with gender 2 in cluster 45 is 5 Records found in cluster 46 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 8401 0.0 0.6732 46 8402 2.0 0.6767 46 8403 2.0 0.6575 46 8407 0.0 0.6763 46 8411 1.0 1.0000 46 8412 1.0 0.6900 46 8429 1.0 1.0000 46 8460 2.0 0.6828 46 8466 0.0 1.0000 46 8470 1.0 1.0000 46 8478 0.0 1.0000 46 8479 2.0 0.3625 46 8487 0.0 0.6806 46 8489 0.0 1.0000 46 8496 0.0 1.0000 46 No. of records with gender 0 in cluster 46 is 7 No. of records with gender 1 in cluster 46 is 4 No. of records with gender 2 in cluster 46 is 4 Records found in cluster 47 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 8607 2.0 0.6659 47 8613 2.0 1.0000 47 8616 2.0 1.0000 47 8617 2.0 0.6774 47 8619 0.0 0.6647 47 8620 2.0 0.6975 47 8622 0.0 0.6634 47 8623 2.0 0.6778 47 8624 1.0 1.0000 47 8627 2.0 0.6829 47 8632 2.0 1.0000 47 8638 0.0 1.0000 47 8642 2.0 0.6688 47 8645 2.0 0.6778 47 8647 2.0 1.0000 47 8675 2.0 1.0000 47 8676 1.0 0.6602 47 8677 0.0 0.6772 47 8679 2.0 1.0000 47 8680 2.0 1.0000 47 8681 0.0 0.6507 47 8688 2.0 0.3354 47 8690 2.0 1.0000 47 8691 2.0 0.3595 47 8694 2.0 0.6736 47 8699 0.0 1.0000 47 8749 0.0 0.6548 47 No. of records with gender 0 in cluster 47 is 7 No. of records with gender 1 in cluster 47 is 2 No. of records with gender 2 in cluster 47 is 18 Records found in cluster 48 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 8701 1.0 1.0000 48 8711 2.0 1.0000 48 8728 0.0 1.0000 48 8732 2.0 0.6946 48 8739 0.0 1.0000 48 8744 2.0 1.0000 48 8746 2.0 0.6916 48 8764 2.0 0.6674 48 8765 1.0 0.6611 48 8767 0.0 1.0000 48 8769 2.0 1.0000 48 8772 0.0 0.6732 48 8777 0.0 1.0000 48 8779 2.0 1.0000 48 8782 1.0 1.0000 48 8783 2.0 1.0000 48 8784 2.0 1.0000 48 9648 0.0 1.0000 48 10111 2.0 1.0000 48 10551 2.0 0.6362 48 10903 1.0 1.0000 48 11265 1.0 1.0000 48 11650 0.0 1.0000 48 12295 0.0 1.0000 48 12731 2.0 1.0000 48 15770 0.0 0.6808 48 16201 2.0 1.0000 48 No. of records with gender 0 in cluster 48 is 9 No. of records with gender 1 in cluster 48 is 5 No. of records with gender 2 in cluster 48 is 13 Records found in cluster 49 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 8804 2.0 0.6561 49 8834 2.0 1.0000 49 8843 0.0 0.3571 49 8844 2.0 1.0000 49 8849 0.0 0.6906 49 8852 0.0 1.0000 49 8854 0.0 1.0000 49 8855 1.0 0.6440 49 8859 2.0 1.0000 49 8864 0.0 0.3421 49 8865 1.0 1.0000 49 8873 0.0 1.0000 49 8874 1.0 1.0000 49 8878 2.0 0.6640 49 8881 0.0 1.0000 49 8884 1.0 0.6612 49 8886 2.0 0.3536 49 17100 1.0 1.0000 49 No. of records with gender 0 in cluster 49 is 7 No. of records with gender 1 in cluster 49 is 5 No. of records with gender 2 in cluster 49 is 6 Records found in cluster 50 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 9105 2.0 0.6468 50 9109 0.0 0.6553 50 9112 1.0 1.0000 50 9113 0.0 1.0000 50 9115 2.0 0.6771 50 9118 2.0 0.6712 50 9123 2.0 1.0000 50 9125 2.0 1.0000 50 9130 2.0 0.6741 50 9136 2.0 1.0000 50 9144 2.0 1.0000 50 9150 1.0 1.0000 50 9151 1.0 0.6453 50 9152 0.0 1.0000 50 9165 0.0 1.0000 50 9166 2.0 1.0000 50 9178 2.0 0.6698 50 9190 1.0 1.0000 50 9194 2.0 1.0000 50 9195 1.0 1.0000 50 No. of records with gender 0 in cluster 50 is 4 No. of records with gender 1 in cluster 50 is 5 No. of records with gender 2 in cluster 50 is 11 Records found in cluster 51 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 9206 2.0 0.3398 51 9207 2.0 1.0000 51 9212 0.0 1.0000 51 9215 1.0 0.6818 51 9216 2.0 0.6519 51 9217 2.0 0.3376 51 9220 2.0 1.0000 51 9221 2.0 1.0000 51 9225 2.0 1.0000 51 9228 0.0 1.0000 51 9243 0.0 0.3506 51 9249 1.0 0.3542 51 9253 2.0 1.0000 51 9278 1.0 1.0000 51 9280 1.0 1.0000 51 9283 2.0 0.6659 51 9289 2.0 1.0000 51 9293 0.0 1.0000 51 9294 0.0 1.0000 51 11308 2.0 0.6412 51 No. of records with gender 0 in cluster 51 is 5 No. of records with gender 1 in cluster 51 is 4 No. of records with gender 2 in cluster 51 is 11 Records found in cluster 52 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 9515 0.0 0.6648 52 10396 1.0 1.0000 52 10608 1.0 1.0000 52 10796 0.0 0.6912 52 10981 0.0 1.0000 52 11477 2.0 1.0000 52 11770 2.0 1.0000 52 12451 2.0 1.0000 52 12803 1.0 0.6667 52 12996 1.0 1.0000 52 13263 2.0 0.6743 52 13436 0.0 1.0000 52 14141 0.0 1.0000 52 14290 0.0 1.0000 52 14473 0.0 1.0000 52 14878 2.0 0.6502 52 15088 0.0 0.6581 52 15727 2.0 1.0000 52 16605 0.0 0.6578 52 16973 0.0 1.0000 52 17197 1.0 1.0000 52 17330 0.0 1.0000 52 17728 1.0 0.6702 52 18071 2.0 1.0000 52 18531 2.0 1.0000 52 No. of records with gender 0 in cluster 52 is 11 No. of records with gender 1 in cluster 52 is 6 No. of records with gender 2 in cluster 52 is 8 Records found in cluster 53 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 9856 2.0 1.0000 53 10150 1.0 1.0000 53 10237 2.0 1.0000 53 10471 1.0 1.0000 53 10633 2.0 0.6545 53 10849 2.0 1.0000 53 10964 2.0 1.0000 53 11050 0.0 1.0000 53 11251 1.0 0.6715 53 11356 2.0 1.0000 53 11429 2.0 1.0000 53 11653 0.0 1.0000 53 11767 2.0 1.0000 53 11842 1.0 1.0000 53 11930 1.0 1.0000 53 12045 1.0 1.0000 53 12284 1.0 1.0000 53 12397 0.0 1.0000 53 12507 2.0 1.0000 53 12659 2.0 1.0000 53 12754 2.0 0.6615 53 No. of records with gender 0 in cluster 53 is 3 No. of records with gender 1 in cluster 53 is 7 No. of records with gender 2 in cluster 53 is 11 Records found in cluster 54 from DBSCAN in Exp 3 gender gender:confidence Cluster_Label 10812 1.0 0.6827 54 12073 1.0 1.0000 54 13106 1.0 0.6574 54 14855 2.0 1.0000 54 15950 2.0 1.0000 54 16388 2.0 1.0000 54 16854 2.0 1.0000 54 17041 1.0 1.0000 54 17154 1.0 1.0000 54 17297 0.0 1.0000 54 17565 1.0 1.0000 54 17677 1.0 1.0000 54 17868 2.0 0.3354 54 18092 0.0 1.0000 54 18246 1.0 1.0000 54 18302 1.0 1.0000 54 18399 0.0 1.0000 54 18527 1.0 1.0000 54 18646 0.0 1.0000 54 18759 0.0 0.6386 54 No. of records with gender 0 in cluster 54 is 5 No. of records with gender 1 in cluster 54 is 10 No. of records with gender 2 in cluster 54 is 5 Records classified as noise gender gender:confidence Cluster_Label 599 1.0 1.0000 -1 635 1.0 1.0000 -1 1268 2.0 1.0000 -1 1367 1.0 1.0000 -1 1544 0.0 1.0000 -1 2154 1.0 0.6561 -1 2243 2.0 1.0000 -1 2382 1.0 1.0000 -1 2682 1.0 0.6473 -1 2897 2.0 1.0000 -1 3341 1.0 1.0000 -1 3360 1.0 1.0000 -1 3526 1.0 1.0000 -1 3938 2.0 0.6545 -1 4051 2.0 1.0000 -1 4650 2.0 0.3571 -1 5424 0.0 1.0000 -1 5548 2.0 1.0000 -1 6140 2.0 0.6679 -1 6313 1.0 1.0000 -1 6616 1.0 1.0000 -1 6620 2.0 1.0000 -1 7107 2.0 0.6865 -1 7610 2.0 0.6578 -1 7651 0.0 0.6637 -1 8509 2.0 0.6731 -1 8579 2.0 1.0000 -1 8798 1.0 1.0000 -1 8836 0.0 0.6645 -1 9305 2.0 0.6606 -1 11119 1.0 1.0000 -1 11727 2.0 1.0000 -1 12333 1.0 1.0000 -1 12992 0.0 1.0000 -1 13486 2.0 1.0000 -1 14046 0.0 1.0000 -1 14958 2.0 1.0000 -1 15597 1.0 0.3362 -1 16706 0.0 1.0000 -1 17186 1.0 1.0000 -1 17599 0.0 0.6654 -1 18270 0.0 1.0000 -1 ---- VISUALIZE THE METRIC EVALUATION ----
REGRESSION¶
In [4]:
# =============================== REGRESSION ======================================
print()
print()
df_preprocessed_reg = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed_reg = df_preprocessed_reg.drop(['gender', "gender:confidence"], axis=1)
print()
print("=" * 50)
print('Boosted Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed_reg, y, test_size=0.6, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model
boosted_reg.fit(X_train, y_train)
# Make predictions
y_pred = boosted_reg.predict(X_test)
y_pred_train = boosted_reg.predict(X_train)
y_tot_pred = boosted_reg.predict(df_preprocessed_reg)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# FEATURE IMPORTANCE
print()
print("Performing feature importance analysis...")
# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('text_')]
# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]
# Output the results
# print("desc_ column indices:", desc_columns)
# print("text_ column indices:", text_columns)
# print("desc_ array:\n", desc_array)
# print("text_ array:\n", text_array)
# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])
# Create a new DataFrame
new_data = {}
# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
boosted_reg.feature_importances_
# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed_reg.columns)) if i not in desc_columns and i not in text_columns]
for i in other_columns:
col_name = df_preprocessed_reg.columns[i]
new_data[col_name] = [boosted_reg.feature_importances_[i]]
# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)
# Output the results
print(feature_importance)
# Plot feature importance
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')
df_melted = df_melted.sort_values(ascending=False, by="Importance in percentage")
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()
# preprocess dataset for plots with regression results
df_preprocessed_diff = df_preprocessed_reg.copy()
df_preprocessed_diff['difference'] = (y.to_numpy() - y_tot_pred)
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset
# filtering out coloumns that might be false mistaken
misclassified_df_reg = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
misclassified_df = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]
# plotting these columns
def scatterplot_mistaken_points(misclassified_df, X_train, model):
# Edit misclassified_df to include 'in X_train'
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Set the main title
fig.suptitle(f'{model}\nGender Confidence of "Mistaken" Records', fontsize=16)
# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
[df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset')
axes[0].set_ylabel('Predicted')
axes[0].set_title(f'Training Set\nSample Size: {len(df_in_X_train)}')
# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
[df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset')
axes[1].set_ylabel('Predicted')
axes[1].set_title(f'Not Training Set\nSample Size: {len(df_not_in_X_train)}')
plt.tight_layout()
plt.show()
def scatter_plot(y, y_tot_pred, model):
# Plotting more results results
plt.figure(figsize=(10, 8))
plt.scatter(y, y_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset', fontsize=12)
plt.ylabel('Predicted', fontsize=12)
plt.suptitle(model, fontsize=16)
plt.title('Gender Confidence Comparison', fontsize=14)
plt.show()
scatterplot_mistaken_points(misclassified_df, X_train, "Boosted Regression Tree with Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree with Vectorised Text/Desc Features")
# ==============================analyze without text features=============================================
columns_to_drop = [col for col in df_preprocessed_reg.columns if col.startswith(('desc_', 'text_'))]
df_preprocessed_non_text = df_preprocessed_reg.drop(columns=columns_to_drop)
df_preprocessed_non_text2 = df_preprocessed_non_text.copy()
print(df_preprocessed_non_text)
print()
print("=" * 50)
print('Boosted Regression Tree without Vectorised Text/Desc Features')
print("=" * 50)
boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.6, random_state=42)
# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)
y_pred_train = boosted_reg_non_text.predict(X_train_non_text)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test_non_text, y_pred)
mse_train = mean_squared_error(y_train_non_text, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# Get feature importances and plot from the model
print()
print("Performing feature importance analysis...")
feature_importances = boosted_reg_non_text.feature_importances_
column_names = X_train_non_text.columns
feature_importance_df = pd.DataFrame({
'Feature': column_names,
'Importance in percentage': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()
# adding the dataset gender confidence
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset
# Inspecting coulumns that could be suspicous
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred
misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]
scatterplot_mistaken_points(misclassified_df, X_train_non_text, "Boosted Regression Tree without Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree without Vectorised Text/Desc Features")
# ====================================Analyzing with a linear regression (Least Squares Implementation)====================
print()
print("=" * 50)
print('Linear Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)
X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed_reg)
model = sm.OLS(y_train, X_train_lin) # Ordinary least squares (unregularized)
results = model.fit()
# run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Linear Regression Tree with Vectorised Textual Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred
# identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]
misclassified_df_lin_reg = misclassified_df.copy()
scatter_plot(y, y_lin_tot_pred, "Linear Regression Tree with Vectorised Text/Desc Features")
scatterplot_mistaken_points(misclassified_df, X_train_lin, "Linear Regression Tree with Vectorised Text/Desc Features")
#================================Lin reg without text=======================================================
#================================Linear regression without text features============================
print()
print("=" * 50)
print('Linear Regression Tree without Vectorised Text/Desc Features')
print("=" * 50)
X_train_lin = sm.add_constant(X_train_non_text)
X_test_lin = sm.add_constant(X_test_non_text)
df_preprocessed_lin = sm.add_constant(df_preprocessed_non_text2)
model = sm.OLS(y_train, X_train_lin) # Ordinary least squares (unregularized)
results = model.fit()
#run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Linear Regression Tree without Vectorised Textual Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
#final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred
#identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]
scatter_plot(y, y_lin_tot_pred, "Linear Regression Tree without Vectorised Text/Desc Features")
scatterplot_mistaken_points(misclassified_df, X_train_lin, "Linear Regression Tree without Vectorised Text/Desc Features")
# ================================Identity final mistaken samples====================================
common_samples = misclassified_df_reg.index.intersection(misclassified_df.index)
common_df = misclassified_df.loc[common_samples]
scatterplot_mistaken_points(common_df, X_train_lin, "Boosted and Linear Regression Trees (Intersection) with Vectorised Text/Desc Features")
================================================== Boosted Regression Tree with Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0266 Mean Squared Error (Test): 0.0290 Mean Squared Error (Total): 0.0280
Performing feature importance analysis... desc text favorites_per_day retweets_per_day tweets_per_day \ 0 0.308771 0.364314 0.021232 0.0 0.121167 profile_created_year tweet_created_year link_R link_G link_B \ 0 0.155415 0.0 0.000336 0.011339 0.000434 sidebar_R sidebar_G sidebar_B 0 0.005375 0.006886 0.00473
favorites_per_day retweets_per_day tweets_per_day \ 0 0.000000 0.000000 28.149163 1 0.015554 0.000000 1.708829 2 2.147321 0.000279 1.567243 3 0.036207 0.000000 0.303459 4 9.794751 0.000000 8.257743 ... ... ... ... 18831 0.090609 0.000000 0.234923 18832 0.568809 0.000000 3.060887 18833 0.011364 0.000000 6.004318 18834 16.333103 0.000000 12.934948 18835 0.878510 0.000000 0.766728 profile_created_year tweet_created_year link_R link_G link_B \ 0 2013 2015 8 194 194 1 2012 2015 0 132 180 2 2014 2015 171 184 194 3 2009 2015 0 132 180 4 2014 2015 59 148 217 ... ... ... ... ... ... 18831 2015 2015 0 132 180 18832 2012 2015 207 185 41 18833 2012 2015 0 132 180 18834 2012 2015 146 102 204 18835 2014 2015 0 132 180 sidebar_R sidebar_G sidebar_B 0 255 255 255 1 192 222 237 2 192 222 237 3 192 222 237 4 0 0 0 ... ... ... ... 18831 192 222 237 18832 0 0 0 18833 192 222 237 18834 0 0 0 18835 192 222 237 [18836 rows x 11 columns] ================================================== Boosted Regression Tree without Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0275 Mean Squared Error (Test): 0.0292 Mean Squared Error (Total): 0.0280
Performing feature importance analysis...
================================================== Linear Regression Tree with Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0166 Mean Squared Error (Test): 0.0499 Mean Squared Error (Total): 0.0366
================================================== Linear Regression Tree without Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0292 Mean Squared Error (Test): 0.0305 Mean Squared Error (Total): 0.0300
CLASSIFICATION¶
In [5]:
# ============================== CLASSIFICATION ==============================
print()
print()
print('---- CLASSIFICATION ----')
# Features and target
X = df_preprocessed.drop(columns=['gender']) # Assuming 'gender' is the target variable
y = df_preprocessed['gender']
# Standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
rf_classifier.fit(X_train, y_train)
# Predict on test data
y_pred_rf = rf_classifier.predict(X_test)
# Evaluate the performance
print("Accuracy Score: ", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
# Train the model
xgb_model.fit(X_train, y_train)
# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
# Evaluate the model
print("\nXGBoost Classifier Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
# Initialize LightGBM classifier
lgb_clf = lgb.LGBMClassifier(n_estimators=100, random_state=42)
# Fit the model
lgb_clf.fit(X_train, y_train)
# Predict
y_pred_lgb = lgb_clf.predict(X_test)
# Evaluation
print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb))
# Helper function to plot confusion matrix
def plot_confusion_matrix(y_test, y_pred, model_name):
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title(f'{model_name} Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
# Helper function to extract and display classification report with model name
def get_classification_report(y_test, y_pred, model_name):
report = classification_report(y_test, y_pred, output_dict=True)
df = pd.DataFrame(report).transpose()
df['model'] = model_name
return df
# Random Forest Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_rf, "Random Forest")
rf_report = get_classification_report(y_test, y_pred_rf, "Random Forest")
# XGBoost Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_xgb, "XGBoost")
xgb_report = get_classification_report(y_test, y_pred_xgb, "XGBoost")
# LightGBM Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_lgb, "LightGBM")
lgb_report = get_classification_report(y_test, y_pred_lgb, "LightGBM")
# Combine all reports
combined_report = pd.concat([rf_report, xgb_report, lgb_report])
# Debugging Step: Check the combined report structure
print("Combined Classification Report:\n", combined_report.head())
# Filter out rows for precision, recall, and f1-score
combined_report_filtered = combined_report[
combined_report.index.isin(['0', '1']) # Filter for the classes
].reset_index()
# Debugging Step: Check the filtered report structure
print("Filtered Report for Precision, Recall, and F1-Score:\n", combined_report_filtered.head())
# Plot Precision, Recall, and F1-Score for each model
metrics = ['precision', 'recall', 'f1-score']
for metric in metrics:
# Debugging Step: Filter for specific metric
print(f"Data for {metric}:")
print(combined_report_filtered[['index', metric, 'model']])
plt.figure(figsize=(10, 6))
sns.barplot(
x="index",
y=metric,
hue="model",
data=combined_report_filtered[['index', metric, 'model']]
)
plt.title(f'{metric.capitalize()} Comparison')
plt.ylabel(metric.capitalize())
plt.xlabel('Class (0 = Human, 1 = Non-Human)')
plt.show()
# Accuracy comparison
accuracies = {
'Random Forest': accuracy_score(y_test, y_pred_rf),
'XGBoost': accuracy_score(y_test, y_pred_xgb),
'LightGBM': accuracy_score(y_test, y_pred_lgb)
}
plt.figure(figsize=(6, 4))
plt.bar(accuracies.keys(), accuracies.values(), color=['blue', 'green', 'red'])
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.show()
---- CLASSIFICATION ---- Accuracy Score: 0.6242038216560509 Confusion Matrix: [[661 470 136] [284 932 102] [250 174 759]] Classification Report: precision recall f1-score support 0 0.55 0.52 0.54 1267 1 0.59 0.71 0.64 1318 2 0.76 0.64 0.70 1183 accuracy 0.62 3768 macro avg 0.64 0.62 0.63 3768 weighted avg 0.63 0.62 0.62 3768 XGBoost Classifier Report: precision recall f1-score support 0 0.56 0.54 0.55 1267 1 0.61 0.65 0.63 1318 2 0.72 0.67 0.69 1183 accuracy 0.62 3768 macro avg 0.63 0.62 0.62 3768 weighted avg 0.62 0.62 0.62 3768 Accuracy: 0.6220806794055201 [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025491 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 36890 [LightGBM] [Info] Number of data points in the train set: 15068, number of used features: 1766 [LightGBM] [Info] Start training from score -1.117843 [LightGBM] [Info] Start training from score -1.029513 [LightGBM] [Info] Start training from score -1.152536 LightGBM Classification Report: precision recall f1-score support 0 0.57 0.55 0.56 1267 1 0.61 0.65 0.63 1318 2 0.72 0.69 0.70 1183 accuracy 0.63 3768 macro avg 0.63 0.63 0.63 3768 weighted avg 0.63 0.63 0.63 3768
Combined Classification Report: precision recall f1-score support model 0 0.553138 0.521705 0.536962 1267.000000 Random Forest 1 0.591371 0.707132 0.644091 1318.000000 Random Forest 2 0.761284 0.641589 0.696330 1183.000000 Random Forest accuracy 0.624204 0.624204 0.624204 0.624204 Random Forest macro avg 0.635264 0.623475 0.625794 3768.000000 Random Forest Filtered Report for Precision, Recall, and F1-Score: index precision recall f1-score support model 0 0 0.553138 0.521705 0.536962 1267.0 Random Forest 1 1 0.591371 0.707132 0.644091 1318.0 Random Forest 2 0 0.556275 0.542226 0.549161 1267.0 XGBoost 3 1 0.605356 0.651745 0.627695 1318.0 XGBoost 4 0 0.573061 0.554065 0.563403 1267.0 LightGBM Data for precision: index precision model 0 0 0.553138 Random Forest 1 1 0.591371 Random Forest 2 0 0.556275 XGBoost 3 1 0.605356 XGBoost 4 0 0.573061 LightGBM 5 1 0.609497 LightGBM
Data for recall: index recall model 0 0 0.521705 Random Forest 1 1 0.707132 Random Forest 2 0 0.542226 XGBoost 3 1 0.651745 XGBoost 4 0 0.554065 LightGBM 5 1 0.652504 LightGBM
Data for f1-score: index f1-score model 0 0 0.536962 Random Forest 1 1 0.644091 Random Forest 2 0 0.549161 XGBoost 3 1 0.627695 XGBoost 4 0 0.563403 LightGBM 5 1 0.630267 LightGBM
ASSOCIATION RULES¶
In [6]:
# ============================== ASSOCIATION RULES ==============================
print()
print()
print('---- ASSOCIATION RULES ----')
# Binarize numeric columns
df_asso['high_favorites'] = df_asso['favorites_per_day'] > df_asso['favorites_per_day'].mean()
df_asso['high_retweets'] = df_asso['retweets_per_day'] > df_asso['retweets_per_day'].mean()
df_asso['high_tweets'] = df_asso['tweets_per_day'] > df_asso['tweets_per_day'].mean()
# Binarize year columns (profile_created_year and tweet_created_year)
# Example: Set threshold year as 2015
df_asso['profile_recent'] = df_asso['profile_created_year'] >= 2015
df_asso['tweet_recent'] = df_asso['tweet_created_year'] >= 2015
# Select only the binary columns
df_apriori = df_asso[['high_favorites', 'high_retweets', 'high_tweets',
'profile_recent', 'tweet_recent',
'tweet_location_encoded', 'user_timezone_encoded']]
# Convert all columns to int (0 or 1)
df_apriori = df_apriori.astype(int)
# Apply Apriori
frequent_itemsets = apriori(df_apriori, min_support=0.05, use_colnames=True)
# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Display the rules
print(rules)
top_frequent_itemsets = frequent_itemsets.nlargest(10, 'support')
plt.figure(figsize=(10, 6))
sns.barplot(x='support', y='itemsets', data=top_frequent_itemsets)
plt.title('Top 10 Frequent Itemsets by Support')
plt.xlabel('Support')
plt.ylabel('Itemsets')
plt.show()
# ---------------------------
# Visualization 2: Scatter Plot of Association Rules by Confidence and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.scatterplot(x='confidence', y='lift', size='support', data=rules, hue='antecedents', palette='viridis', sizes=(40, 200))
plt.title('Association Rules: Confidence vs Lift')
plt.xlabel('Confidence')
plt.ylabel('Lift')
plt.legend(title='Antecedents', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
# ---------------------------
# Visualization 3: Heatmap of Support, Confidence, and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.heatmap(rules[['support', 'confidence', 'lift']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between Support, Confidence, and Lift')
plt.show()
---- ASSOCIATION RULES ---- antecedents consequents \ 0 (high_tweets) (high_favorites) 1 (high_favorites) (high_tweets) 2 (tweet_recent) (high_favorites) 3 (high_favorites) (tweet_recent) 4 (tweet_recent) (high_tweets) 5 (high_tweets) (tweet_recent) 6 (tweet_recent) (profile_recent) 7 (profile_recent) (tweet_recent) 8 (tweet_recent, high_tweets) (high_favorites) 9 (high_favorites, high_tweets) (tweet_recent) 10 (tweet_recent, high_favorites) (high_tweets) 11 (high_tweets) (tweet_recent, high_favorites) 12 (tweet_recent) (high_favorites, high_tweets) 13 (high_favorites) (tweet_recent, high_tweets) antecedent support consequent support support confidence lift \ 0 0.271767 0.210607 0.066097 0.243212 1.15481 1 0.210607 0.271767 0.066097 0.313839 1.15481 2 1.000000 0.210607 0.210607 0.210607 1.00000 3 0.210607 1.000000 0.210607 1.000000 1.00000 4 1.000000 0.271767 0.271767 0.271767 1.00000 5 0.271767 1.000000 0.271767 1.000000 1.00000 6 1.000000 0.175568 0.175568 0.175568 1.00000 7 0.175568 1.000000 0.175568 1.000000 1.00000 8 0.271767 0.210607 0.066097 0.243212 1.15481 9 0.066097 1.000000 0.066097 1.000000 1.00000 10 0.210607 0.271767 0.066097 0.313839 1.15481 11 0.271767 0.210607 0.066097 0.243212 1.15481 12 1.000000 0.066097 0.066097 0.066097 1.00000 13 0.210607 0.271767 0.066097 0.313839 1.15481 leverage conviction zhangs_metric 0 0.008861 1.043082 0.184085 1 0.008861 1.061316 0.169823 2 0.000000 1.000000 0.000000 3 0.000000 inf 0.000000 4 0.000000 1.000000 0.000000 5 0.000000 inf 0.000000 6 0.000000 1.000000 0.000000 7 0.000000 inf 0.000000 8 0.008861 1.043082 0.184085 9 0.000000 inf 0.000000 10 0.008861 1.061316 0.169823 11 0.008861 1.043082 0.184085 12 0.000000 1.000000 0.000000 13 0.008861 1.061316 0.169823
C:\Users\Owner\uowMaster\subject\946\venv_bda\lib\site-packages\mlxtend\frequent_patterns\fpcommon.py:109: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type warnings.warn(
AMENDMENT¶
In [9]:
# ============================== AMENDMENT ==============================
print()
print()
print('---- AMENDMENT ----')
mistaken_index = misclassified_df_reg.index.union(misclassified_df_lin_reg.index)
df_truth = df_preprocessed.copy()
df_mistaken = df_preprocessed.loc[mistaken_index].copy()
df_amended = df_mistaken.copy()
vectorized_features = [col for col in df_truth.columns if col.startswith('desc_') or col.startswith('text_')]
df_truth_vectors = df_truth[vectorized_features]
df_mistaken_vectors = df_mistaken[vectorized_features]
similarities = cosine_similarity(df_mistaken_vectors, df_truth_vectors)
best_matches_indices = similarities.argmax(axis=1)
df_amended['gender'] = df_truth.loc[best_matches_indices, 'gender'].values
## Comparative Analysis
# Calculate the number of changes made
num_changes = (df_amended['gender'] != df_mistaken['gender']).sum()
# Calculate the percentage of records amended
percent_amended = (num_changes / len(df_amended)) * 100
## Impact on Statistics
# Function to calculate gender distribution
def gender_distribution(df):
return df['gender'].value_counts(normalize=True) * 100
# Calculate gender distributions
original_dist = gender_distribution(df_mistaken)
amended_dist = gender_distribution(df_amended)
# Calculate the difference in distributions
dist_difference = amended_dist - original_dist
## Summary Report
print("Amendment Summary Report")
print("=======================")
print(f"Total records processed: {len(df_amended)}")
print(f"Number of records amended: {num_changes}")
print(f"Percentage of records amended: {percent_amended:.2f}%")
print("\nGender Distribution (%):")
print("------------------------")
print("Category Mistaken Amended")
for category in original_dist.index:
print(f"{category:<12} {original_dist.get(category, 0):.2f} {amended_dist.get(category, 0):.2f}")
print("\nDistribution Changes:")
print("---------------------")
for category in dist_difference.index:
print(f"{category}: {dist_difference[category]:+.2f}%")
## Create a figure with subplots
fig, axs = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle("Comparative Analysis of Gender Amendment", fontsize=20)
## 1. Bar plot: Gender Distribution Comparison
axs[0, 0].bar(original_dist.index, original_dist.values, alpha=0.5, label='Original')
axs[0, 0].bar(amended_dist.index, amended_dist.values, alpha=0.5, label='Amended')
axs[0, 0].set_title("Gender Distribution Comparison")
axs[0, 0].set_ylabel("Percentage")
axs[0, 0].legend()
## 2. Pie charts: Before and After Amendment
def plot_pie(ax, data, title):
ax.pie(data.values, labels=data.index, autopct='%1.1f%%', startangle=90)
ax.set_title(title)
plot_pie(axs[0, 1], original_dist, "Gender Distribution Before Amendment")
plot_pie(axs[1, 0], amended_dist, "Gender Distribution After Amendment")
## 3. Heatmap: Confusion Matrix
cm = confusion_matrix(df_mistaken['gender'], df_amended['gender'], labels=df_mistaken['gender'].unique())
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=df_mistaken['gender'].unique(),
yticklabels=df_mistaken['gender'].unique(), ax=axs[1, 1])
axs[1, 1].set_title("Confusion Matrix: After vs Before")
axs[1, 1].set_xlabel("After")
axs[1, 1].set_ylabel("Before")
## Adjust layout and save
plt.tight_layout()
plt.savefig('gender_amendment_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()
print("Visualizations have been saved as 'gender_amendment_analysis.png'")
---- AMENDMENT ---- Amendment Summary Report ======================= Total records processed: 3682 Number of records amended: 92 Percentage of records amended: 2.50% Gender Distribution (%): ------------------------ Category Mistaken Amended 1 35.99 34.95 2 33.49 34.71 0 30.53 30.34 Distribution Changes: --------------------- 1: -1.03% 2: +1.22% 0: -0.19%
Visualizations have been saved as 'gender_amendment_analysis.png'