Installs and Imports¶
In [8]:
import re
import string
import subprocess
import sys
import warnings
warnings.filterwarnings('ignore')
REQS = [
('pip', 'pip==24.2'),
('lightgbm', 'lightgbm==4.5.0'),
('matplotlib', 'matplotlib==3.9.2'),
('mlxtend', 'mlxtend==0.23.1'),
('nltk', 'nltk==3.9.1'),
('numpy', 'numpy==2.0.2'),
('optuna', 'optuna==4.0.0'),
('pandas', 'pandas==2.2.2'),
('seaborn', 'seaborn==0.13.2'),
('sklearn', 'scikit-learn==1.5.2'),
('statsmodels', 'statsmodels==0.14.3'),
('umap-learn', 'umap-learn==0.5.6'),
('xgboost', 'xgboost==2.1.1'),
]
try:
subprocess.check_call([sys.executable, '-m', 'ensurepip'])
except Exception as e:
print(e, file=sys.stderr)
def ensure_installed(module_info):
_, install_str = module_info
try:
subprocess.check_call([sys.executable, '-m',
'pip', 'install', '--quiet',
install_str])
print(f'Installed "{install_str}".')
except Exception as e:
print(e, file=sys.stderr)
for m in REQS:
ensure_installed(m)
# Standard libraries
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# Machine learning and data processing
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
accuracy_score,
calinski_harabasz_score,
classification_report,
confusion_matrix,
mean_squared_error,
silhouette_score
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Statistical modeling
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Dimensionality reduction
import umap
# Hyperparameter optimization
import optuna
# Other machine learning libraries
import lightgbm as lgb
from xgboost import XGBClassifier
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
def find_columns_with_missing(data, columns):
"""Finding features that have a lot of missing data"""
print()
print('Finding columns with missing data...')
data_cleaned = data
missing = []
i = 0
for col in columns:
missing.append(data[col].isnull().sum())
if missing[i] > 0:
print()
print(f'Column {col} is missing {missing[i]} values.')
print(f'Proportion of missing data is {missing[i]/len(data)}.')
if missing[i]/len(data) >= 0.9:
print(f'Dropping column {col}...')
data_cleaned = data_cleaned.drop(columns=col)
i += 1
return missing, data_cleaned
def hex_to_rgb(hex_color):
"""Function to convert hex to RGB"""
# Remove the '#' if it exists
hex_color = hex_color.lstrip('#')
# Convert hex to integer and split into RGB components
return [int(hex_color[i:i+2], 16) for i in (0, 2, 4)]
def preprocess_text(text):
"""Preprocessing function"""
text = text.lower()
# Remove punctuation and special characters
text = text.translate(str.maketrans('', '', string.punctuation)) # Removes punctuation
text = re.sub(r'[^A-Za-z\s]', '', text)
# Tokenize the text
tokens = word_tokenize(text)
# Remove stopwords
tokens = [word for word in tokens if word not in stop_words]
# Lemmatize the tokens
tokens = [lemmatizer.lemmatize(word) for word in tokens]
# Join tokens back into a string
return ' '.join(tokens)
def plot_silhouette_bar_across_experiments(model_names, silhouette_scores):
n_experiments = len(silhouette_scores)
n_models = len(model_names)
bar_width = 0.2
index = np.arange(n_experiments)
plt.figure(figsize=(12, 6))
for i, model_name in enumerate(model_names):
sil_scores = [exp_scores[i] for exp_scores in silhouette_scores]
plt.bar(index + i * bar_width,sil_scores, bar_width, label=model_name)
plt.xlabel('Experiments')
plt.ylabel('Silhouette scores')
plt.title('Silhouette scores Across Models and Experiments')
plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
plt.legend()
plt.tight_layout()
plt.show()
def visualize_ch_index_across_experiments(model_names, ch_scores):
n_experiments = len(ch_scores)
n_models = len(model_names)
bar_width = 0.2
index = np.arange(n_experiments)
plt.figure(figsize=(12, 6))
for i, model_name in enumerate(model_names):
ch_score = [exp_scores[i] for exp_scores in ch_scores]
plt.bar(index + i * bar_width, ch_score, bar_width, label=model_name)
plt.xlabel('Experiments')
plt.ylabel('Calinski-Harabasz Index')
plt.title('Calinski-Harabasz Index Across Models and Experiments')
plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
plt.legend()
plt.tight_layout()
plt.show()
class KMeansClustering:
def __init__(self, data):
self.data = data
self.best_params = None
self.kmeans_model = None
def tune_hyperparameters(self, n_trials=15):
def objective_kmeans(trial):
n_clusters = trial.suggest_int('n_clusters', 2, 10)
init_method = trial.suggest_categorical('init', ['k-means++', 'random'])
kmeans = KMeans(n_clusters=n_clusters, init=init_method, random_state=42)
kmeans.fit(self.data)
labels = kmeans.labels_
score = silhouette_score(self.data, labels)
return score
study = optuna.create_study(direction="maximize")
study.optimize(objective_kmeans, n_trials=n_trials)
self.best_params = study.best_params
print("Best params:", self.best_params)
def fit_model(self):
self.kmeans_model = KMeans(n_clusters=self.best_params['n_clusters'],
init=self.best_params['init'],
random_state=42)
self.kmeans_model.fit(self.data)
def visualize_clusters(self, umap_embedding, feature):
labels = self.kmeans_model.labels_
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# Scatter plot in 3D
scatter = ax.scatter(
umap_embedding[:, 0],
umap_embedding[:, 1],
umap_embedding[:, 2],
c=labels,
cmap='viridis',
s=30
)
# Add labels and title
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.set_zlabel('UMAP Dimension 3')
plt.title(f'3D UMAP of K-Means Clusters on {feature}')
# Add a color bar for better visual distinction of clusters
plt.colorbar(scatter)
# Show the plot
plt.show()
def plot_elbow_method(self, k_range=(2, 10)):
"""
Plot the Elbow Method for choosing the optimal number of clusters
Args:
- k_range: tuple, range of cluster numbers to evaluate
"""
inertia = []
K = range(k_range[0], k_range[1] + 1)
for k in K:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(self.data)
inertia.append(kmeans.inertia_) # Sum of squared distances to closest cluster center
plt.figure(figsize=(8, 6))
plt.plot(K, inertia, 'bo-', markersize=8)
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (Sum of squared distances)')
plt.grid(True)
plt.show()
def output_label(self):
return self.kmeans_model.labels_
def silhoutte(self):
score = silhouette_score(self.data, self.kmeans_model.labels_)
print(f'The Silhouette score is {score}')
return score
def calinski(self):
if len(np.unique(self.kmeans_model.labels_)) > 1: # Only calculate if there are clusters
score = calinski_harabasz_score(self.data, self.kmeans_model.labels_)
else:
score = np.nan # If only one cluster (or all noise), set to NaN
print(f'The Callinski index is {score}')
return score
class DBSCANClustering:
def __init__(self, data):
self.data = data
self.best_params = None
self.dbscan_model = None
def tune_hyperparameters(self, n_trials=15):
def objective_dbscan(trial):
eps = trial.suggest_float('eps', 0.1, 2.0)
min_samples = trial.suggest_int('min_samples', 3, 20)
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(self.data)
labels = dbscan.labels_
if len(set(labels)) > 1:
score = silhouette_score(self.data, labels)
else:
score = -1
return score
study = optuna.create_study(direction="maximize")
study.optimize(objective_dbscan, n_trials=n_trials)
self.best_params = study.best_params
print("Found best params:", self.best_params)
def fit_model(self):
self.dbscan_model = DBSCAN(eps=self.best_params['eps'], min_samples=self.best_params['min_samples'])
self.dbscan_model.fit(self.data)
def visualize_clusters_and_outliers_3D(self, umap_embedding, feature):
labels = self.dbscan_model.labels_
# Separate clustered points and noise points
clustered_points = umap_embedding[labels >= 0] # Points part of a cluster
clustered_labels = labels[labels >= 0]
outliers = umap_embedding[labels == -1] # Noise points
# Create a 3D plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
# Plot the clustered points in different colors
scatter = ax.scatter(clustered_points[:, 0], clustered_points[:, 1], clustered_points[:, 2],
c=clustered_labels, cmap='viridis', s=30)
# Plot the outliers (noise points) in red with 'x' markers
ax.scatter(outliers[:, 0], outliers[:, 1], outliers[:, 2], c='red', marker='x', s=80, label='Outliers')
# Add labels and title
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.set_zlabel('UMAP Dimension 3')
ax.set_title(f'DBSCAN 3D Clusters with Outliers on {feature}')
# Add a legend and color bar for clusters
plt.legend()
plt.colorbar(scatter, ax=ax)
plt.show()
def output_label(self):
return self.dbscan_model.labels_
def silhoutte(self):
score = silhouette_score(self.data, self.dbscan_model.labels_)
print(f'The Silhouette score is {score}')
return score
def calinski(self):
if len(np.unique(self.dbscan_model.labels_)) > 1: # Only calculate if there are clusters
score = calinski_harabasz_score(self.data, self.dbscan_model.labels_)
else:
score = np.nan # If only one cluster (or all noise), set to NaN
print(f'The Callinski index is {score}')
return score
class ClusteringDataRetriever:
def __init__(self, data, labels):
self.data = data
self.labels = labels
def get_data_with_labels(self):
# If Data is in a numpy array, convert it to a pandas DataFrame
if isinstance(self.data, np.ndarray):
df = pd.DataFrame(self.data)
else:
df = self.data.copy() # If already a DataFrame
# Add a new column for the cluster labels
df['Cluster_Label'] = self.labels
return df[['gender', 'gender:confidence', 'Cluster_Label']]
def get_cluster_data(self, cluster_label):
# Retrieve data points belonging to a specific cluster.
df = self.get_data_with_labels()
return df[df['Cluster_Label'] == cluster_label]
def get_noise_data(self):
# Retrieve Records classified as noise (-1 label) in DBSCAN.
return self.get_cluster_data(-1)
Installed "pip==24.2". Installed "lightgbm==4.5.0". Installed "matplotlib==3.9.2". Installed "mlxtend==0.23.1". Installed "nltk==3.9.1". Installed "numpy==2.0.2". Installed "optuna==4.0.0". Installed "pandas==2.2.2". Installed "seaborn==0.13.2". Installed "scikit-learn==1.5.2". Installed "statsmodels==0.14.3". Installed "umap-learn==0.5.6". Installed "xgboost==2.1.1".
EDA¶
In [2]:
# Main starts here
# Load the dataset
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')
# Quick view of the dataset
print()
print('Dataset Overview')
print(df.info())
print(df.head())
all_features = df.columns
missing_col, df_cleaned = find_columns_with_missing(df, all_features)
# Dropping rows where 'gender' is missing
df_cleaned = df_cleaned.dropna(subset=['gender'])
# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])
# Now that we have handled the missing data, you can proceed with further analysis
print()
print('Dataset Overview')
print(df_cleaned.info())
print(df_cleaned.head())
print()
print('---- EXPLORATORY DATA ANALYSIS (EDA) ----')
current_num_features = df.select_dtypes(include=[np.number])
# Plot distribution of each numerical feature with gender as hue using seaborn
for feature in current_num_features:
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned, x=feature, hue='gender', bins=30, kde=True)
plt.title(f'Distribution of {feature} by Gender')
plt.show()
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()
# Plot distribution of 'tweet_count' and 'retweet_count'
for column in ['tweet_count', 'retweet_count']:
plt.figure(figsize=(8, 6))
sns.histplot(data=df_cleaned, x=column, kde=True, bins=30)
plt.title(f'Distribution of {column.replace("_", " ").capitalize()}')
plt.show()
# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
# Ensure 'created' and tweet_created are in datetime format
df_cleaned['created'] = pd.to_datetime(df_cleaned['created'], errors='coerce')
df_cleaned['tweet_created'] = pd.to_datetime(df_cleaned['tweet_created'], errors='coerce')
# assuming Data was up-to-date
df_cleaned['account_age'] = (pd.Timestamp.now() - df_cleaned['created']).dt.days
df_cleaned['tweets_per_day'] = df_cleaned['tweet_count'] / df_cleaned['account_age']
df_cleaned['retweets_per_day'] = df_cleaned['retweet_count'] / df_cleaned['account_age']
df_cleaned['favorites_per_day'] = df_cleaned['fav_number'] / df_cleaned['account_age']
# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('count')
plt.show()
# Plotting the histogram of tweets per day
plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['tweets_per_day'], bins=50, kde=True)
plt.title('Distribution of Tweets Per Day')
plt.xlabel('Tweets Per Day')
plt.ylabel('Frequency')
plt.show()
# show the relationship between account age and tweets per day
plt.figure(figsize=(10, 6))
sns.scatterplot(x='account_age', y='tweets_per_day', data=df_cleaned)
plt.title('Account Age vs. Tweets Per Day')
plt.xlabel('Account Age (Days)')
plt.ylabel('Tweets Per Day')
plt.show()
# Exploring 'link_color' and 'sidebar_color' features
# Check number of NaN value in 'link_color' and 'sidebar_color' features
link_color_nan_count = df_cleaned['link_color'].isnull().sum()
sidebar_color_nan_count = df_cleaned['sidebar_color'].isnull().sum()
print()
print(f"Number of NaN values in 'link_color': {link_color_nan_count}.")
print(f"Number of NaN values in 'sidebar_color': {sidebar_color_nan_count}.")
# Check how many available colors in 'link_color' and 'sidebar_color' features
link_color_count = len(df_cleaned['link_color'].unique())
sidebar_color_count = len(df_cleaned['sidebar_color'].unique())
print(f'Number of link color is {link_color_count}.')
print(f'Number of side bar color is {sidebar_color_count}.')
# Apply the function to 'link_color' and 'sidebar_color'
df_cleaned['link_color'] = df_cleaned['link_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
df_cleaned['sidebar_color'] = df_cleaned['sidebar_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
# Drop rows where 'sidebar_color' is still NaN
df_cleaned = df_cleaned.dropna(subset=['link_color'])
df_cleaned = df_cleaned.dropna(subset=['sidebar_color'])
print(f"Number of NaN values in 'link_color': {df_cleaned['link_color'].isnull().sum()}")
print(f"Number of NaN values in 'sidebar_color': {df_cleaned['sidebar_color'].isnull().sum()}")
# top 15 colors
top_sidebar_colors = df_cleaned['sidebar_color'].value_counts().iloc[:15].index.tolist()
top_link_colors = df_cleaned['link_color'].value_counts().iloc[:15].index.tolist()
# print(top_sidebar_colors)
# Extract top 10 most common sidebar colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index, palette=top_sidebar_colors)
plt.title('Top 15 Most Common Profile sidebar_color')
plt.ylabel('Sidebar Color')
plt.xlabel('count')
plt.grid()
plt.show()
# Extract top 10 most common link colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:15].index, palette=top_link_colors)
plt.title('Top 15 Most Common Profile link_color')
plt.ylabel('Link Color')
plt.xlabel('count')
plt.grid()
plt.show()
# count plot for sidebar_color vs. gender
plt.figure(figsize=(10, 6))
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.countplot(x='sidebar_color', hue='gender', data=df_cleaned,
order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common Sidebar Colors by Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()
# count plot for link_color vs. gender
plt.figure(figsize=(10, 6))
sns.countplot(x='link_color', hue='gender', data=df_cleaned,
order=df_cleaned['link_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common link Colors by Gender')
plt.xlabel('Link Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()
# Scatter plot for link_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='link_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['link_color'].isin(top_link_colors)],
palette='Set2', s=100, alpha=0.7)
plt.title('Link Colors vs. Tweet count with Gender')
plt.xlabel('Link Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()
# Scatter plot for sidebar_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sidebar_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['sidebar_color'].isin(top_sidebar_colors)],
palette='Set2', s=100, alpha=0.7)
plt.title('Sidebar Colors vs. Tweet count with Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()
# Select columns to be used
col = ['gender', 'gender:confidence', 'description', 'favorites_per_day','link_color',
'retweets_per_day', 'sidebar_color', 'text', 'tweets_per_day','user_timezone', 'tweet_location', 'profile_created_year', 'tweet_created_year'
]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Plot correlation matrix
corr_matrix = df_preprocessed.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
# Drop one feature from highly correlated pairs (correlation > 0.9)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_preprocessed = df_preprocessed.drop(columns=to_drop)
# Filling missing values for important features
df_preprocessed['user_timezone'].fillna('Unknown', inplace=True)
df_preprocessed['tweet_location'].fillna('Unknown', inplace=True)
categorical_features = ['user_timezone', 'tweet_location']
# categorise types of features
# numerical features
df_num = df_preprocessed[['retweets_per_day', 'favorites_per_day', 'tweets_per_day', 'profile_created_year', 'tweet_created_year']].copy()
# categorical features with frequency encoding
freq_encoding_location = df_preprocessed['tweet_location'].value_counts(normalize=True)
df_preprocessed['tweet_location_encoded'] = df_preprocessed['tweet_location'].map(freq_encoding_location)
freq_encoding_timezone = df_preprocessed['user_timezone'].value_counts(normalize=True)
df_preprocessed['user_timezone_encoded'] = df_preprocessed['user_timezone'].map(freq_encoding_timezone)
# gender features
# encode the 'gender' column to numeric values
df_preprocessed['gender'] = df_preprocessed['gender'].replace({'male': 0, 'female': 1, 'brand': 2})
# Check for unique values in the 'gender' column after replacement
print()
print("Unique Values in 'gender'")
print(df_preprocessed['gender'].unique())
print(df_preprocessed.info())
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()
df_gender = df_preprocessed[['gender', 'gender:confidence']].copy()
# Drop the original categorical columns
df_preprocessed = df_preprocessed.drop(columns=categorical_features)
# Convert 'link_color' values
df_preprocessed['link_color_rgb'] = df_preprocessed['link_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
# Convert 'sidebar_color' values
df_preprocessed['sidebar_color_rgb'] = df_preprocessed['sidebar_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
rgb_df = pd.DataFrame(df_preprocessed['link_color_rgb'].to_list(), columns=['link_R', 'link_G', 'link_B'])
rgb_df = pd.concat([rgb_df, pd.DataFrame(df_preprocessed['sidebar_color_rgb'].to_list(), columns=['sidebar_R', 'sidebar_G', 'sidebar_B'])], axis=1)
# Drop the original color features
df_preprocessed = df_preprocessed.drop(columns=['link_color', 'sidebar_color', 'link_color_rgb', 'sidebar_color_rgb'])
# Check if all required features are there
print()
print('All Remaining Features')
print(df_preprocessed.columns.tolist())
# Define the numerical features to scale (filtering for int64 and float64 columns)
numerical_features = df_preprocessed.select_dtypes(include=[np.number])
# print(f'All current numerical features are {numerical_features.columns.tolist()}')
print()
print('Dataset Overview After PreProcessing')
print(df_preprocessed.info())
print()
print('---- NLP Processing ----')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
df_preprocessed['description'].fillna('', inplace=True)
df_preprocessed['text'].fillna('', inplace=True)
# df_preprocessed['name'].fillna('', inplace=True)
# Check the text features if they still contain NaN
print()
print(df_preprocessed.select_dtypes(include=[object]))
# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Apply preprocessing to the 'description', 'text', and 'name' columns
df_preprocessed['cleaned_description'] = df_preprocessed['description'].apply(lambda x: preprocess_text(str(x)))
df_preprocessed['cleaned_text'] = df_preprocessed['text'].apply(lambda x: preprocess_text(str(x)))
# df_preprocessed['cleaned_name'] = df_preprocessed['name'].apply(lambda x: preprocess_text(str(x)))
# Check the preprocessed data with preprocessed text features
print(df_preprocessed[['description', 'cleaned_description', 'text', 'cleaned_text']].head())
# Drop the original text features
df_preprocessed = df_preprocessed.drop(columns=['description','text'])
# Initialize TFIDF vectorizer for text features
print()
print('Applying TF-IDF Vectorisation...')
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
# Apply TF-IDF on 'description', 'text', 'name' columns
tfidf_description = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_description']).toarray()
tfidf_text = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_text']).toarray()
# tfidf_name = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_name']).toarray()
# Convert TF-IDF into DataFrames and add to df_preprocessed
tfidf_desc_df = pd.DataFrame(tfidf_description, columns=[f'desc_{i}' for i in range(tfidf_description.shape[1])])
tfidf_text_df = pd.DataFrame(tfidf_text, columns=[f'text_{i}' for i in range(tfidf_text.shape[1])])
# tfidf_name_df = pd.DataFrame(tfidf_name, columns=[f'name_{i}' for i in range(tfidf_name.shape[1])])
# Merge with main dataframe
df_preprocessed = pd.concat([df_preprocessed.reset_index(drop=True), tfidf_desc_df, tfidf_text_df], axis=1)
# Drop the cleaned text features
df_preprocessed = df_preprocessed.drop(columns=['cleaned_description', 'cleaned_text'])
df_preprocessed = pd.concat([df_preprocessed, rgb_df], axis=1)
df_asso = df_preprocessed.copy()
df_cate = df_preprocessed[['tweet_location_encoded', 'user_timezone_encoded']].copy()
Dataset Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 _unit_id 20050 non-null int64
1 _golden 20050 non-null bool
2 _unit_state 20050 non-null object
3 _trusted_judgments 20050 non-null int64
4 _last_judgment_at 20000 non-null object
5 gender 19953 non-null object
6 gender:confidence 20024 non-null float64
7 profile_yn 20050 non-null object
8 profile_yn:confidence 20050 non-null float64
9 created 20050 non-null object
10 description 16306 non-null object
11 fav_number 20050 non-null int64
12 gender_gold 50 non-null object
13 link_color 20050 non-null object
14 name 20050 non-null object
15 profile_yn_gold 50 non-null object
16 profileimage 20050 non-null object
17 retweet_count 20050 non-null int64
18 sidebar_color 20050 non-null object
19 text 20050 non-null object
20 tweet_coord 159 non-null object
21 tweet_count 20050 non-null int64
22 tweet_created 20050 non-null object
23 tweet_id 20050 non-null float64
24 tweet_location 12565 non-null object
25 user_timezone 12252 non-null object
dtypes: bool(1), float64(3), int64(5), object(17)
memory usage: 3.8+ MB
None
_unit_id _golden _unit_state _trusted_judgments _last_judgment_at \
0 815719226 False finalized 3 10/26/15 23:24
1 815719227 False finalized 3 10/26/15 23:30
2 815719228 False finalized 3 10/26/15 23:33
3 815719229 False finalized 3 10/26/15 23:10
4 815719230 False finalized 3 10/27/15 1:15
gender gender:confidence profile_yn profile_yn:confidence \
0 male 1.0000 yes 1.0
1 male 1.0000 yes 1.0
2 male 0.6625 yes 1.0
3 male 1.0000 yes 1.0
4 female 1.0000 yes 1.0
created ... profileimage \
0 12/5/13 1:48 ... https://pbs.twimg.com/profile_images/414342229...
1 10/1/12 13:51 ... https://pbs.twimg.com/profile_images/539604221...
2 11/28/14 11:30 ... https://pbs.twimg.com/profile_images/657330418...
3 6/11/09 22:39 ... https://pbs.twimg.com/profile_images/259703936...
4 4/16/14 13:23 ... https://pbs.twimg.com/profile_images/564094871...
retweet_count sidebar_color \
0 0 FFFFFF
1 0 C0DEED
2 1 C0DEED
3 0 C0DEED
4 0 0
text tweet_coord tweet_count \
0 Robbie E Responds To Critics After Win Against... NaN 110964
1 ÛÏIt felt like they were my friends and I was... NaN 7471
2 i absolutely adore when louis starts the songs... NaN 5617
3 Hi @JordanSpieth - Looking at the url - do you... NaN 1693
4 Watching Neighbours on Sky+ catching up with t... NaN 31462
tweet_created tweet_id tweet_location user_timezone
0 10/26/15 12:40 6.587300e+17 main; @Kan1shk3 Chennai
1 10/26/15 12:40 6.587300e+17 NaN Eastern Time (US & Canada)
2 10/26/15 12:40 6.587300e+17 clcncl Belgrade
3 10/26/15 12:40 6.587300e+17 Palo Alto, CA Pacific Time (US & Canada)
4 10/26/15 12:40 6.587300e+17 NaN NaN
[5 rows x 26 columns]
Finding columns with missing data...
Column _last_judgment_at is missing 50 values.
Proportion of missing data is 0.0024937655860349127.
Column gender is missing 97 values.
Proportion of missing data is 0.00483790523690773.
Column gender:confidence is missing 26 values.
Proportion of missing data is 0.0012967581047381546.
Column description is missing 3744 values.
Proportion of missing data is 0.18673316708229426.
Column gender_gold is missing 20000 values.
Proportion of missing data is 0.9975062344139651.
Dropping column gender_gold...
Column profile_yn_gold is missing 20000 values.
Proportion of missing data is 0.9975062344139651.
Dropping column profile_yn_gold...
Column tweet_coord is missing 19891 values.
Proportion of missing data is 0.992069825436409.
Dropping column tweet_coord...
Column tweet_location is missing 7485 values.
Proportion of missing data is 0.3733167082294264.
Column user_timezone is missing 7798 values.
Proportion of missing data is 0.388927680798005.
Dataset Overview
<class 'pandas.core.frame.DataFrame'>
Index: 19953 entries, 0 to 20049
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 _unit_id 19953 non-null int64
1 _golden 19953 non-null bool
2 _unit_state 19953 non-null object
3 _trusted_judgments 19953 non-null int64
4 _last_judgment_at 19903 non-null object
5 gender 19953 non-null object
6 gender:confidence 19953 non-null float64
7 profile_yn:confidence 19953 non-null float64
8 created 19953 non-null object
9 description 16224 non-null object
10 fav_number 19953 non-null int64
11 link_color 19953 non-null object
12 name 19953 non-null object
13 profileimage 19953 non-null object
14 retweet_count 19953 non-null int64
15 sidebar_color 19953 non-null object
16 text 19953 non-null object
17 tweet_count 19953 non-null int64
18 tweet_created 19953 non-null object
19 tweet_id 19953 non-null float64
20 tweet_location 12510 non-null object
21 user_timezone 12185 non-null object
dtypes: bool(1), float64(3), int64(5), object(13)
memory usage: 3.4+ MB
None
_unit_id _golden _unit_state _trusted_judgments _last_judgment_at \
0 815719226 False finalized 3 10/26/15 23:24
1 815719227 False finalized 3 10/26/15 23:30
2 815719228 False finalized 3 10/26/15 23:33
3 815719229 False finalized 3 10/26/15 23:10
4 815719230 False finalized 3 10/27/15 1:15
gender gender:confidence profile_yn:confidence created \
0 male 1.0000 1.0 12/5/13 1:48
1 male 1.0000 1.0 10/1/12 13:51
2 male 0.6625 1.0 11/28/14 11:30
3 male 1.0000 1.0 6/11/09 22:39
4 female 1.0000 1.0 4/16/14 13:23
description ... name \
0 i sing my own rhythm. ... sheezy0
1 I'm the author of novels filled with family dr... ... DavdBurnett
2 louis whining and squealing and all ... lwtprettylaugh
3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe... ... douggarland
4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... ... WilfordGemma
profileimage retweet_count \
0 https://pbs.twimg.com/profile_images/414342229... 0
1 https://pbs.twimg.com/profile_images/539604221... 0
2 https://pbs.twimg.com/profile_images/657330418... 1
3 https://pbs.twimg.com/profile_images/259703936... 0
4 https://pbs.twimg.com/profile_images/564094871... 0
sidebar_color text \
0 FFFFFF Robbie E Responds To Critics After Win Against...
1 C0DEED ÛÏIt felt like they were my friends and I was...
2 C0DEED i absolutely adore when louis starts the songs...
3 C0DEED Hi @JordanSpieth - Looking at the url - do you...
4 0 Watching Neighbours on Sky+ catching up with t...
tweet_count tweet_created tweet_id tweet_location \
0 110964 10/26/15 12:40 6.587300e+17 main; @Kan1shk3
1 7471 10/26/15 12:40 6.587300e+17 NaN
2 5617 10/26/15 12:40 6.587300e+17 clcncl
3 1693 10/26/15 12:40 6.587300e+17 Palo Alto, CA
4 31462 10/26/15 12:40 6.587300e+17 NaN
user_timezone
0 Chennai
1 Eastern Time (US & Canada)
2 Belgrade
3 Pacific Time (US & Canada)
4 NaN
[5 rows x 22 columns]
---- EXPLORATORY DATA ANALYSIS (EDA) ----
Number of NaN values in 'link_color': 0. Number of NaN values in 'sidebar_color': 0. Number of link color is 2986. Number of side bar color is 559. Number of NaN values in 'link_color': 0 Number of NaN values in 'sidebar_color': 0
Unique Values in 'gender' [0 1 2] <class 'pandas.core.frame.DataFrame'> Index: 18836 entries, 0 to 20049 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 18836 non-null int64 1 gender:confidence 18836 non-null float64 2 description 15522 non-null object 3 favorites_per_day 18836 non-null float64 4 link_color 18836 non-null object 5 retweets_per_day 18836 non-null float64 6 sidebar_color 18836 non-null object 7 text 18836 non-null object 8 tweets_per_day 18836 non-null float64 9 user_timezone 18836 non-null object 10 tweet_location 18836 non-null object 11 profile_created_year 18836 non-null int32 12 tweet_created_year 18836 non-null int32 13 tweet_location_encoded 18836 non-null float64 14 user_timezone_encoded 18836 non-null float64 dtypes: float64(6), int32(2), int64(1), object(6) memory usage: 2.2+ MB None
All Remaining Features ['gender', 'gender:confidence', 'description', 'favorites_per_day', 'retweets_per_day', 'text', 'tweets_per_day', 'profile_created_year', 'tweet_created_year', 'tweet_location_encoded', 'user_timezone_encoded'] Dataset Overview After PreProcessing <class 'pandas.core.frame.DataFrame'> Index: 18836 entries, 0 to 20049 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 18836 non-null int64 1 gender:confidence 18836 non-null float64 2 description 15522 non-null object 3 favorites_per_day 18836 non-null float64 4 retweets_per_day 18836 non-null float64 5 text 18836 non-null object 6 tweets_per_day 18836 non-null float64 7 profile_created_year 18836 non-null int32 8 tweet_created_year 18836 non-null int32 9 tweet_location_encoded 18836 non-null float64 10 user_timezone_encoded 18836 non-null float64 dtypes: float64(6), int32(2), int64(1), object(2) memory usage: 1.6+ MB None ---- NLP Processing ----
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package punkt_tab to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package punkt_tab is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Owner\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
description \
0 i sing my own rhythm.
1 I'm the author of novels filled with family dr...
2 louis whining and squealing and all
3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe...
4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...
... ...
20045 (rp)
20046 Whatever you like, it's not a problem at all. ...
20047 #TeamBarcelona ..You look lost so you should f...
20048 Anti-statist; I homeschool my kids. Aspiring t...
20049 Teamwork makes the dream work.
text
0 Robbie E Responds To Critics After Win Against...
1 ÛÏIt felt like they were my friends and I was...
2 i absolutely adore when louis starts the songs...
3 Hi @JordanSpieth - Looking at the url - do you...
4 Watching Neighbours on Sky+ catching up with t...
... ...
20045 @lookupondeath ...Fine, and I'll drink tea too...
20046 Greg Hardy you a good player and all but don't...
20047 You can miss people and still never want to se...
20048 @bitemyapp i had noticed your tendency to pee ...
20049 I think for my APUSH creative project I'm goin...
[18836 rows x 2 columns]
description \
0 i sing my own rhythm.
1 I'm the author of novels filled with family dr...
2 louis whining and squealing and all
3 Mobile guy. 49ers, Shazam, Google, Kleiner Pe...
4 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...
cleaned_description \
0 sing rhythm
1 im author novel filled family drama romance
2 louis whining squealing
3 mobile guy er shazam google kleiner perkins ya...
4 ricky wilson best frontmankaiser chief best ba...
text \
0 Robbie E Responds To Critics After Win Against...
1 ÛÏIt felt like they were my friends and I was...
2 i absolutely adore when louis starts the songs...
3 Hi @JordanSpieth - Looking at the url - do you...
4 Watching Neighbours on Sky+ catching up with t...
cleaned_text
0 robbie e responds critic win eddie edward worl...
1 felt like friend living story httpstcoarngeyhn...
2 absolutely adore louis start song hit hard fee...
3 hi jordanspieth looking url use ifttt dont typ...
4 watching neighbour sky catching neighbs xxx xxx
Applying TF-IDF Vectorisation...
CLUSTERING¶
In [3]:
print()
print()
print('---- CLUSTERING MODELS ----')
print()
print("=" * 50)
print('EXP 1: USING ALL SELECTED FEATURES')
print("=" * 50)
sil_ex1 = []
cal_ex1 = []
# Drop the gender and categorical features before normalise
df_cat = df_cate.copy()
# Drop gender feature and categorical features
df_preprocessed = df_preprocessed.drop(columns=df_cat.columns)
df_finalised = df_preprocessed.drop(columns=['gender', 'gender:confidence'])
# Normalise every existing feature
scaler = StandardScaler()
df_finalised = pd.DataFrame(scaler.fit_transform(df_finalised), columns=df_finalised.columns)
df_finalised = pd.concat([df_finalised, df_cat, df_gender], axis=1)
# find the rows that contained NaN values and drop them
df_finalised = df_finalised.dropna()
data_exp1 = df_finalised
df_ex1 = df_finalised.drop(columns=['gender', 'gender:confidence'])
# Check the preprocessed dataset in the present
print()
print('Dataset for Exp 1')
print(df_ex1.info())
print()
# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_vis = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(df_ex1)
umap_plot = umap_vis.fit_transform(df_ex1)
print(umap_embedding.shape)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding)
kmeans_clustering.tune_hyperparameters()
kmeans_exp1 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_plot, 'All feature types')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex1.append(kmeans_clustering.silhoutte())
cal_ex1.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(data_exp1, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 1')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 1')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding)
dbscan_clustering.tune_hyperparameters()
dbscan_exp1 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_plot, 'All feature types')
db_labels = dbscan_clustering.output_label()
sil_ex1.append(dbscan_clustering.silhoutte())
cal_ex1.append(dbscan_clustering.calinski())
# Initialize the class to retrieve data
db_retriever = ClusteringDataRetriever(data_exp1, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 1')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 1')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print("=" * 50)
print('EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES')
print("=" * 50)
sil_ex2 = []
cal_ex2 = []
# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_num.shape[0], chunk_size):
df_num.iloc[i:i + chunk_size] = scaler.fit_transform(df_num.iloc[i:i + chunk_size])
df_no_text = pd.concat([df_num, df_cate, df_gender], axis=1)
print()
print("Data with Only Numerical and Categorical Features")
print(df_no_text.info())
print()
df_no_text = df_no_text.dropna()
df_no_text_wg = df_no_text.copy()
print('Removing NaN values...')
# Drop gender feature before clustering
data_exp2 = df_no_text.drop(columns=['gender', 'gender:confidence'])
print('Dropping gender and gender:confidence...')
# Check No. of records after drop NaN values
print()
print("Dataset for Exp 2")
print(data_exp2.info())
print()
print(data_exp2.head())
# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(data_exp2)
print(umap_embedding.shape)
# umap_embedding = umap_embedding.astype(np.float32)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(data_exp2)
kmeans_clustering.tune_hyperparameters()
kmeans_exp2 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Numerical and categorical features') # Visualize clusters
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex2.append(kmeans_clustering.silhoutte())
cal_ex2.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(df_no_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 2')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 2')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(data_exp2)
dbscan_clustering.tune_hyperparameters() # Tune DBSCAN hyperparameters
dbscan_exp2 = dbscan_clustering.fit_model() # Fit the DBSCAN model
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'numerical and categorical features') # Plot 3D noise points and valid clusters
db_labels = dbscan_clustering.output_label()
sil_ex2.append(dbscan_clustering.silhoutte())
cal_ex2.append(dbscan_clustering.calinski())
db_retriever = ClusteringDataRetriever(df_no_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 2')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 2')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print("=" * 50)
print('EXP 3: USING ONLY TEXT FEATURES')
print("=" * 50)
sil_ex3 = []
cal_ex3 = []
# Merge with main dataframe
df_with_text = pd.concat([tfidf_desc_df, tfidf_text_df], axis=1)
# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_with_text.shape[0], chunk_size):
df_with_text.iloc[i:i + chunk_size] = scaler.fit_transform(df_with_text.iloc[i:i + chunk_size])
df_with_text_wg = pd.concat([df_with_text, df_gender], axis=1)
# Drop NaN values before clustering
df_with_text_wg = df_with_text_wg.dropna()
data_exp3 = df_with_text_wg.drop(columns=['gender', 'gender:confidence'])
# Drop the gender features before clustering
print('Dataset for Exp 3')
print(data_exp3.info())
print()
print(data_exp3.head())
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_embedding_t = umap_model.fit_transform(data_exp3)
umap_embedding = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42).fit_transform(data_exp3)
# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding_t)
kmeans_clustering.tune_hyperparameters()
kmeans_exp3 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Text features')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex3.append(kmeans_clustering.silhoutte())
cal_ex3.append(kmeans_clustering.calinski())
k_retriever = ClusteringDataRetriever(df_with_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 3')
print(df_with_labels.head())
for label in np.unique(k_labels):
print()
print(f'Records found in cluster {label} from KMeans in Exp 3')
print(k_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
# DBSCANClustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding_t)
dbscan_clustering.tune_hyperparameters()
dbscan_exp3 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'Text features')
db_labels = dbscan_clustering.output_label()
sil_ex3.append(dbscan_clustering.silhoutte())
cal_ex3.append(dbscan_clustering.calinski())
db_retriever = ClusteringDataRetriever(df_with_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 3')
print(df_with_labels.head())
for label in np.unique(db_labels):
if label != -1:
print()
print(f'Records found in cluster {label} from DBSCAN in Exp 3')
print(db_retriever.get_cluster_data(label))
print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())
print()
print('---- VISUALIZE THE METRIC EVALUATION ----')
# Metric functions
model_names = ['KMeans', 'DBSCAN']
sil_scores = [sil_ex1, sil_ex2, sil_ex3]
cal_scores = [cal_ex1, cal_ex2, cal_ex3]
plot_silhouette_bar_across_experiments(model_names, sil_scores)
visualize_ch_index_across_experiments(model_names, cal_scores)
---- CLUSTERING MODELS ---- ================================================== EXP 1: USING ALL SELECTED FEATURES ================================================== Dataset for Exp 1 <class 'pandas.core.frame.DataFrame'> Index: 17702 entries, 0 to 18835 Columns: 3013 entries, favorites_per_day to user_timezone_encoded dtypes: float64(3013) memory usage: 407.1 MB None Applying UMAP for dim reduction...
[I 2024-09-20 16:20:19,495] A new study created in memory with name: no-name-f656c2a4-43f7-454b-87f6-e1b8bbb5ba19
(17702, 2) Performing K-Means Clustering...
[I 2024-09-20 16:20:24,756] Trial 0 finished with value: 0.44721555709838867 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:29,547] Trial 1 finished with value: 0.40816256403923035 and parameters: {'n_clusters': 9, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:34,470] Trial 2 finished with value: 0.43370768427848816 and parameters: {'n_clusters': 10, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:39,242] Trial 3 finished with value: 0.4106582999229431 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:44,060] Trial 4 finished with value: 0.3901534974575043 and parameters: {'n_clusters': 8, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:48,864] Trial 5 finished with value: 0.4233592748641968 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:53,940] Trial 6 finished with value: 0.44721555709838867 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:20:58,788] Trial 7 finished with value: 0.3933861553668976 and parameters: {'n_clusters': 7, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:21:03,521] Trial 8 finished with value: 0.4233592748641968 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:21:08,516] Trial 9 finished with value: 0.43466559052467346 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 0 with value: 0.44721555709838867.
[I 2024-09-20 16:21:13,893] Trial 10 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
[I 2024-09-20 16:21:19,213] Trial 11 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
[I 2024-09-20 16:21:24,493] Trial 12 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
[I 2024-09-20 16:21:29,872] Trial 13 finished with value: 0.7726734280586243 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
[I 2024-09-20 16:21:35,112] Trial 14 finished with value: 0.43466559052467346 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 10 with value: 0.7726734280586243.
Best params: {'n_clusters': 2, 'init': 'random'}
The Silhouette score is 0.7726734280586243
The Callinski index is 20992.505859375
Dataset with Labels from KMeans in Exp 1
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from KMeans in Exp 1
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[16379 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5439
No. of records with gender 1 in cluster 0 is 5773
No. of records with gender 2 in cluster 0 is 5167
Records found in cluster 1 from KMeans in Exp 1
[I 2024-09-20 16:21:42,026] A new study created in memory with name: no-name-ad1593d8-66bc-4c0f-9d74-f56f96710d50
gender gender:confidence Cluster_Label 7 0.0 1.0000 1 33 0.0 1.0000 1 49 2.0 1.0000 1 56 1.0 0.6684 1 58 0.0 1.0000 1 ... ... ... ... 18738 2.0 1.0000 1 18753 0.0 0.6678 1 18759 0.0 0.6386 1 18789 0.0 1.0000 1 18803 1.0 1.0000 1 [1323 rows x 3 columns] No. of records with gender 0 in cluster 1 is 404 No. of records with gender 1 in cluster 1 is 428 No. of records with gender 2 in cluster 1 is 491 Performing DBSCAN Clustering...
[I 2024-09-20 16:21:48,312] Trial 0 finished with value: 0.3155621588230133 and parameters: {'eps': 1.5913067486466435, 'min_samples': 6}. Best is trial 0 with value: 0.3155621588230133.
[I 2024-09-20 16:21:54,152] Trial 1 finished with value: 0.24721910059452057 and parameters: {'eps': 1.0376530894652887, 'min_samples': 18}. Best is trial 0 with value: 0.3155621588230133.
[I 2024-09-20 16:22:00,118] Trial 2 finished with value: 0.2345193773508072 and parameters: {'eps': 1.08924832783019, 'min_samples': 7}. Best is trial 0 with value: 0.3155621588230133.
[I 2024-09-20 16:22:06,672] Trial 3 finished with value: 0.3255881667137146 and parameters: {'eps': 1.9565357155432446, 'min_samples': 4}. Best is trial 3 with value: 0.3255881667137146.
[I 2024-09-20 16:22:13,131] Trial 4 finished with value: 0.32468611001968384 and parameters: {'eps': 1.9655521749248066, 'min_samples': 17}. Best is trial 3 with value: 0.3255881667137146.
[I 2024-09-20 16:22:19,013] Trial 5 finished with value: 0.26063308119773865 and parameters: {'eps': 0.9674339846692939, 'min_samples': 14}. Best is trial 3 with value: 0.3255881667137146.
[I 2024-09-20 16:22:25,335] Trial 6 finished with value: 0.32788148522377014 and parameters: {'eps': 1.7693479090782473, 'min_samples': 9}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:31,052] Trial 7 finished with value: 0.24578818678855896 and parameters: {'eps': 0.7826789736238435, 'min_samples': 19}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:36,540] Trial 8 finished with value: -0.14658115804195404 and parameters: {'eps': 0.34017243144029763, 'min_samples': 4}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:42,106] Trial 9 finished with value: 0.0954396203160286 and parameters: {'eps': 0.490850883967341, 'min_samples': 20}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:48,410] Trial 10 finished with value: 0.24460361897945404 and parameters: {'eps': 1.4333032533727734, 'min_samples': 10}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:22:54,901] Trial 11 finished with value: 0.32556405663490295 and parameters: {'eps': 1.9767937461657843, 'min_samples': 10}. Best is trial 6 with value: 0.32788148522377014.
[I 2024-09-20 16:23:01,305] Trial 12 finished with value: 0.33137843012809753 and parameters: {'eps': 1.6198251417047203, 'min_samples': 3}. Best is trial 12 with value: 0.33137843012809753.
[I 2024-09-20 16:23:07,690] Trial 13 finished with value: 0.32246026396751404 and parameters: {'eps': 1.528098496701475, 'min_samples': 13}. Best is trial 12 with value: 0.33137843012809753.
[I 2024-09-20 16:23:14,050] Trial 14 finished with value: 0.3302082121372223 and parameters: {'eps': 1.6778064207338765, 'min_samples': 8}. Best is trial 12 with value: 0.33137843012809753.
Found best params: {'eps': 1.6198251417047203, 'min_samples': 3}
The Silhouette score is 0.33137843012809753
The Callinski index is 1748.1387939453125
Dataset with Labels from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[15976 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5308
No. of records with gender 1 in cluster 0 is 5667
No. of records with gender 2 in cluster 0 is 5001
Records found in cluster 1 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
7 0.0 1.0000 1
33 0.0 1.0000 1
49 2.0 1.0000 1
56 1.0 0.6684 1
58 0.0 1.0000 1
132 1.0 1.0000 1
153 2.0 1.0000 1
191 2.0 0.6804 1
192 0.0 1.0000 1
199 1.0 1.0000 1
231 1.0 1.0000 1
243 0.0 1.0000 1
250 2.0 1.0000 1
288 1.0 0.6494 1
308 1.0 0.6752 1
390 1.0 0.6786 1
460 2.0 0.6708 1
503 0.0 1.0000 1
No. of records with gender 0 in cluster 1 is 6
No. of records with gender 1 in cluster 1 is 7
No. of records with gender 2 in cluster 1 is 5
Records found in cluster 2 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
282 1.0 1.0000 2
2135 2.0 1.0000 2
2929 0.0 1.0000 2
3229 0.0 1.0000 2
3770 0.0 1.0000 2
... ... ... ...
9194 2.0 1.0000 2
9195 1.0 1.0000 2
9220 2.0 1.0000 2
9283 2.0 0.6659 2
9293 0.0 1.0000 2
[180 rows x 3 columns]
No. of records with gender 0 in cluster 2 is 55
No. of records with gender 1 in cluster 2 is 48
No. of records with gender 2 in cluster 2 is 77
Records found in cluster 3 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
302 1.0 1.0000 3
1402 0.0 0.3539 3
2347 2.0 0.6757 3
2964 1.0 1.0000 3
4898 0.0 1.0000 3
5276 2.0 0.6632 3
5379 0.0 1.0000 3
5536 2.0 0.6943 3
5949 1.0 0.6848 3
6017 1.0 0.3486 3
6245 2.0 1.0000 3
6298 0.0 1.0000 3
6374 2.0 1.0000 3
6466 2.0 1.0000 3
6882 0.0 0.6879 3
6904 2.0 0.6842 3
7434 2.0 1.0000 3
7625 0.0 1.0000 3
7662 0.0 1.0000 3
7745 1.0 1.0000 3
7811 2.0 0.6341 3
7910 2.0 1.0000 3
8159 2.0 1.0000 3
8331 2.0 0.6716 3
8340 2.0 0.6707 3
8401 0.0 0.6732 3
8487 0.0 0.6806 3
8489 0.0 1.0000 3
8505 1.0 1.0000 3
8535 2.0 1.0000 3
8583 0.0 1.0000 3
8622 0.0 0.6634 3
8623 2.0 0.6778 3
8647 2.0 1.0000 3
8690 2.0 1.0000 3
8764 2.0 0.6674 3
8784 2.0 1.0000 3
8859 2.0 1.0000 3
8925 0.0 1.0000 3
8930 2.0 1.0000 3
8971 1.0 1.0000 3
9001 1.0 1.0000 3
9055 1.0 1.0000 3
9076 2.0 1.0000 3
9089 1.0 1.0000 3
9118 2.0 0.6712 3
9166 2.0 1.0000 3
9280 1.0 1.0000 3
14662 2.0 1.0000 3
15096 2.0 0.3410 3
15533 1.0 0.6619 3
15979 0.0 1.0000 3
16380 0.0 1.0000 3
16802 2.0 0.3531 3
17226 1.0 1.0000 3
17617 1.0 1.0000 3
18272 0.0 0.6686 3
No. of records with gender 0 in cluster 3 is 16
No. of records with gender 1 in cluster 3 is 14
No. of records with gender 2 in cluster 3 is 27
Records found in cluster 4 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
426 2.0 1.0000 4
432 0.0 1.0000 4
1992 0.0 1.0000 4
2776 0.0 1.0000 4
3755 2.0 1.0000 4
3769 2.0 0.6497 4
3784 2.0 1.0000 4
4418 1.0 1.0000 4
5352 1.0 1.0000 4
9341 2.0 1.0000 4
9379 0.0 1.0000 4
10138 1.0 1.0000 4
10451 0.0 0.6824 4
13349 0.0 1.0000 4
14425 0.0 0.6628 4
14668 2.0 1.0000 4
16449 1.0 1.0000 4
16881 1.0 0.6733 4
No. of records with gender 0 in cluster 4 is 7
No. of records with gender 1 in cluster 4 is 5
No. of records with gender 2 in cluster 4 is 6
Records found in cluster 5 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
431 0.0 0.6631 5
4374 2.0 1.0000 5
4456 1.0 1.0000 5
4653 2.0 1.0000 5
5008 2.0 1.0000 5
5044 2.0 1.0000 5
5220 2.0 0.6650 5
5533 2.0 1.0000 5
5580 0.0 1.0000 5
5596 2.0 1.0000 5
5662 1.0 1.0000 5
5749 2.0 1.0000 5
5988 2.0 1.0000 5
6669 0.0 1.0000 5
7261 0.0 1.0000 5
7702 2.0 0.7012 5
7771 2.0 1.0000 5
7898 2.0 1.0000 5
8120 1.0 1.0000 5
8248 1.0 1.0000 5
8295 2.0 0.6579 5
8360 2.0 0.6854 5
8984 2.0 0.6890 5
9100 0.0 1.0000 5
No. of records with gender 0 in cluster 5 is 5
No. of records with gender 1 in cluster 5 is 4
No. of records with gender 2 in cluster 5 is 15
Records found in cluster 6 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
502 0.0 1.0000 6
578 1.0 1.0000 6
644 0.0 1.0000 6
771 0.0 1.0000 6
963 2.0 1.0000 6
1433 1.0 1.0000 6
1881 0.0 0.6691 6
2762 2.0 0.6670 6
2903 1.0 0.6763 6
3308 0.0 0.3364 6
3353 0.0 1.0000 6
3681 2.0 1.0000 6
3830 0.0 1.0000 6
4305 1.0 1.0000 6
5040 0.0 1.0000 6
5479 0.0 0.6857 6
5742 0.0 1.0000 6
6460 2.0 1.0000 6
6862 1.0 1.0000 6
8397 2.0 0.6634 6
8516 2.0 0.6839 6
8918 2.0 1.0000 6
No. of records with gender 0 in cluster 6 is 10
No. of records with gender 1 in cluster 6 is 5
No. of records with gender 2 in cluster 6 is 7
Records found in cluster 7 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
513 2.0 1.0000 7
514 0.0 1.0000 7
520 0.0 0.3458 7
553 0.0 1.0000 7
554 0.0 0.3431 7
555 0.0 1.0000 7
556 0.0 1.0000 7
557 0.0 1.0000 7
560 1.0 1.0000 7
564 1.0 1.0000 7
565 1.0 1.0000 7
566 2.0 0.6829 7
576 0.0 1.0000 7
577 2.0 1.0000 7
1102 1.0 0.6777 7
2660 0.0 0.3478 7
4100 2.0 1.0000 7
4344 2.0 1.0000 7
4370 0.0 1.0000 7
4426 2.0 0.6838 7
4444 0.0 0.6422 7
4489 1.0 1.0000 7
4643 0.0 1.0000 7
4781 2.0 0.6475 7
4896 2.0 1.0000 7
4950 1.0 1.0000 7
4967 0.0 1.0000 7
5030 0.0 1.0000 7
5176 1.0 1.0000 7
5256 2.0 0.6475 7
5355 0.0 1.0000 7
5356 0.0 1.0000 7
5427 1.0 1.0000 7
5448 2.0 0.6654 7
7995 2.0 1.0000 7
8037 0.0 0.6374 7
8233 0.0 1.0000 7
10824 0.0 1.0000 7
No. of records with gender 0 in cluster 7 is 19
No. of records with gender 1 in cluster 7 is 8
No. of records with gender 2 in cluster 7 is 11
Records found in cluster 8 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
548 2.0 0.6672 8
4512 0.0 1.0000 8
7351 2.0 0.6667 8
7473 1.0 1.0000 8
10589 0.0 0.6623 8
12139 0.0 1.0000 8
12845 0.0 1.0000 8
12988 2.0 0.6557 8
14702 2.0 1.0000 8
17727 0.0 1.0000 8
No. of records with gender 0 in cluster 8 is 5
No. of records with gender 1 in cluster 8 is 1
No. of records with gender 2 in cluster 8 is 4
Records found in cluster 9 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
570 2.0 0.6616 9
3168 1.0 1.0000 9
11317 2.0 1.0000 9
11909 1.0 1.0000 9
14448 0.0 1.0000 9
14613 0.0 1.0000 9
14791 1.0 1.0000 9
15015 1.0 1.0000 9
15216 0.0 1.0000 9
No. of records with gender 0 in cluster 9 is 3
No. of records with gender 1 in cluster 9 is 4
No. of records with gender 2 in cluster 9 is 2
Records found in cluster 10 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
575 0.0 1.0000 10
1308 0.0 0.6479 10
2033 1.0 1.0000 10
2308 1.0 0.6774 10
3898 0.0 1.0000 10
5454 2.0 0.6774 10
5539 1.0 1.0000 10
5628 2.0 1.0000 10
5825 1.0 1.0000 10
5847 2.0 0.6717 10
6012 0.0 1.0000 10
6048 2.0 0.6796 10
6108 0.0 1.0000 10
6114 1.0 0.6620 10
6335 2.0 1.0000 10
6382 2.0 0.6842 10
6417 2.0 1.0000 10
7843 2.0 1.0000 10
8181 0.0 1.0000 10
8355 2.0 0.6778 10
8738 0.0 1.0000 10
No. of records with gender 0 in cluster 10 is 7
No. of records with gender 1 in cluster 10 is 5
No. of records with gender 2 in cluster 10 is 9
Records found in cluster 11 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
599 1.0 1.0000 11
1268 2.0 1.0000 11
2138 1.0 1.0000 11
2145 0.0 1.0000 11
2146 1.0 1.0000 11
2147 1.0 1.0000 11
2148 1.0 0.3576 11
2156 0.0 1.0000 11
2166 1.0 1.0000 11
2168 0.0 0.6825 11
2169 1.0 1.0000 11
2171 1.0 1.0000 11
2172 0.0 1.0000 11
2182 2.0 1.0000 11
2185 0.0 1.0000 11
2186 0.0 0.3403 11
2187 1.0 1.0000 11
2188 2.0 0.6812 11
2189 0.0 0.6582 11
2191 0.0 1.0000 11
2194 1.0 1.0000 11
2196 1.0 1.0000 11
2204 1.0 0.6587 11
2205 0.0 0.6685 11
2206 1.0 0.6551 11
2207 1.0 1.0000 11
2210 1.0 1.0000 11
2216 1.0 0.6896 11
2217 1.0 0.6832 11
2220 1.0 1.0000 11
2223 2.0 1.0000 11
2682 1.0 0.6473 11
2860 0.0 1.0000 11
2862 0.0 1.0000 11
2863 0.0 0.3370 11
2866 2.0 0.6497 11
2870 2.0 0.6368 11
2872 0.0 0.6855 11
2873 1.0 0.6940 11
3360 1.0 1.0000 11
5548 2.0 1.0000 11
6616 1.0 1.0000 11
7610 2.0 0.6578 11
8509 2.0 0.6731 11
9305 2.0 0.6606 11
10714 0.0 1.0000 11
12324 1.0 1.0000 11
14170 1.0 1.0000 11
15223 0.0 1.0000 11
16735 0.0 0.6563 11
No. of records with gender 0 in cluster 11 is 16
No. of records with gender 1 in cluster 11 is 24
No. of records with gender 2 in cluster 11 is 10
Records found in cluster 12 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
822 0.0 0.6473 12
1536 2.0 0.6591 12
11119 1.0 1.0000 12
11627 2.0 0.6796 12
11727 2.0 1.0000 12
12333 1.0 1.0000 12
12992 0.0 1.0000 12
13486 2.0 1.0000 12
13980 0.0 1.0000 12
14046 0.0 1.0000 12
14958 2.0 1.0000 12
15597 1.0 0.3362 12
16706 0.0 1.0000 12
17090 0.0 1.0000 12
17186 1.0 1.0000 12
17599 0.0 0.6654 12
18270 0.0 1.0000 12
No. of records with gender 0 in cluster 12 is 8
No. of records with gender 1 in cluster 12 is 4
No. of records with gender 2 in cluster 12 is 5
Records found in cluster 13 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
941 2.0 0.6582 13
9955 2.0 1.0000 13
10078 0.0 1.0000 13
10115 0.0 1.0000 13
10194 1.0 1.0000 13
10234 2.0 0.3388 13
10298 0.0 0.3387 13
10354 2.0 0.6852 13
10391 1.0 1.0000 13
15703 1.0 1.0000 13
17106 0.0 1.0000 13
17709 0.0 1.0000 13
No. of records with gender 0 in cluster 13 is 5
No. of records with gender 1 in cluster 13 is 3
No. of records with gender 2 in cluster 13 is 4
Records found in cluster 14 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1040 1.0 1.0000 14
1045 2.0 0.6789 14
1049 1.0 1.0000 14
1051 2.0 1.0000 14
1052 1.0 1.0000 14
1054 1.0 1.0000 14
1061 0.0 1.0000 14
1064 1.0 0.6498 14
1065 0.0 1.0000 14
No. of records with gender 0 in cluster 14 is 2
No. of records with gender 1 in cluster 14 is 5
No. of records with gender 2 in cluster 14 is 2
Records found in cluster 15 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1108 1.0 0.6880 15
9382 2.0 1.0000 15
9398 1.0 1.0000 15
9475 0.0 1.0000 15
9496 0.0 1.0000 15
... ... ... ...
15207 1.0 1.0000 15
15391 2.0 1.0000 15
15439 2.0 1.0000 15
15622 2.0 1.0000 15
18398 0.0 0.6709 15
[70 rows x 3 columns]
No. of records with gender 0 in cluster 15 is 19
No. of records with gender 1 in cluster 15 is 25
No. of records with gender 2 in cluster 15 is 26
Records found in cluster 16 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1203 1.0 1.0000 16
1240 1.0 0.6889 16
2115 0.0 1.0000 16
2381 0.0 1.0000 16
3988 2.0 1.0000 16
5994 2.0 0.6611 16
7988 1.0 0.6734 16
8071 1.0 1.0000 16
10735 0.0 1.0000 16
10738 0.0 1.0000 16
11076 2.0 1.0000 16
11179 2.0 1.0000 16
11484 1.0 1.0000 16
11648 1.0 1.0000 16
11746 0.0 1.0000 16
12054 1.0 1.0000 16
13078 0.0 1.0000 16
14056 2.0 1.0000 16
15064 0.0 0.6534 16
15751 1.0 1.0000 16
15757 1.0 1.0000 16
16465 0.0 1.0000 16
16868 1.0 1.0000 16
17448 0.0 1.0000 16
18208 0.0 1.0000 16
18753 0.0 0.6678 16
No. of records with gender 0 in cluster 16 is 11
No. of records with gender 1 in cluster 16 is 10
No. of records with gender 2 in cluster 16 is 5
Records found in cluster 17 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1273 0.0 1.0000 17
1605 2.0 1.0000 17
1761 2.0 1.0000 17
1845 1.0 1.0000 17
1987 1.0 1.0000 17
2274 0.0 1.0000 17
3961 0.0 1.0000 17
4092 0.0 0.3411 17
4424 2.0 1.0000 17
5218 2.0 1.0000 17
5336 1.0 1.0000 17
5445 0.0 1.0000 17
5927 2.0 0.6721 17
5980 0.0 1.0000 17
6262 2.0 1.0000 17
6289 1.0 1.0000 17
7003 1.0 1.0000 17
7118 2.0 1.0000 17
7431 1.0 1.0000 17
7540 0.0 0.6859 17
7791 1.0 1.0000 17
8142 2.0 1.0000 17
8601 2.0 0.6700 17
8693 0.0 1.0000 17
9023 1.0 0.6654 17
9265 1.0 1.0000 17
No. of records with gender 0 in cluster 17 is 8
No. of records with gender 1 in cluster 17 is 9
No. of records with gender 2 in cluster 17 is 9
Records found in cluster 18 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1367 1.0 1.0000 18
2382 1.0 1.0000 18
2897 2.0 1.0000 18
3526 1.0 1.0000 18
4051 2.0 1.0000 18
6140 2.0 0.6679 18
7107 2.0 0.6865 18
7913 2.0 1.0000 18
8836 0.0 0.6645 18
No. of records with gender 0 in cluster 18 is 1
No. of records with gender 1 in cluster 18 is 3
No. of records with gender 2 in cluster 18 is 5
Records found in cluster 19 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1544 0.0 1.0000 19
2154 1.0 0.6561 19
3341 1.0 1.0000 19
3938 2.0 0.6545 19
4650 2.0 0.3571 19
5424 0.0 1.0000 19
6313 1.0 1.0000 19
8798 1.0 1.0000 19
No. of records with gender 0 in cluster 19 is 2
No. of records with gender 1 in cluster 19 is 4
No. of records with gender 2 in cluster 19 is 2
Records found in cluster 20 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
1844 2.0 1.0000 20
4712 0.0 1.0000 20
5611 2.0 0.6856 20
6066 2.0 0.6668 20
6133 0.0 0.6655 20
6204 0.0 1.0000 20
6291 2.0 1.0000 20
6299 0.0 0.3604 20
6478 2.0 0.6611 20
6668 0.0 1.0000 20
6786 2.0 0.6694 20
7058 1.0 1.0000 20
7102 0.0 1.0000 20
7130 2.0 1.0000 20
7158 1.0 1.0000 20
7176 1.0 1.0000 20
7210 0.0 0.6617 20
7228 2.0 0.6766 20
7259 0.0 1.0000 20
7300 1.0 1.0000 20
7304 1.0 1.0000 20
7332 2.0 0.6573 20
7417 1.0 1.0000 20
7441 1.0 1.0000 20
7502 1.0 0.6617 20
7507 0.0 0.6848 20
7629 2.0 1.0000 20
7697 1.0 1.0000 20
7738 2.0 1.0000 20
7751 2.0 1.0000 20
7759 2.0 1.0000 20
7830 1.0 1.0000 20
7908 2.0 1.0000 20
7975 0.0 1.0000 20
7977 2.0 0.6739 20
7980 2.0 1.0000 20
7987 0.0 1.0000 20
8165 0.0 1.0000 20
8236 0.0 1.0000 20
8264 0.0 1.0000 20
8333 2.0 1.0000 20
8884 1.0 0.6612 20
8947 2.0 1.0000 20
8951 0.0 0.6752 20
9028 1.0 0.6849 20
9225 2.0 1.0000 20
9249 1.0 0.3542 20
11400 1.0 1.0000 20
No. of records with gender 0 in cluster 20 is 15
No. of records with gender 1 in cluster 20 is 14
No. of records with gender 2 in cluster 20 is 19
Records found in cluster 21 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
2445 1.0 1.0000 21
4210 2.0 1.0000 21
4595 1.0 1.0000 21
4621 1.0 1.0000 21
4685 2.0 1.0000 21
... ... ... ...
15313 2.0 1.0000 21
15316 2.0 1.0000 21
15322 0.0 1.0000 21
15324 2.0 0.6344 21
15338 1.0 0.6791 21
[124 rows x 3 columns]
No. of records with gender 0 in cluster 21 is 28
No. of records with gender 1 in cluster 21 is 34
No. of records with gender 2 in cluster 21 is 62
Records found in cluster 22 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
3385 1.0 1.0000 22
3386 1.0 0.6628 22
3388 2.0 1.0000 22
3391 0.0 0.6612 22
3393 1.0 1.0000 22
3394 1.0 1.0000 22
3396 1.0 1.0000 22
3397 0.0 1.0000 22
3398 2.0 1.0000 22
3400 1.0 0.6727 22
3401 2.0 1.0000 22
3402 0.0 1.0000 22
3406 0.0 0.6819 22
3407 1.0 1.0000 22
3411 0.0 1.0000 22
3412 1.0 1.0000 22
3413 1.0 0.7023 22
No. of records with gender 0 in cluster 22 is 5
No. of records with gender 1 in cluster 22 is 9
No. of records with gender 2 in cluster 22 is 3
Records found in cluster 23 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
3581 0.0 1.0000 23
3705 2.0 0.6581 23
3809 2.0 1.0000 23
3906 1.0 0.6422 23
4041 0.0 1.0000 23
4108 2.0 1.0000 23
4111 2.0 1.0000 23
4113 1.0 1.0000 23
4114 1.0 1.0000 23
4116 1.0 1.0000 23
4117 0.0 1.0000 23
4121 0.0 1.0000 23
4134 0.0 0.6692 23
4135 0.0 0.3619 23
4136 2.0 1.0000 23
4137 1.0 1.0000 23
4138 0.0 1.0000 23
4152 2.0 1.0000 23
4153 0.0 1.0000 23
4154 1.0 1.0000 23
4156 1.0 1.0000 23
4272 2.0 1.0000 23
4341 0.0 1.0000 23
4410 2.0 1.0000 23
4508 1.0 1.0000 23
4631 2.0 1.0000 23
4736 2.0 1.0000 23
4840 2.0 1.0000 23
5305 1.0 1.0000 23
No. of records with gender 0 in cluster 23 is 9
No. of records with gender 1 in cluster 23 is 9
No. of records with gender 2 in cluster 23 is 11
Records found in cluster 24 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
3744 0.0 0.6440 24
3927 0.0 1.0000 24
3994 1.0 1.0000 24
4057 2.0 0.3516 24
4300 2.0 0.6736 24
4398 1.0 1.0000 24
4470 2.0 0.6602 24
4544 0.0 1.0000 24
4640 2.0 1.0000 24
4800 2.0 0.6575 24
4883 2.0 1.0000 24
5043 1.0 1.0000 24
5238 1.0 1.0000 24
5325 1.0 0.6645 24
5515 2.0 1.0000 24
5659 1.0 1.0000 24
5978 2.0 1.0000 24
6188 2.0 0.6748 24
6440 2.0 1.0000 24
6562 0.0 1.0000 24
6671 2.0 1.0000 24
6749 1.0 1.0000 24
6826 2.0 0.6933 24
7050 0.0 0.6736 24
No. of records with gender 0 in cluster 24 is 5
No. of records with gender 1 in cluster 24 is 7
No. of records with gender 2 in cluster 24 is 12
Records found in cluster 25 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4012 1.0 1.0000 25
4097 0.0 0.6706 25
4177 0.0 0.6729 25
4219 0.0 1.0000 25
4226 2.0 1.0000 25
... ... ... ...
5777 2.0 0.6638 25
5809 0.0 1.0000 25
5849 0.0 0.6792 25
5881 2.0 1.0000 25
5910 0.0 0.6787 25
[94 rows x 3 columns]
No. of records with gender 0 in cluster 25 is 33
No. of records with gender 1 in cluster 25 is 23
No. of records with gender 2 in cluster 25 is 38
Records found in cluster 26 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4498 0.0 1.0000 26
6783 2.0 1.0000 26
10814 0.0 1.0000 26
14468 1.0 1.0000 26
14630 1.0 1.0000 26
14664 2.0 1.0000 26
14804 1.0 1.0000 26
15040 1.0 1.0000 26
15267 1.0 0.6608 26
16204 1.0 1.0000 26
No. of records with gender 0 in cluster 26 is 2
No. of records with gender 1 in cluster 26 is 6
No. of records with gender 2 in cluster 26 is 2
Records found in cluster 27 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4572 1.0 1.0000 27
4606 0.0 1.0000 27
4627 2.0 1.0000 27
4690 0.0 0.6763 27
4746 1.0 1.0000 27
... ... ... ...
8052 0.0 0.7050 27
8391 0.0 1.0000 27
8411 1.0 1.0000 27
18789 0.0 1.0000 27
18803 1.0 1.0000 27
[148 rows x 3 columns]
No. of records with gender 0 in cluster 27 is 46
No. of records with gender 1 in cluster 27 is 36
No. of records with gender 2 in cluster 27 is 66
Records found in cluster 28 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4772 2.0 1.0000 28
4789 2.0 1.0000 28
4853 0.0 1.0000 28
4917 1.0 0.6571 28
4949 2.0 1.0000 28
... ... ... ...
9206 2.0 0.3398 28
9215 1.0 0.6818 28
9253 2.0 1.0000 28
9278 1.0 1.0000 28
9294 0.0 1.0000 28
[127 rows x 3 columns]
No. of records with gender 0 in cluster 28 is 31
No. of records with gender 1 in cluster 28 is 31
No. of records with gender 2 in cluster 28 is 65
Records found in cluster 29 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4965 2.0 0.6695 29
5384 2.0 1.0000 29
5485 2.0 1.0000 29
5683 2.0 1.0000 29
5800 1.0 1.0000 29
7510 0.0 1.0000 29
8081 2.0 1.0000 29
8479 2.0 0.3625 29
8557 0.0 1.0000 29
8655 1.0 1.0000 29
8987 2.0 1.0000 29
9070 0.0 1.0000 29
9289 2.0 1.0000 29
9313 2.0 0.6841 29
10058 2.0 1.0000 29
10070 1.0 1.0000 29
10084 0.0 1.0000 29
10092 1.0 1.0000 29
10102 2.0 1.0000 29
10116 2.0 1.0000 29
10131 0.0 1.0000 29
10143 2.0 1.0000 29
10167 1.0 0.3495 29
11175 0.0 1.0000 29
No. of records with gender 0 in cluster 29 is 6
No. of records with gender 1 in cluster 29 is 5
No. of records with gender 2 in cluster 29 is 13
Records found in cluster 30 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
4995 2.0 1.0000 30
5372 2.0 1.0000 30
5627 2.0 0.6559 30
5919 2.0 1.0000 30
6208 1.0 0.6543 30
6496 2.0 0.6716 30
7060 1.0 0.6890 30
7439 0.0 1.0000 30
7683 1.0 0.6699 30
7894 0.0 1.0000 30
7902 0.0 1.0000 30
8408 0.0 1.0000 30
8933 1.0 1.0000 30
10448 2.0 0.6544 30
No. of records with gender 0 in cluster 30 is 4
No. of records with gender 1 in cluster 30 is 4
No. of records with gender 2 in cluster 30 is 6
Records found in cluster 31 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
5147 1.0 1.0000 31
17729 0.0 1.0000 31
17730 0.0 1.0000 31
17731 2.0 1.0000 31
17733 0.0 1.0000 31
17734 1.0 1.0000 31
17737 0.0 1.0000 31
17739 1.0 1.0000 31
17741 0.0 1.0000 31
17761 0.0 1.0000 31
17766 1.0 1.0000 31
17767 2.0 0.6637 31
17768 1.0 1.0000 31
17823 2.0 1.0000 31
17827 1.0 0.6773 31
17874 2.0 1.0000 31
17875 1.0 1.0000 31
17898 1.0 1.0000 31
17901 0.0 0.6676 31
17928 0.0 1.0000 31
17931 0.0 1.0000 31
17932 1.0 1.0000 31
17933 1.0 0.6807 31
17956 0.0 1.0000 31
17962 2.0 1.0000 31
17969 0.0 1.0000 31
17974 0.0 1.0000 31
17975 1.0 1.0000 31
17991 1.0 1.0000 31
18042 1.0 1.0000 31
18047 1.0 1.0000 31
18049 2.0 1.0000 31
18052 0.0 0.6660 31
18055 1.0 1.0000 31
18057 1.0 0.6557 31
18062 1.0 1.0000 31
18117 0.0 0.6664 31
18118 0.0 1.0000 31
18124 0.0 1.0000 31
18170 0.0 1.0000 31
18175 1.0 1.0000 31
18215 2.0 0.6545 31
18218 1.0 1.0000 31
18228 1.0 1.0000 31
18229 0.0 0.6827 31
18230 2.0 1.0000 31
18231 0.0 1.0000 31
18233 1.0 0.3352 31
18236 0.0 1.0000 31
18354 0.0 1.0000 31
18368 1.0 1.0000 31
18371 2.0 1.0000 31
18373 0.0 1.0000 31
18374 1.0 1.0000 31
No. of records with gender 0 in cluster 31 is 22
No. of records with gender 1 in cluster 31 is 23
No. of records with gender 2 in cluster 31 is 9
Records found in cluster 32 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
5206 1.0 1.0000 32
5629 2.0 1.0000 32
5640 0.0 1.0000 32
5944 1.0 1.0000 32
6093 1.0 0.6653 32
6157 2.0 0.6567 32
6174 2.0 0.6619 32
6409 0.0 1.0000 32
6514 1.0 1.0000 32
13356 1.0 1.0000 32
No. of records with gender 0 in cluster 32 is 2
No. of records with gender 1 in cluster 32 is 5
No. of records with gender 2 in cluster 32 is 3
Records found in cluster 33 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
5665 1.0 1.0000 33
6109 0.0 1.0000 33
6206 1.0 1.0000 33
6381 1.0 1.0000 33
6390 1.0 1.0000 33
6502 0.0 1.0000 33
6576 2.0 1.0000 33
6580 2.0 1.0000 33
6664 2.0 1.0000 33
6685 2.0 1.0000 33
6789 2.0 1.0000 33
6858 1.0 1.0000 33
6876 0.0 1.0000 33
6992 2.0 1.0000 33
7040 1.0 1.0000 33
7043 0.0 1.0000 33
7065 2.0 1.0000 33
7109 1.0 1.0000 33
7148 0.0 0.6750 33
7273 1.0 1.0000 33
7399 0.0 0.3272 33
7421 2.0 0.6802 33
7430 2.0 0.6812 33
7440 0.0 1.0000 33
7581 1.0 1.0000 33
7586 0.0 1.0000 33
7611 0.0 0.6666 33
7614 2.0 0.6866 33
7622 2.0 1.0000 33
7626 0.0 1.0000 33
7655 1.0 1.0000 33
7669 2.0 1.0000 33
7679 1.0 1.0000 33
7705 1.0 1.0000 33
7757 2.0 1.0000 33
7793 0.0 0.6691 33
7817 0.0 1.0000 33
7820 0.0 1.0000 33
7827 2.0 0.3472 33
7888 2.0 0.6506 33
7897 0.0 0.6803 33
7959 0.0 0.6823 33
8033 0.0 0.6701 33
8055 0.0 1.0000 33
8062 1.0 1.0000 33
8118 1.0 1.0000 33
8177 2.0 1.0000 33
8251 0.0 0.6624 33
8358 2.0 0.6965 33
8385 1.0 1.0000 33
8466 0.0 1.0000 33
8470 1.0 1.0000 33
No. of records with gender 0 in cluster 33 is 19
No. of records with gender 1 in cluster 33 is 16
No. of records with gender 2 in cluster 33 is 17
Records found in cluster 34 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
5853 2.0 0.6619 34
6244 2.0 1.0000 34
8255 2.0 0.6672 34
9773 0.0 0.6607 34
10211 1.0 1.0000 34
10698 1.0 0.6795 34
12736 1.0 0.6619 34
14216 1.0 1.0000 34
14307 2.0 0.6617 34
15333 1.0 1.0000 34
15424 0.0 0.6608 34
15800 1.0 1.0000 34
16873 1.0 1.0000 34
17596 1.0 1.0000 34
18337 1.0 1.0000 34
No. of records with gender 0 in cluster 34 is 2
No. of records with gender 1 in cluster 34 is 9
No. of records with gender 2 in cluster 34 is 4
Records found in cluster 35 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
6080 1.0 1.0000 35
7002 2.0 1.0000 35
7016 0.0 1.0000 35
7091 1.0 0.6642 35
7095 2.0 1.0000 35
... ... ... ...
9150 1.0 1.0000 35
9165 0.0 1.0000 35
9216 2.0 0.6519 35
9221 2.0 1.0000 35
9243 0.0 0.3506 35
[62 rows x 3 columns]
No. of records with gender 0 in cluster 35 is 13
No. of records with gender 1 in cluster 35 is 20
No. of records with gender 2 in cluster 35 is 29
Records found in cluster 36 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
7289 0.0 1.0000 36
12796 1.0 1.0000 36
13303 1.0 1.0000 36
13417 1.0 1.0000 36
13502 1.0 1.0000 36
13716 1.0 0.6830 36
13901 2.0 0.6611 36
14140 0.0 0.6645 36
14214 2.0 1.0000 36
14269 2.0 0.6868 36
14337 1.0 1.0000 36
14412 1.0 1.0000 36
14483 0.0 1.0000 36
14645 1.0 1.0000 36
15443 2.0 1.0000 36
15534 0.0 1.0000 36
15807 0.0 1.0000 36
15916 1.0 1.0000 36
16188 1.0 1.0000 36
16418 2.0 1.0000 36
16672 1.0 1.0000 36
16725 1.0 1.0000 36
17269 0.0 1.0000 36
17351 1.0 0.6556 36
17442 1.0 1.0000 36
17842 0.0 1.0000 36
18412 2.0 0.6690 36
18510 1.0 1.0000 36
18731 1.0 1.0000 36
18738 2.0 1.0000 36
No. of records with gender 0 in cluster 36 is 7
No. of records with gender 1 in cluster 36 is 16
No. of records with gender 2 in cluster 36 is 7
Records found in cluster 37 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
7381 2.0 1.0000 37
7470 1.0 0.6810 37
7542 0.0 1.0000 37
7616 2.0 0.6675 37
7675 2.0 1.0000 37
7744 2.0 0.6761 37
7795 1.0 0.6602 37
7871 2.0 1.0000 37
7946 1.0 1.0000 37
8010 1.0 1.0000 37
8069 1.0 1.0000 37
8125 1.0 1.0000 37
8180 1.0 0.6850 37
8253 2.0 1.0000 37
8395 1.0 1.0000 37
8477 1.0 1.0000 37
8532 1.0 1.0000 37
8587 2.0 1.0000 37
8657 1.0 1.0000 37
8755 0.0 0.6707 37
8810 0.0 1.0000 37
8906 1.0 0.7047 37
8977 1.0 1.0000 37
9039 1.0 1.0000 37
9101 0.0 0.3496 37
9172 0.0 1.0000 37
9247 2.0 0.6622 37
9317 0.0 1.0000 37
17122 2.0 0.6583 37
No. of records with gender 0 in cluster 37 is 6
No. of records with gender 1 in cluster 37 is 14
No. of records with gender 2 in cluster 37 is 9
Records found in cluster 38 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
9515 0.0 0.6648 38
10396 1.0 1.0000 38
10608 1.0 1.0000 38
10796 0.0 0.6912 38
10981 0.0 1.0000 38
11477 2.0 1.0000 38
11770 2.0 1.0000 38
12451 2.0 1.0000 38
12803 1.0 0.6667 38
12996 1.0 1.0000 38
13263 2.0 0.6743 38
13436 0.0 1.0000 38
14141 0.0 1.0000 38
14290 0.0 1.0000 38
14473 0.0 1.0000 38
14878 2.0 0.6502 38
15088 0.0 0.6581 38
15727 2.0 1.0000 38
16605 0.0 0.6578 38
16973 0.0 1.0000 38
17197 1.0 1.0000 38
17330 0.0 1.0000 38
17728 1.0 0.6702 38
18071 2.0 1.0000 38
18531 2.0 1.0000 38
No. of records with gender 0 in cluster 38 is 11
No. of records with gender 1 in cluster 38 is 6
No. of records with gender 2 in cluster 38 is 8
Records found in cluster 39 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
9856 2.0 1.0000 39
10008 0.0 1.0000 39
10075 1.0 1.0000 39
10150 1.0 1.0000 39
10237 2.0 1.0000 39
10318 2.0 1.0000 39
10385 1.0 0.3592 39
10471 1.0 1.0000 39
10633 2.0 0.6545 39
10716 0.0 0.6794 39
10776 0.0 1.0000 39
10849 2.0 1.0000 39
10964 2.0 1.0000 39
11050 0.0 1.0000 39
11118 2.0 0.6666 39
11190 1.0 1.0000 39
11251 1.0 0.6715 39
11356 2.0 1.0000 39
11429 2.0 1.0000 39
11502 2.0 1.0000 39
11590 0.0 1.0000 39
11653 0.0 1.0000 39
11767 2.0 1.0000 39
11842 1.0 1.0000 39
11930 1.0 1.0000 39
12045 1.0 1.0000 39
12132 1.0 0.6858 39
12195 0.0 0.6564 39
12284 1.0 1.0000 39
12397 0.0 1.0000 39
12507 2.0 1.0000 39
12659 2.0 1.0000 39
12754 2.0 0.6615 39
No. of records with gender 0 in cluster 39 is 8
No. of records with gender 1 in cluster 39 is 11
No. of records with gender 2 in cluster 39 is 14
Records found in cluster 40 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
10710 1.0 1.0000 40
11127 2.0 1.0000 40
11929 0.0 1.0000 40
12857 0.0 1.0000 40
12921 1.0 1.0000 40
12962 1.0 1.0000 40
13047 2.0 0.6509 40
13110 0.0 0.6414 40
13132 0.0 0.6527 40
13159 2.0 1.0000 40
13221 1.0 1.0000 40
13254 0.0 0.6618 40
13289 0.0 0.6711 40
17886 2.0 1.0000 40
No. of records with gender 0 in cluster 40 is 6
No. of records with gender 1 in cluster 40 is 4
No. of records with gender 2 in cluster 40 is 4
Records found in cluster 41 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
10888 1.0 1.0000 41
12233 2.0 0.3374 41
13608 1.0 1.0000 41
14053 0.0 1.0000 41
14500 2.0 0.3449 41
15128 1.0 1.0000 41
15717 2.0 1.0000 41
16776 2.0 1.0000 41
No. of records with gender 0 in cluster 41 is 1
No. of records with gender 1 in cluster 41 is 3
No. of records with gender 2 in cluster 41 is 4
Records found in cluster 42 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
16388 2.0 1.0000 42
17041 1.0 1.0000 42
17154 1.0 1.0000 42
17297 0.0 1.0000 42
17565 1.0 1.0000 42
17677 1.0 1.0000 42
17868 2.0 0.3354 42
18092 0.0 1.0000 42
18246 1.0 1.0000 42
18399 0.0 1.0000 42
18527 1.0 1.0000 42
18646 0.0 1.0000 42
18759 0.0 0.6386 42
No. of records with gender 0 in cluster 42 is 5
No. of records with gender 1 in cluster 42 is 6
No. of records with gender 2 in cluster 42 is 2
Records found in cluster 43 from DBSCAN in Exp 1
gender gender:confidence Cluster_Label
17732 2.0 0.3417 43
17735 0.0 1.0000 43
17736 0.0 1.0000 43
17738 1.0 1.0000 43
17740 1.0 1.0000 43
... ... ... ...
18367 2.0 1.0000 43
18369 2.0 1.0000 43
18370 0.0 0.6591 43
18372 2.0 1.0000 43
18375 0.0 1.0000 43
[98 rows x 3 columns]
No. of records with gender 0 in cluster 43 is 44
No. of records with gender 1 in cluster 43 is 35
No. of records with gender 2 in cluster 43 is 19
Records classified as noise
Empty DataFrame
Columns: [gender, gender:confidence, Cluster_Label]
Index: []
==================================================
EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES
==================================================
Data with Only Numerical and Categorical Features
<class 'pandas.core.frame.DataFrame'>
Index: 19970 entries, 0 to 18833
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 retweets_per_day 18836 non-null float64
1 favorites_per_day 18836 non-null float64
2 tweets_per_day 18836 non-null float64
3 profile_created_year 18836 non-null float64
4 tweet_created_year 18836 non-null float64
5 tweet_location_encoded 18836 non-null float64
6 user_timezone_encoded 18836 non-null float64
7 gender 18836 non-null float64
8 gender:confidence 18836 non-null float64
dtypes: float64(9)
memory usage: 1.5 MB
None
Removing NaN values...
Dropping gender and gender:confidence...
Dataset for Exp 2
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 retweets_per_day 17702 non-null float64
1 favorites_per_day 17702 non-null float64
2 tweets_per_day 17702 non-null float64
3 profile_created_year 17702 non-null float64
4 tweet_created_year 17702 non-null float64
5 tweet_location_encoded 17702 non-null float64
6 user_timezone_encoded 17702 non-null float64
dtypes: float64(7)
memory usage: 1.1 MB
None
retweets_per_day favorites_per_day tweets_per_day profile_created_year \
0 -0.100504 -0.318861 1.467429 0.497680
1 -0.100504 -0.313379 -0.582882 0.028171
2 9.949874 0.437997 -0.593862 0.967189
3 -0.100504 -0.306100 -0.691862 -1.380358
4 -0.100504 3.133457 -0.075048 0.967189
tweet_created_year tweet_location_encoded user_timezone_encoded
0 0.0 0.000053 0.001699
1 0.0 0.363294 0.127309
2 0.0 0.000053 0.002071
3 0.0 0.000159 0.105755
4 0.0 0.363294 0.381344
Applying UMAP for dim reduction...
[I 2024-09-20 16:23:48,010] A new study created in memory with name: no-name-f9550dca-d1e6-48ba-a676-9e841b5672d2
(17702, 3) Performing K-Means Clustering...
[I 2024-09-20 16:23:51,598] Trial 0 finished with value: 0.3289636639097889 and parameters: {'n_clusters': 7, 'init': 'k-means++'}. Best is trial 0 with value: 0.3289636639097889.
[I 2024-09-20 16:23:55,202] Trial 1 finished with value: 0.37002461668723785 and parameters: {'n_clusters': 3, 'init': 'k-means++'}. Best is trial 1 with value: 0.37002461668723785.
[I 2024-09-20 16:23:58,694] Trial 2 finished with value: 0.34236426949671833 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 1 with value: 0.37002461668723785.
[I 2024-09-20 16:24:02,334] Trial 3 finished with value: 0.42740275911790543 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 3 with value: 0.42740275911790543.
[I 2024-09-20 16:24:05,980] Trial 4 finished with value: 0.3327138746593811 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 3 with value: 0.42740275911790543.
[I 2024-09-20 16:24:09,807] Trial 5 finished with value: 0.43672661408383706 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 5 with value: 0.43672661408383706.
[I 2024-09-20 16:24:13,417] Trial 6 finished with value: 0.39882111670484854 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 5 with value: 0.43672661408383706.
[I 2024-09-20 16:24:17,670] Trial 7 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:21,464] Trial 8 finished with value: 0.4278636679375091 and parameters: {'n_clusters': 5, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:25,371] Trial 9 finished with value: 0.39882111670484854 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:29,537] Trial 10 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:33,153] Trial 11 finished with value: 0.3527991920640622 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:37,183] Trial 12 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:41,210] Trial 13 finished with value: 0.7076370412066645 and parameters: {'n_clusters': 2, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
[I 2024-09-20 16:24:44,823] Trial 14 finished with value: 0.35844394490438636 and parameters: {'n_clusters': 10, 'init': 'random'}. Best is trial 7 with value: 0.7076370412066645.
Best params: {'n_clusters': 2, 'init': 'random'}
[I 2024-09-20 16:24:49,788] A new study created in memory with name: no-name-9347cc24-2c6f-4ad3-b792-5f023401ccdb
The Silhouette score is 0.7076370412066645
The Callinski index is 4482.755124226919
Dataset with Labels from KMeans in Exp 2
gender gender:confidence Cluster_Label
0 0.0 1.0000 1
1 0.0 1.0000 1
2 0.0 0.6625 0
3 0.0 1.0000 1
4 1.0 1.0000 1
Records found in cluster 0 from KMeans in Exp 2
gender gender:confidence Cluster_Label
2 0.0 0.6625 0
257 1.0 1.0000 0
286 2.0 1.0000 0
392 2.0 0.6576 0
429 1.0 1.0000 0
... ... ... ...
18649 0.0 1.0000 0
18720 0.0 1.0000 0
18765 1.0 1.0000 0
18784 2.0 1.0000 0
18796 0.0 0.6760 0
[371 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 135
No. of records with gender 1 in cluster 0 is 103
No. of records with gender 2 in cluster 0 is 133
Records found in cluster 1 from KMeans in Exp 2
gender gender:confidence Cluster_Label
0 0.0 1.0000 1
1 0.0 1.0000 1
3 0.0 1.0000 1
4 1.0 1.0000 1
5 1.0 1.0000 1
... ... ... ...
18829 1.0 1.0000 1
18831 0.0 0.6466 1
18832 1.0 1.0000 1
18834 1.0 1.0000 1
18835 0.0 0.6772 1
[17331 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 5708
No. of records with gender 1 in cluster 1 is 6098
No. of records with gender 2 in cluster 1 is 5525
Performing DBSCAN Clustering...
[I 2024-09-20 16:24:59,626] Trial 0 finished with value: 0.7395551461504506 and parameters: {'eps': 1.8612774559273246, 'min_samples': 17}. Best is trial 0 with value: 0.7395551461504506.
[I 2024-09-20 16:25:09,593] Trial 1 finished with value: 0.7395551461504506 and parameters: {'eps': 1.8222959595129316, 'min_samples': 16}. Best is trial 0 with value: 0.7395551461504506.
[I 2024-09-20 16:25:19,610] Trial 2 finished with value: 0.7286394402690954 and parameters: {'eps': 1.4969908447691442, 'min_samples': 15}. Best is trial 0 with value: 0.7395551461504506.
[I 2024-09-20 16:25:27,226] Trial 3 finished with value: 0.5245068941307643 and parameters: {'eps': 0.572252884608168, 'min_samples': 14}. Best is trial 0 with value: 0.7395551461504506.
[I 2024-09-20 16:25:37,404] Trial 4 finished with value: 0.7535863974003295 and parameters: {'eps': 1.9348801547784897, 'min_samples': 14}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:25:47,748] Trial 5 finished with value: 0.7378543022519933 and parameters: {'eps': 1.6937724424520773, 'min_samples': 17}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:25:57,605] Trial 6 finished with value: 0.7283183555413514 and parameters: {'eps': 1.8007992794733476, 'min_samples': 5}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:26:05,851] Trial 7 finished with value: 0.41821414951083047 and parameters: {'eps': 0.7293348370290341, 'min_samples': 7}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:26:14,792] Trial 8 finished with value: 0.7014382428001366 and parameters: {'eps': 1.0030585304662154, 'min_samples': 9}. Best is trial 4 with value: 0.7535863974003295.
[I 2024-09-20 16:26:24,590] Trial 9 finished with value: 0.7561451190368602 and parameters: {'eps': 1.871623880808503, 'min_samples': 8}. Best is trial 9 with value: 0.7561451190368602.
[I 2024-09-20 16:26:33,948] Trial 10 finished with value: 0.5090875645314461 and parameters: {'eps': 1.2834097282566614, 'min_samples': 3}. Best is trial 9 with value: 0.7561451190368602.
[I 2024-09-20 16:26:43,291] Trial 11 finished with value: 0.7175191411766557 and parameters: {'eps': 1.2828100669180271, 'min_samples': 11}. Best is trial 9 with value: 0.7561451190368602.
[I 2024-09-20 16:26:53,530] Trial 12 finished with value: 0.7582461095915987 and parameters: {'eps': 1.9873182476645224, 'min_samples': 12}. Best is trial 12 with value: 0.7582461095915987.
[I 2024-09-20 16:26:58,050] Trial 13 finished with value: -0.4771117854083832 and parameters: {'eps': 0.133931940094827, 'min_samples': 11}. Best is trial 12 with value: 0.7582461095915987.
[I 2024-09-20 16:27:07,751] Trial 14 finished with value: 0.727389258590315 and parameters: {'eps': 1.5756102900269209, 'min_samples': 20}. Best is trial 12 with value: 0.7582461095915987.
Found best params: {'eps': 1.9873182476645224, 'min_samples': 12}
The Silhouette score is 0.7582461095915987
The Callinski index is 336.17121436944564
Dataset with Labels from DBSCAN in Exp 2
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from DBSCAN in Exp 2
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[17677 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5832
No. of records with gender 1 in cluster 0 is 6195
No. of records with gender 2 in cluster 0 is 5650
Records classified as noise
gender gender:confidence Cluster_Label
1116 2.0 1.0000 -1
2115 0.0 1.0000 -1
2502 0.0 0.6785 -1
2869 2.0 0.6489 -1
3301 0.0 1.0000 -1
4127 2.0 1.0000 -1
4150 1.0 1.0000 -1
5613 1.0 1.0000 -1
6722 1.0 1.0000 -1
7666 2.0 1.0000 -1
9210 0.0 1.0000 -1
10926 0.0 0.6513 -1
12010 0.0 1.0000 -1
12504 0.0 1.0000 -1
12668 0.0 1.0000 -1
13204 1.0 1.0000 -1
13331 1.0 1.0000 -1
13788 1.0 1.0000 -1
14567 2.0 1.0000 -1
15940 0.0 1.0000 -1
16326 2.0 0.3515 -1
17960 0.0 1.0000 -1
18012 0.0 1.0000 -1
18585 2.0 1.0000 -1
18763 2.0 1.0000 -1
==================================================
EXP 3: USING ONLY TEXT FEATURES
==================================================
Dataset for Exp 3
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Columns: 3000 entries, desc_0 to text_1499
dtypes: float64(3000)
memory usage: 405.3 MB
None
desc_0 desc_1 desc_2 desc_3 desc_4 desc_5 desc_6 desc_7 desc_8 \
0 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 -0.142028 0.0 0.0 0.0 0.0 0.0
desc_9 ... text_1490 text_1491 text_1492 text_1493 text_1494 \
0 0.0 ... -0.142855 0.0 0.0 0.0 0.0
1 0.0 ... -0.142855 0.0 0.0 0.0 0.0
2 0.0 ... -0.142855 0.0 0.0 0.0 0.0
3 0.0 ... -0.142855 0.0 0.0 0.0 0.0
4 0.0 ... -0.142855 0.0 0.0 0.0 0.0
text_1495 text_1496 text_1497 text_1498 text_1499
0 -0.142733 -0.100504 0.0 0.0 0.0
1 -0.142733 -0.100504 0.0 0.0 0.0
2 -0.142733 -0.100504 0.0 0.0 0.0
3 -0.142733 -0.100504 0.0 0.0 0.0
4 -0.142733 -0.100504 0.0 0.0 0.0
[5 rows x 3000 columns]
Applying UMAP for dim reduction...
[I 2024-09-20 16:30:24,130] A new study created in memory with name: no-name-36a03b0d-863a-4fa2-a02e-19e458c477ae
Performing K-Means Clustering...
[I 2024-09-20 16:30:29,152] Trial 0 finished with value: 0.3791390061378479 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 0 with value: 0.3791390061378479.
[I 2024-09-20 16:30:34,447] Trial 1 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396.
[I 2024-09-20 16:30:39,434] Trial 2 finished with value: 0.37424296140670776 and parameters: {'n_clusters': 9, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396.
[I 2024-09-20 16:30:44,536] Trial 3 finished with value: 0.3678858280181885 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 1 with value: 0.7161176204681396.
[I 2024-09-20 16:30:49,831] Trial 4 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:30:54,927] Trial 5 finished with value: 0.4103359282016754 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:00,242] Trial 6 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:05,516] Trial 7 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:10,792] Trial 8 finished with value: 0.7161176204681396 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:15,863] Trial 9 finished with value: 0.3678858280181885 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:21,412] Trial 10 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:26,834] Trial 11 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:32,286] Trial 12 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:37,423] Trial 13 finished with value: 0.35118553042411804 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
[I 2024-09-20 16:31:42,827] Trial 14 finished with value: 0.718830943107605 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 4 with value: 0.718830943107605.
Best params: {'n_clusters': 4, 'init': 'random'}
The Silhouette score is 0.718830943107605
The Callinski index is 10019.619140625
Dataset with Labels from KMeans in Exp 3
gender gender:confidence Cluster_Label
0 0.0 1.0000 2
1 0.0 1.0000 2
2 0.0 0.6625 2
3 0.0 1.0000 2
4 1.0 1.0000 2
Records found in cluster 0 from KMeans in Exp 3
gender gender:confidence Cluster_Label
42 2.0 1.0000 0
62 1.0 1.0000 0
166 0.0 1.0000 0
173 2.0 1.0000 0
190 2.0 0.6780 0
... ... ... ...
18624 1.0 1.0000 0
18654 0.0 1.0000 0
18656 1.0 1.0000 0
18673 0.0 1.0000 0
18722 1.0 0.3371 0
[865 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 307
No. of records with gender 1 in cluster 0 is 272
No. of records with gender 2 in cluster 0 is 286
Records found in cluster 1 from KMeans in Exp 3
gender gender:confidence Cluster_Label
113 1.0 1.0000 1
230 1.0 0.6755 1
502 0.0 1.0000 1
578 1.0 1.0000 1
644 0.0 1.0000 1
... ... ... ...
17448 0.0 1.0000 1
18208 0.0 1.0000 1
18679 1.0 1.0000 1
18753 0.0 0.6678 1
18824 2.0 1.0000 1
[469 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 122
No. of records with gender 1 in cluster 1 is 146
No. of records with gender 2 in cluster 1 is 201
Records found in cluster 2 from KMeans in Exp 3
gender gender:confidence Cluster_Label
0 0.0 1.0000 2
1 0.0 1.0000 2
2 0.0 0.6625 2
3 0.0 1.0000 2
4 1.0 1.0000 2
... ... ... ...
18829 1.0 1.0000 2
18831 0.0 0.6466 2
18832 1.0 1.0000 2
18834 1.0 1.0000 2
18835 0.0 0.6772 2
[15810 rows x 3 columns]
No. of records with gender 0 in cluster 2 is 5260
No. of records with gender 1 in cluster 2 is 5607
No. of records with gender 2 in cluster 2 is 4943
Records found in cluster 3 from KMeans in Exp 3
[I 2024-09-20 16:31:50,732] A new study created in memory with name: no-name-3c57886b-d112-4b57-9312-be650d49f11f
gender gender:confidence Cluster_Label 261 1.0 1.0 3 336 0.0 1.0 3 575 0.0 1.0 3 929 1.0 1.0 3 1172 0.0 1.0 3 ... ... ... ... 18510 1.0 1.0 3 18609 1.0 1.0 3 18731 1.0 1.0 3 18738 2.0 1.0 3 18764 1.0 1.0 3 [558 rows x 3 columns] No. of records with gender 0 in cluster 3 is 154 No. of records with gender 1 in cluster 3 is 176 No. of records with gender 2 in cluster 3 is 228 Performing DBSCAN Clustering...
[I 2024-09-20 16:31:56,283] Trial 0 finished with value: 0.07267794013023376 and parameters: {'eps': 0.224350065881816, 'min_samples': 12}. Best is trial 0 with value: 0.07267794013023376.
[I 2024-09-20 16:32:03,308] Trial 1 finished with value: 0.5760640501976013 and parameters: {'eps': 1.981346472478528, 'min_samples': 13}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:09,091] Trial 2 finished with value: 0.456826776266098 and parameters: {'eps': 0.5994244424252012, 'min_samples': 10}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:15,888] Trial 3 finished with value: 0.5654299259185791 and parameters: {'eps': 1.6306995770185833, 'min_samples': 12}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:21,989] Trial 4 finished with value: 0.40619421005249023 and parameters: {'eps': 0.9888580285943404, 'min_samples': 7}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:27,401] Trial 5 finished with value: 0.0672125294804573 and parameters: {'eps': 0.2212632309988171, 'min_samples': 15}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:33,226] Trial 6 finished with value: 0.3273886442184448 and parameters: {'eps': 0.8245746726811352, 'min_samples': 5}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:39,816] Trial 7 finished with value: 0.5654299259185791 and parameters: {'eps': 1.5980627491197157, 'min_samples': 11}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:46,496] Trial 8 finished with value: 0.5481722950935364 and parameters: {'eps': 1.6967788919940698, 'min_samples': 10}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:32:53,073] Trial 9 finished with value: 0.45212018489837646 and parameters: {'eps': 1.5229068478428347, 'min_samples': 9}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:00,076] Trial 10 finished with value: 0.5555833578109741 and parameters: {'eps': 1.9667767292648453, 'min_samples': 20}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:06,982] Trial 11 finished with value: 0.5760640501976013 and parameters: {'eps': 1.9662951357354617, 'min_samples': 14}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:13,834] Trial 12 finished with value: 0.5699435472488403 and parameters: {'eps': 1.964896277269623, 'min_samples': 16}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:20,152] Trial 13 finished with value: 0.563732922077179 and parameters: {'eps': 1.3014195484895565, 'min_samples': 15}. Best is trial 1 with value: 0.5760640501976013.
[I 2024-09-20 16:33:26,409] Trial 14 finished with value: 0.5307610034942627 and parameters: {'eps': 1.271279456944655, 'min_samples': 18}. Best is trial 1 with value: 0.5760640501976013.
Found best params: {'eps': 1.981346472478528, 'min_samples': 13}
The Silhouette score is 0.5760640501976013
The Callinski index is 1357.848876953125
Dataset with Labels from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
Records found in cluster 0 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
0 0.0 1.0000 0
1 0.0 1.0000 0
2 0.0 0.6625 0
3 0.0 1.0000 0
4 1.0 1.0000 0
... ... ... ...
18829 1.0 1.0000 0
18831 0.0 0.6466 0
18832 1.0 1.0000 0
18834 1.0 1.0000 0
18835 0.0 0.6772 0
[15963 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5340
No. of records with gender 1 in cluster 0 is 5665
No. of records with gender 2 in cluster 0 is 4958
Records found in cluster 1 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
42 2.0 1.000 1
190 2.0 0.678 1
211 2.0 1.000 1
252 2.0 1.000 1
255 1.0 1.000 1
... ... ... ...
18546 1.0 1.000 1
18573 0.0 1.000 1
18584 1.0 1.000 1
18624 1.0 1.000 1
18656 1.0 1.000 1
[148 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 44
No. of records with gender 1 in cluster 1 is 52
No. of records with gender 2 in cluster 1 is 52
Records found in cluster 2 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
113 1.0 1.0000 2
6301 2.0 1.0000 2
6302 2.0 1.0000 2
6309 1.0 0.3750 2
6311 2.0 1.0000 2
6318 1.0 1.0000 2
6319 0.0 0.6471 2
6327 2.0 0.6733 2
6332 0.0 1.0000 2
6358 2.0 0.6692 2
6366 2.0 0.6662 2
6373 2.0 1.0000 2
6374 2.0 1.0000 2
6378 0.0 1.0000 2
6381 1.0 1.0000 2
6383 2.0 0.6754 2
6389 0.0 1.0000 2
6390 1.0 1.0000 2
6391 1.0 1.0000 2
6393 2.0 1.0000 2
6397 1.0 1.0000 2
6398 2.0 1.0000 2
6399 1.0 1.0000 2
8850 0.0 1.0000 2
11402 0.0 1.0000 2
12450 1.0 1.0000 2
13813 0.0 1.0000 2
No. of records with gender 0 in cluster 2 is 7
No. of records with gender 1 in cluster 2 is 9
No. of records with gender 2 in cluster 2 is 11
Records found in cluster 3 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
230 1.0 0.6755 3
7500 1.0 1.0000 3
7502 1.0 0.6617 3
7505 0.0 1.0000 3
7507 0.0 0.6848 3
7508 0.0 1.0000 3
7509 1.0 1.0000 3
7510 0.0 1.0000 3
7511 2.0 1.0000 3
7512 1.0 0.6739 3
7513 0.0 1.0000 3
7524 1.0 1.0000 3
7531 2.0 1.0000 3
7532 2.0 1.0000 3
7534 2.0 1.0000 3
7581 1.0 1.0000 3
7586 0.0 1.0000 3
7593 2.0 1.0000 3
7596 0.0 1.0000 3
7598 2.0 1.0000 3
12002 0.0 1.0000 3
No. of records with gender 0 in cluster 3 is 8
No. of records with gender 1 in cluster 3 is 7
No. of records with gender 2 in cluster 3 is 6
Records found in cluster 4 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
261 1.0 1.0000 4
336 0.0 1.0000 4
929 1.0 1.0000 4
1172 0.0 1.0000 4
1455 1.0 0.6678 4
1686 0.0 1.0000 4
3378 2.0 0.6688 4
3521 2.0 1.0000 4
3544 2.0 1.0000 4
5605 1.0 1.0000 4
5611 2.0 0.6856 4
5616 2.0 1.0000 4
5625 0.0 1.0000 4
5626 2.0 0.6589 4
5632 2.0 0.6651 4
5643 0.0 1.0000 4
5644 1.0 0.6725 4
5661 2.0 1.0000 4
5665 1.0 1.0000 4
5669 2.0 1.0000 4
5670 1.0 0.6752 4
5671 2.0 0.3424 4
5672 2.0 1.0000 4
5673 0.0 0.6761 4
5674 1.0 1.0000 4
5675 2.0 1.0000 4
5679 2.0 0.6816 4
5681 1.0 1.0000 4
5683 2.0 1.0000 4
5685 0.0 1.0000 4
5686 2.0 1.0000 4
5687 2.0 0.6799 4
5689 2.0 0.6805 4
5696 0.0 1.0000 4
5697 0.0 0.6892 4
9387 2.0 1.0000 4
10074 1.0 0.6741 4
10109 1.0 1.0000 4
10453 2.0 1.0000 4
10729 2.0 1.0000 4
10792 0.0 1.0000 4
10928 2.0 0.6605 4
11125 0.0 1.0000 4
11755 1.0 1.0000 4
11820 1.0 1.0000 4
13497 2.0 1.0000 4
14339 1.0 1.0000 4
14581 1.0 1.0000 4
14835 1.0 1.0000 4
15275 1.0 1.0000 4
15841 1.0 1.0000 4
15900 1.0 1.0000 4
15985 2.0 1.0000 4
16529 1.0 1.0000 4
17107 0.0 0.6597 4
17846 1.0 1.0000 4
18764 1.0 1.0000 4
No. of records with gender 0 in cluster 4 is 12
No. of records with gender 1 in cluster 4 is 22
No. of records with gender 2 in cluster 4 is 23
Records found in cluster 5 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
264 0.0 1.0000 5
2740 0.0 1.0000 5
4012 1.0 1.0000 5
4097 0.0 0.6706 5
4100 2.0 1.0000 5
... ... ... ...
16986 0.0 1.0000 5
17182 0.0 1.0000 5
18083 0.0 1.0000 5
18789 0.0 1.0000 5
18803 1.0 1.0000 5
[131 rows x 3 columns]
No. of records with gender 0 in cluster 5 is 48
No. of records with gender 1 in cluster 5 is 33
No. of records with gender 2 in cluster 5 is 50
Records found in cluster 6 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
348 1.0 1.0000 6
3814 0.0 1.0000 6
8923 2.0 1.0000 6
8925 0.0 1.0000 6
8927 1.0 1.0000 6
8930 2.0 1.0000 6
8940 2.0 0.6815 6
8943 2.0 1.0000 6
8944 2.0 0.6641 6
8945 0.0 1.0000 6
8947 2.0 1.0000 6
8948 2.0 1.0000 6
8951 0.0 0.6752 6
8952 1.0 0.6734 6
8953 1.0 1.0000 6
8954 2.0 1.0000 6
8965 2.0 1.0000 6
8971 1.0 1.0000 6
8981 1.0 1.0000 6
8987 2.0 1.0000 6
8988 0.0 1.0000 6
8989 1.0 1.0000 6
8990 2.0 1.0000 6
8991 2.0 0.6728 6
8995 2.0 0.6761 6
8997 0.0 1.0000 6
15702 1.0 0.6739 6
16019 0.0 1.0000 6
16293 0.0 1.0000 6
16469 2.0 0.6755 6
No. of records with gender 0 in cluster 6 is 8
No. of records with gender 1 in cluster 6 is 8
No. of records with gender 2 in cluster 6 is 14
Records found in cluster 7 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
431 0.0 0.6631 7
4374 2.0 1.0000 7
4456 1.0 1.0000 7
4653 2.0 1.0000 7
4995 2.0 1.0000 7
5220 2.0 0.6650 7
5372 2.0 1.0000 7
5749 2.0 1.0000 7
6043 2.0 0.6787 7
6172 2.0 1.0000 7
6208 1.0 0.6543 7
6496 2.0 0.6716 7
6669 0.0 1.0000 7
7060 1.0 0.6890 7
7261 0.0 1.0000 7
7439 0.0 1.0000 7
7683 1.0 0.6699 7
7902 0.0 1.0000 7
8120 1.0 1.0000 7
8360 2.0 0.6854 7
8408 0.0 1.0000 7
9100 0.0 1.0000 7
9333 1.0 1.0000 7
10448 2.0 0.6544 7
10820 0.0 0.6635 7
12961 1.0 1.0000 7
13252 1.0 1.0000 7
13603 1.0 1.0000 7
14102 0.0 1.0000 7
14844 0.0 1.0000 7
15017 1.0 1.0000 7
No. of records with gender 0 in cluster 7 is 10
No. of records with gender 1 in cluster 7 is 10
No. of records with gender 2 in cluster 7 is 11
Records found in cluster 8 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
502 0.0 1.0000 8
578 1.0 1.0000 8
644 0.0 1.0000 8
771 0.0 1.0000 8
963 2.0 1.0000 8
1433 1.0 1.0000 8
1881 0.0 0.6691 8
2762 2.0 0.6670 8
2903 1.0 0.6763 8
3308 0.0 0.3364 8
3353 0.0 1.0000 8
3681 2.0 1.0000 8
3830 0.0 1.0000 8
4305 1.0 1.0000 8
5040 0.0 1.0000 8
5479 0.0 0.6857 8
5742 0.0 1.0000 8
6460 2.0 1.0000 8
6862 1.0 1.0000 8
8397 2.0 0.6634 8
8516 2.0 0.6839 8
8918 2.0 1.0000 8
No. of records with gender 0 in cluster 8 is 10
No. of records with gender 1 in cluster 8 is 5
No. of records with gender 2 in cluster 8 is 7
Records found in cluster 9 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
575 0.0 1.0000 9
1308 0.0 0.6479 9
2033 1.0 1.0000 9
2308 1.0 0.6774 9
3898 0.0 1.0000 9
5454 2.0 0.6774 9
5539 1.0 1.0000 9
5628 2.0 1.0000 9
5825 1.0 1.0000 9
5847 2.0 0.6717 9
6012 0.0 1.0000 9
6048 2.0 0.6796 9
6114 1.0 0.6620 9
6335 2.0 1.0000 9
6382 2.0 0.6842 9
6417 2.0 1.0000 9
7843 2.0 1.0000 9
8181 0.0 1.0000 9
8355 2.0 0.6778 9
8738 0.0 1.0000 9
No. of records with gender 0 in cluster 9 is 6
No. of records with gender 1 in cluster 9 is 5
No. of records with gender 2 in cluster 9 is 9
Records found in cluster 10 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
625 1.0 1.0000 10
7101 2.0 1.0000 10
7102 0.0 1.0000 10
7105 0.0 1.0000 10
7109 1.0 1.0000 10
7113 2.0 0.6718 10
7115 0.0 0.3451 10
7123 0.0 1.0000 10
7128 2.0 0.6585 10
7130 2.0 1.0000 10
7136 1.0 0.6835 10
7148 0.0 0.6750 10
7153 1.0 1.0000 10
7158 1.0 1.0000 10
7162 1.0 1.0000 10
7166 2.0 0.6635 10
7176 1.0 1.0000 10
7184 2.0 1.0000 10
No. of records with gender 0 in cluster 10 is 5
No. of records with gender 1 in cluster 10 is 7
No. of records with gender 2 in cluster 10 is 6
Records found in cluster 11 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
780 1.0 1.0000 11
2335 0.0 1.0000 11
4874 0.0 1.0000 11
5800 1.0 1.0000 11
5807 1.0 1.0000 11
5809 0.0 1.0000 11
5810 1.0 1.0000 11
5819 2.0 0.6667 11
5835 2.0 1.0000 11
5838 2.0 1.0000 11
5841 2.0 0.6645 11
5843 0.0 0.6658 11
5846 2.0 1.0000 11
5849 0.0 0.6792 11
5861 2.0 0.6808 11
5862 0.0 1.0000 11
5868 1.0 1.0000 11
5869 1.0 1.0000 11
5870 0.0 0.3441 11
5877 1.0 1.0000 11
5881 2.0 1.0000 11
5883 2.0 0.6725 11
5885 2.0 0.6640 11
5894 1.0 1.0000 11
5898 2.0 0.6675 11
8449 2.0 1.0000 11
10879 0.0 1.0000 11
18679 1.0 1.0000 11
No. of records with gender 0 in cluster 11 is 8
No. of records with gender 1 in cluster 11 is 9
No. of records with gender 2 in cluster 11 is 11
Records found in cluster 12 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
792 2.0 1.0000 12
7804 1.0 1.0000 12
7810 2.0 1.0000 12
7811 2.0 0.6341 12
7817 0.0 1.0000 12
7819 0.0 1.0000 12
7820 0.0 1.0000 12
7821 2.0 1.0000 12
7822 2.0 1.0000 12
7824 2.0 1.0000 12
7825 0.0 1.0000 12
7827 2.0 0.3472 12
7830 1.0 1.0000 12
7882 0.0 1.0000 12
7888 2.0 0.6506 12
7890 2.0 1.0000 12
7892 2.0 1.0000 12
7897 0.0 0.6803 12
7899 1.0 1.0000 12
8203 2.0 1.0000 12
8204 2.0 0.6746 12
8208 2.0 0.6844 12
8236 0.0 1.0000 12
8246 2.0 0.6598 12
8247 1.0 1.0000 12
8250 1.0 1.0000 12
8251 0.0 0.6624 12
8261 1.0 1.0000 12
8264 0.0 1.0000 12
8269 0.0 0.6774 12
8272 2.0 1.0000 12
8284 2.0 0.6691 12
8488 1.0 1.0000 12
13379 0.0 1.0000 12
No. of records with gender 0 in cluster 12 is 11
No. of records with gender 1 in cluster 12 is 7
No. of records with gender 2 in cluster 12 is 16
Records found in cluster 13 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1203 1.0 1.0000 13
1240 1.0 0.6889 13
2115 0.0 1.0000 13
2381 0.0 1.0000 13
3988 2.0 1.0000 13
5994 2.0 0.6611 13
7988 1.0 0.6734 13
8071 1.0 1.0000 13
10735 0.0 1.0000 13
10738 0.0 1.0000 13
11076 2.0 1.0000 13
11179 2.0 1.0000 13
11484 1.0 1.0000 13
11648 1.0 1.0000 13
11746 0.0 1.0000 13
12054 1.0 1.0000 13
13078 0.0 1.0000 13
14056 2.0 1.0000 13
15064 0.0 0.6534 13
15751 1.0 1.0000 13
15757 1.0 1.0000 13
16465 0.0 1.0000 13
16868 1.0 1.0000 13
17448 0.0 1.0000 13
18208 0.0 1.0000 13
18753 0.0 0.6678 13
No. of records with gender 0 in cluster 13 is 11
No. of records with gender 1 in cluster 13 is 10
No. of records with gender 2 in cluster 13 is 5
Records found in cluster 14 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1273 0.0 1.0000 14
1605 2.0 1.0000 14
1761 2.0 1.0000 14
1845 1.0 1.0000 14
1987 1.0 1.0000 14
2274 0.0 1.0000 14
3961 0.0 1.0000 14
4092 0.0 0.3411 14
4424 2.0 1.0000 14
5218 2.0 1.0000 14
5336 1.0 1.0000 14
5445 0.0 1.0000 14
6262 2.0 1.0000 14
6289 1.0 1.0000 14
7003 1.0 1.0000 14
7118 2.0 1.0000 14
7431 1.0 1.0000 14
7540 0.0 0.6859 14
7791 1.0 1.0000 14
8142 2.0 1.0000 14
8601 2.0 0.6700 14
8693 0.0 1.0000 14
9023 1.0 0.6654 14
9265 1.0 1.0000 14
No. of records with gender 0 in cluster 14 is 7
No. of records with gender 1 in cluster 14 is 9
No. of records with gender 2 in cluster 14 is 8
Records found in cluster 15 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1474 1.0 0.3390 15
1582 1.0 1.0000 15
1940 2.0 0.6675 15
3133 0.0 1.0000 15
3252 0.0 1.0000 15
... ... ... ...
14750 1.0 1.0000 15
15816 1.0 1.0000 15
17319 1.0 1.0000 15
17504 0.0 0.6567 15
18609 1.0 1.0000 15
[103 rows x 3 columns]
No. of records with gender 0 in cluster 15 is 34
No. of records with gender 1 in cluster 15 is 31
No. of records with gender 2 in cluster 15 is 38
Records found in cluster 16 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1646 0.0 0.6576 16
1868 0.0 1.0000 16
3803 0.0 1.0000 16
3962 2.0 1.0000 16
5400 0.0 1.0000 16
5401 2.0 0.6836 16
5407 2.0 0.6785 16
5408 2.0 1.0000 16
5409 0.0 1.0000 16
5412 2.0 1.0000 16
5427 1.0 1.0000 16
5429 0.0 1.0000 16
5433 2.0 0.6736 16
5434 1.0 1.0000 16
5436 2.0 0.6602 16
5442 1.0 0.3409 16
5443 2.0 0.6483 16
5447 1.0 1.0000 16
5448 2.0 0.6654 16
5449 1.0 1.0000 16
5456 0.0 1.0000 16
5457 2.0 0.6468 16
5466 2.0 1.0000 16
5470 2.0 1.0000 16
5471 0.0 1.0000 16
5472 0.0 1.0000 16
5480 1.0 1.0000 16
5485 2.0 1.0000 16
5486 1.0 1.0000 16
5487 1.0 0.6669 16
5490 2.0 1.0000 16
5491 2.0 1.0000 16
7364 1.0 1.0000 16
9547 2.0 1.0000 16
17851 2.0 0.6495 16
No. of records with gender 0 in cluster 16 is 9
No. of records with gender 1 in cluster 16 is 9
No. of records with gender 2 in cluster 16 is 17
Records found in cluster 17 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
1673 1.0 1.0000 17
2702 1.0 1.0000 17
7600 0.0 1.0000 17
7601 2.0 0.6609 17
7611 0.0 0.6666 17
7613 2.0 1.0000 17
7614 2.0 0.6866 17
7615 2.0 1.0000 17
7620 1.0 0.6549 17
7621 1.0 1.0000 17
7622 2.0 1.0000 17
7626 0.0 1.0000 17
7627 0.0 0.7037 17
7629 2.0 1.0000 17
7652 0.0 0.6772 17
7655 1.0 1.0000 17
7662 0.0 1.0000 17
7665 2.0 0.6832 17
7667 0.0 1.0000 17
7669 2.0 1.0000 17
7670 1.0 1.0000 17
7672 2.0 1.0000 17
7679 1.0 1.0000 17
7680 1.0 1.0000 17
7681 1.0 1.0000 17
7686 2.0 1.0000 17
7694 2.0 1.0000 17
7697 1.0 1.0000 17
16509 1.0 1.0000 17
No. of records with gender 0 in cluster 17 is 7
No. of records with gender 1 in cluster 17 is 11
No. of records with gender 2 in cluster 17 is 11
Records found in cluster 18 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2046 0.0 0.6531 18
3257 2.0 1.0000 18
7002 2.0 1.0000 18
7016 0.0 1.0000 18
7017 2.0 0.6646 18
7033 1.0 1.0000 18
7040 1.0 1.0000 18
7043 0.0 1.0000 18
7048 2.0 1.0000 18
7052 2.0 0.6595 18
7053 2.0 1.0000 18
7058 1.0 1.0000 18
7062 0.0 1.0000 18
7065 2.0 1.0000 18
7087 2.0 0.6671 18
7091 1.0 0.6642 18
7095 2.0 1.0000 18
7096 2.0 0.6782 18
7097 2.0 0.6788 18
8775 1.0 0.6609 18
No. of records with gender 0 in cluster 18 is 4
No. of records with gender 1 in cluster 18 is 5
No. of records with gender 2 in cluster 18 is 11
Records found in cluster 19 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2135 2.0 1.0000 19
3978 1.0 1.0000 19
5034 0.0 1.0000 19
5208 0.0 1.0000 19
5364 2.0 1.0000 19
5513 0.0 1.0000 19
5677 1.0 1.0000 19
5817 0.0 1.0000 19
5929 1.0 1.0000 19
6085 0.0 1.0000 19
6257 2.0 0.6874 19
6679 1.0 1.0000 19
6819 2.0 0.6537 19
7029 0.0 1.0000 19
7121 0.0 1.0000 19
9044 1.0 1.0000 19
No. of records with gender 0 in cluster 19 is 7
No. of records with gender 1 in cluster 19 is 5
No. of records with gender 2 in cluster 19 is 4
Records found in cluster 20 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2138 1.0 1.0000 20
2145 0.0 1.0000 20
2146 1.0 1.0000 20
2147 1.0 1.0000 20
2148 1.0 0.3576 20
2156 0.0 1.0000 20
2166 1.0 1.0000 20
2168 0.0 0.6825 20
2169 1.0 1.0000 20
2171 1.0 1.0000 20
2172 0.0 1.0000 20
2182 2.0 1.0000 20
2185 0.0 1.0000 20
2186 0.0 0.3403 20
2187 1.0 1.0000 20
2188 2.0 0.6812 20
2189 0.0 0.6582 20
2191 0.0 1.0000 20
2194 1.0 1.0000 20
2196 1.0 1.0000 20
2204 1.0 0.6587 20
2205 0.0 0.6685 20
2206 1.0 0.6551 20
2207 1.0 1.0000 20
2210 1.0 1.0000 20
2216 1.0 0.6896 20
2217 1.0 0.6832 20
2220 1.0 1.0000 20
2223 2.0 1.0000 20
14626 0.0 1.0000 20
No. of records with gender 0 in cluster 20 is 10
No. of records with gender 1 in cluster 20 is 17
No. of records with gender 2 in cluster 20 is 3
Records found in cluster 21 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2240 0.0 1.0000 21
3269 1.0 0.3394 21
9001 1.0 1.0000 21
9020 0.0 1.0000 21
9028 1.0 0.6849 21
9033 0.0 1.0000 21
9038 1.0 0.6667 21
9043 1.0 1.0000 21
9046 2.0 0.6745 21
9050 1.0 0.6658 21
9052 2.0 0.6826 21
9054 1.0 1.0000 21
9055 1.0 1.0000 21
9056 2.0 1.0000 21
9061 0.0 1.0000 21
9064 2.0 1.0000 21
9069 2.0 0.6595 21
9070 0.0 1.0000 21
9072 1.0 0.6774 21
9076 2.0 1.0000 21
9079 0.0 1.0000 21
9080 1.0 0.6532 21
9081 0.0 1.0000 21
9082 0.0 1.0000 21
9083 0.0 1.0000 21
9089 1.0 1.0000 21
9952 2.0 0.3548 21
14813 1.0 0.6875 21
15564 0.0 1.0000 21
18157 1.0 1.0000 21
No. of records with gender 0 in cluster 21 is 10
No. of records with gender 1 in cluster 21 is 13
No. of records with gender 2 in cluster 21 is 7
Records found in cluster 22 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2512 1.0 1.0000 22
8502 2.0 1.0000 22
8505 1.0 1.0000 22
8506 2.0 1.0000 22
8507 0.0 1.0000 22
8520 2.0 0.6820 22
8525 0.0 1.0000 22
8528 1.0 1.0000 22
8531 2.0 0.6681 22
8535 2.0 1.0000 22
8540 2.0 1.0000 22
8541 2.0 1.0000 22
8542 2.0 1.0000 22
8546 1.0 1.0000 22
8553 2.0 1.0000 22
8554 0.0 1.0000 22
8557 0.0 1.0000 22
8562 0.0 1.0000 22
8563 1.0 1.0000 22
8564 2.0 1.0000 22
8565 0.0 0.6862 22
8568 1.0 1.0000 22
8580 2.0 1.0000 22
8583 0.0 1.0000 22
8586 0.0 0.6453 22
13204 1.0 1.0000 22
16912 1.0 0.6483 22
16945 0.0 1.0000 22
No. of records with gender 0 in cluster 22 is 9
No. of records with gender 1 in cluster 22 is 8
No. of records with gender 2 in cluster 22 is 11
Records found in cluster 23 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2730 1.0 1.0000 23
3086 0.0 1.0000 23
5506 2.0 0.6595 23
5511 1.0 1.0000 23
5524 0.0 0.6722 23
5541 0.0 1.0000 23
5542 2.0 1.0000 23
5544 1.0 0.3374 23
5546 1.0 1.0000 23
5552 2.0 1.0000 23
5558 2.0 1.0000 23
5559 2.0 0.6745 23
5560 1.0 1.0000 23
5561 0.0 1.0000 23
5563 2.0 1.0000 23
5564 2.0 1.0000 23
5566 1.0 0.6607 23
5570 2.0 1.0000 23
5572 1.0 1.0000 23
5579 1.0 1.0000 23
5583 2.0 1.0000 23
5588 0.0 0.6795 23
5597 0.0 1.0000 23
5598 0.0 1.0000 23
6067 1.0 1.0000 23
10803 0.0 1.0000 23
12037 0.0 1.0000 23
12202 0.0 1.0000 23
14307 2.0 0.6617 23
16093 0.0 0.3575 23
17031 1.0 1.0000 23
17498 1.0 1.0000 23
No. of records with gender 0 in cluster 23 is 11
No. of records with gender 1 in cluster 23 is 11
No. of records with gender 2 in cluster 23 is 10
Records found in cluster 24 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
2928 2.0 0.6734 24
7703 1.0 1.0000 24
7705 1.0 1.0000 24
7727 2.0 1.0000 24
7738 2.0 1.0000 24
7743 0.0 1.0000 24
7745 1.0 1.0000 24
7746 2.0 1.0000 24
7747 2.0 0.6745 24
7748 2.0 1.0000 24
7751 2.0 1.0000 24
7752 1.0 0.6649 24
7757 2.0 1.0000 24
7759 2.0 1.0000 24
7760 2.0 1.0000 24
7761 1.0 1.0000 24
7793 0.0 0.6691 24
7797 2.0 0.6600 24
10622 1.0 0.6692 24
No. of records with gender 0 in cluster 24 is 2
No. of records with gender 1 in cluster 24 is 6
No. of records with gender 2 in cluster 24 is 11
Records found in cluster 25 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
3581 0.0 1.0000 25
3705 2.0 0.6581 25
3809 2.0 1.0000 25
3906 1.0 0.6422 25
4041 0.0 1.0000 25
4156 1.0 1.0000 25
4272 2.0 1.0000 25
4341 0.0 1.0000 25
4410 2.0 1.0000 25
4508 1.0 1.0000 25
4631 2.0 1.0000 25
4736 2.0 1.0000 25
4840 2.0 1.0000 25
5305 1.0 1.0000 25
No. of records with gender 0 in cluster 25 is 3
No. of records with gender 1 in cluster 25 is 4
No. of records with gender 2 in cluster 25 is 7
Records found in cluster 26 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
3744 0.0 0.6440 26
3927 0.0 1.0000 26
3994 1.0 1.0000 26
4057 2.0 0.3516 26
4300 2.0 0.6736 26
4398 1.0 1.0000 26
4470 2.0 0.6602 26
4544 0.0 1.0000 26
4640 2.0 1.0000 26
4800 2.0 0.6575 26
4883 2.0 1.0000 26
5043 1.0 1.0000 26
5238 1.0 1.0000 26
5325 1.0 0.6645 26
5515 2.0 1.0000 26
5659 1.0 1.0000 26
5978 2.0 1.0000 26
6188 2.0 0.6748 26
6440 2.0 1.0000 26
6562 0.0 1.0000 26
6671 2.0 1.0000 26
6749 1.0 1.0000 26
6826 2.0 0.6933 26
7050 0.0 0.6736 26
No. of records with gender 0 in cluster 26 is 5
No. of records with gender 1 in cluster 26 is 7
No. of records with gender 2 in cluster 26 is 12
Records found in cluster 27 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
4093 2.0 1.0000 27
4485 2.0 1.0000 27
4893 2.0 1.0000 27
6095 2.0 0.6602 27
6412 2.0 1.0000 27
7079 2.0 1.0000 27
8501 0.0 1.0000 27
8968 2.0 1.0000 27
9965 0.0 1.0000 27
10058 2.0 1.0000 27
10070 1.0 1.0000 27
10084 0.0 1.0000 27
10092 1.0 1.0000 27
10102 2.0 1.0000 27
10116 2.0 1.0000 27
10131 0.0 1.0000 27
10143 2.0 1.0000 27
10167 1.0 0.3495 27
10256 0.0 1.0000 27
10658 1.0 1.0000 27
11280 0.0 1.0000 27
14155 0.0 1.0000 27
14888 1.0 1.0000 27
No. of records with gender 0 in cluster 27 is 7
No. of records with gender 1 in cluster 27 is 5
No. of records with gender 2 in cluster 27 is 11
Records found in cluster 28 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
4606 0.0 1.0000 28
4608 0.0 0.6618 28
4615 2.0 0.6590 28
4621 1.0 1.0000 28
4627 2.0 1.0000 28
4643 0.0 1.0000 28
4657 2.0 0.6751 28
4664 1.0 1.0000 28
4674 2.0 1.0000 28
4675 2.0 1.0000 28
4685 2.0 1.0000 28
4690 0.0 0.6763 28
4691 0.0 1.0000 28
4710 2.0 1.0000 28
4712 0.0 1.0000 28
4717 2.0 1.0000 28
4720 2.0 1.0000 28
4722 2.0 0.6686 28
4731 1.0 1.0000 28
4743 2.0 1.0000 28
4746 1.0 1.0000 28
4772 2.0 1.0000 28
4778 1.0 0.3592 28
4780 2.0 1.0000 28
4781 2.0 0.6475 28
4782 1.0 0.6697 28
4783 2.0 1.0000 28
4785 2.0 0.6811 28
4789 2.0 1.0000 28
4790 1.0 1.0000 28
4798 2.0 0.6736 28
4799 0.0 1.0000 28
6627 2.0 1.0000 28
6629 1.0 1.0000 28
6633 0.0 1.0000 28
6650 2.0 1.0000 28
6654 1.0 1.0000 28
6660 2.0 1.0000 28
6664 2.0 1.0000 28
6665 1.0 1.0000 28
6668 0.0 1.0000 28
6670 0.0 1.0000 28
6678 1.0 1.0000 28
6685 2.0 1.0000 28
6688 2.0 1.0000 28
11370 2.0 1.0000 28
No. of records with gender 0 in cluster 28 is 10
No. of records with gender 1 in cluster 28 is 11
No. of records with gender 2 in cluster 28 is 25
Records found in cluster 29 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
4906 2.0 0.6681 29
4908 0.0 1.0000 29
4909 2.0 1.0000 29
4910 0.0 1.0000 29
4912 1.0 1.0000 29
4917 1.0 0.6571 29
4918 0.0 1.0000 29
4923 2.0 1.0000 29
4924 2.0 0.6585 29
4929 1.0 1.0000 29
4934 1.0 0.6571 29
4937 2.0 1.0000 29
4944 1.0 0.6711 29
4949 2.0 1.0000 29
4950 1.0 1.0000 29
4951 1.0 1.0000 29
4961 0.0 1.0000 29
4962 1.0 1.0000 29
4965 2.0 0.6695 29
4967 0.0 1.0000 29
4968 1.0 1.0000 29
4970 0.0 1.0000 29
4973 1.0 1.0000 29
4990 1.0 1.0000 29
4997 2.0 0.6957 29
4999 2.0 0.6884 29
13476 2.0 0.6742 29
16183 0.0 1.0000 29
18336 1.0 1.0000 29
No. of records with gender 0 in cluster 29 is 7
No. of records with gender 1 in cluster 29 is 12
No. of records with gender 2 in cluster 29 is 10
Records found in cluster 30 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5002 1.0 1.0000 30
5007 0.0 1.0000 30
5014 1.0 1.0000 30
5017 0.0 1.0000 30
5021 1.0 1.0000 30
5030 0.0 1.0000 30
5049 1.0 0.6787 30
5065 1.0 1.0000 30
5069 2.0 0.6832 30
5075 2.0 0.6692 30
5084 2.0 1.0000 30
5086 0.0 1.0000 30
5088 2.0 1.0000 30
5090 2.0 1.0000 30
5094 1.0 1.0000 30
5095 2.0 0.6848 30
11521 2.0 0.6792 30
17014 2.0 1.0000 30
No. of records with gender 0 in cluster 30 is 4
No. of records with gender 1 in cluster 30 is 6
No. of records with gender 2 in cluster 30 is 8
Records found in cluster 31 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5100 0.0 1.0000 31
5120 2.0 1.0000 31
5123 2.0 0.6741 31
5136 0.0 1.0000 31
5149 2.0 1.0000 31
5153 2.0 0.6735 31
5156 2.0 0.6516 31
5161 0.0 1.0000 31
5170 2.0 0.6606 31
5175 2.0 1.0000 31
5176 1.0 1.0000 31
5180 1.0 1.0000 31
5181 2.0 1.0000 31
5182 0.0 0.6801 31
5185 2.0 0.6822 31
5187 0.0 1.0000 31
5192 2.0 0.6835 31
No. of records with gender 0 in cluster 31 is 5
No. of records with gender 1 in cluster 31 is 2
No. of records with gender 2 in cluster 31 is 10
Records found in cluster 32 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5200 0.0 1.0000 32
5203 1.0 1.0000 32
5205 1.0 0.6748 32
5209 1.0 1.0000 32
5211 0.0 0.6738 32
5217 0.0 1.0000 32
5227 1.0 1.0000 32
5232 1.0 1.0000 32
5234 1.0 1.0000 32
5242 1.0 1.0000 32
5256 2.0 0.6475 32
5262 0.0 0.6457 32
5264 0.0 1.0000 32
5265 1.0 1.0000 32
5266 0.0 1.0000 32
5270 2.0 1.0000 32
5271 2.0 0.6812 32
5272 2.0 1.0000 32
5284 1.0 0.6815 32
5289 0.0 1.0000 32
5291 2.0 0.6333 32
5297 0.0 1.0000 32
10620 1.0 1.0000 32
13921 2.0 0.6771 32
18824 2.0 1.0000 32
No. of records with gender 0 in cluster 32 is 8
No. of records with gender 1 in cluster 32 is 10
No. of records with gender 2 in cluster 32 is 7
Records found in cluster 33 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5705 1.0 1.0000 33
5709 2.0 0.6860 33
5711 2.0 1.0000 33
5712 1.0 1.0000 33
5726 2.0 0.6735 33
5746 2.0 0.3410 33
5752 2.0 0.6747 33
5754 1.0 1.0000 33
5757 1.0 1.0000 33
5766 2.0 1.0000 33
5767 2.0 1.0000 33
5768 1.0 0.3631 33
5770 2.0 1.0000 33
5773 2.0 0.6769 33
5777 2.0 0.6638 33
5782 1.0 1.0000 33
5786 2.0 1.0000 33
5790 0.0 1.0000 33
5792 2.0 1.0000 33
5793 2.0 0.6675 33
5794 2.0 1.0000 33
5798 2.0 1.0000 33
10582 2.0 0.6383 33
11935 2.0 1.0000 33
15021 0.0 1.0000 33
16688 0.0 1.0000 33
No. of records with gender 0 in cluster 33 is 3
No. of records with gender 1 in cluster 33 is 6
No. of records with gender 2 in cluster 33 is 17
Records found in cluster 34 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
5901 2.0 1.0000 34
5902 0.0 0.6462 34
5904 0.0 1.0000 34
5910 0.0 0.6787 34
5914 2.0 1.0000 34
5930 0.0 0.6512 34
5932 0.0 1.0000 34
5934 2.0 1.0000 34
5935 2.0 1.0000 34
5936 2.0 0.6836 34
5945 2.0 1.0000 34
5952 0.0 1.0000 34
5954 0.0 1.0000 34
5956 2.0 1.0000 34
5961 2.0 1.0000 34
5962 1.0 1.0000 34
5963 0.0 1.0000 34
5964 1.0 1.0000 34
5965 2.0 0.6764 34
5966 2.0 0.6842 34
5973 2.0 0.6509 34
5986 0.0 1.0000 34
5989 2.0 1.0000 34
5990 0.0 0.6713 34
16757 1.0 1.0000 34
No. of records with gender 0 in cluster 34 is 10
No. of records with gender 1 in cluster 34 is 3
No. of records with gender 2 in cluster 34 is 12
Records found in cluster 35 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
6101 1.0 0.6543 35
6102 0.0 0.6699 35
6103 0.0 1.0000 35
6109 0.0 1.0000 35
6129 2.0 0.6778 35
6131 0.0 1.0000 35
6133 0.0 0.6655 35
6134 0.0 1.0000 35
6147 2.0 0.6540 35
6149 0.0 1.0000 35
6151 2.0 0.6642 35
6156 2.0 1.0000 35
6158 1.0 1.0000 35
6164 1.0 1.0000 35
6167 2.0 0.6742 35
6169 2.0 0.6866 35
6178 1.0 1.0000 35
6180 1.0 1.0000 35
6190 0.0 1.0000 35
6192 2.0 0.6652 35
6197 1.0 0.6513 35
No. of records with gender 0 in cluster 35 is 8
No. of records with gender 1 in cluster 35 is 6
No. of records with gender 2 in cluster 35 is 7
Records found in cluster 36 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
6502 0.0 1.0000 36
6505 2.0 1.0000 36
6516 0.0 1.0000 36
6521 2.0 1.0000 36
6523 1.0 1.0000 36
6540 2.0 1.0000 36
6549 2.0 1.0000 36
6555 2.0 1.0000 36
6556 1.0 1.0000 36
6559 0.0 1.0000 36
6560 0.0 1.0000 36
6565 2.0 0.6534 36
6567 2.0 1.0000 36
6569 2.0 1.0000 36
6575 1.0 1.0000 36
6576 2.0 1.0000 36
6577 2.0 1.0000 36
6579 2.0 0.6762 36
6580 2.0 1.0000 36
6581 1.0 1.0000 36
6583 2.0 1.0000 36
6596 1.0 1.0000 36
6599 2.0 1.0000 36
12899 2.0 1.0000 36
No. of records with gender 0 in cluster 36 is 4
No. of records with gender 1 in cluster 36 is 5
No. of records with gender 2 in cluster 36 is 15
Records found in cluster 37 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
6722 1.0 1.0000 37
6726 0.0 1.0000 37
6728 2.0 0.6634 37
6730 2.0 0.6681 37
6732 1.0 0.6882 37
6742 2.0 0.6625 37
6758 0.0 0.3469 37
6759 1.0 0.6543 37
6772 2.0 1.0000 37
6786 2.0 0.6694 37
6787 2.0 1.0000 37
6788 2.0 1.0000 37
6789 2.0 1.0000 37
6793 1.0 0.6699 37
6795 2.0 0.6741 37
No. of records with gender 0 in cluster 37 is 2
No. of records with gender 1 in cluster 37 is 4
No. of records with gender 2 in cluster 37 is 9
Records found in cluster 38 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7210 0.0 0.6617 38
7215 2.0 1.0000 38
7216 2.0 0.6921 38
7228 2.0 0.6766 38
7230 1.0 1.0000 38
7234 0.0 1.0000 38
7250 2.0 1.0000 38
7258 1.0 0.6902 38
7259 0.0 1.0000 38
7260 2.0 1.0000 38
7266 2.0 1.0000 38
7273 1.0 1.0000 38
7277 0.0 0.3487 38
7284 0.0 0.6661 38
7288 2.0 1.0000 38
7297 2.0 0.6853 38
No. of records with gender 0 in cluster 38 is 5
No. of records with gender 1 in cluster 38 is 3
No. of records with gender 2 in cluster 38 is 8
Records found in cluster 39 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7289 0.0 1.0000 39
12796 1.0 1.0000 39
13303 1.0 1.0000 39
13417 1.0 1.0000 39
13502 1.0 1.0000 39
13716 1.0 0.6830 39
13901 2.0 0.6611 39
14140 0.0 0.6645 39
14214 2.0 1.0000 39
14269 2.0 0.6868 39
14337 1.0 1.0000 39
14412 1.0 1.0000 39
14483 0.0 1.0000 39
14645 1.0 1.0000 39
15443 2.0 1.0000 39
15534 0.0 1.0000 39
15807 0.0 1.0000 39
15916 1.0 1.0000 39
16188 1.0 1.0000 39
16418 2.0 1.0000 39
16672 1.0 1.0000 39
16725 1.0 1.0000 39
17269 0.0 1.0000 39
17351 1.0 0.6556 39
17442 1.0 1.0000 39
17842 0.0 1.0000 39
18412 2.0 0.6690 39
18510 1.0 1.0000 39
18731 1.0 1.0000 39
18738 2.0 1.0000 39
No. of records with gender 0 in cluster 39 is 7
No. of records with gender 1 in cluster 39 is 16
No. of records with gender 2 in cluster 39 is 7
Records found in cluster 40 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7381 2.0 1.0000 40
7470 1.0 0.6810 40
7542 0.0 1.0000 40
7616 2.0 0.6675 40
7675 2.0 1.0000 40
... ... ... ...
15207 1.0 1.0000 40
15391 2.0 1.0000 40
15439 2.0 1.0000 40
15622 2.0 1.0000 40
17122 2.0 0.6583 40
[98 rows x 3 columns]
No. of records with gender 0 in cluster 40 is 25
No. of records with gender 1 in cluster 40 is 38
No. of records with gender 2 in cluster 40 is 35
Records found in cluster 41 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7416 2.0 1.0000 41
7417 1.0 1.0000 41
7418 2.0 1.0000 41
7421 2.0 0.6802 41
7429 2.0 1.0000 41
7430 2.0 0.6812 41
7434 2.0 1.0000 41
7440 0.0 1.0000 41
7441 1.0 1.0000 41
7442 2.0 1.0000 41
7448 0.0 1.0000 41
7458 2.0 1.0000 41
7459 2.0 1.0000 41
7496 2.0 0.6703 41
7497 0.0 0.6799 41
No. of records with gender 0 in cluster 41 is 3
No. of records with gender 1 in cluster 41 is 2
No. of records with gender 2 in cluster 41 is 10
Records found in cluster 42 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
7900 2.0 1.0000 42
7908 2.0 1.0000 42
7910 2.0 1.0000 42
7914 2.0 1.0000 42
7933 1.0 1.0000 42
7953 0.0 1.0000 42
7956 1.0 1.0000 42
7958 1.0 1.0000 42
7959 0.0 0.6823 42
7963 2.0 1.0000 42
7964 2.0 1.0000 42
7966 0.0 0.6607 42
7967 2.0 0.6737 42
7968 2.0 1.0000 42
7973 0.0 1.0000 42
7975 0.0 1.0000 42
7976 0.0 1.0000 42
7977 2.0 0.6739 42
7980 2.0 1.0000 42
7987 0.0 1.0000 42
7991 1.0 1.0000 42
7999 2.0 0.6726 42
No. of records with gender 0 in cluster 42 is 7
No. of records with gender 1 in cluster 42 is 4
No. of records with gender 2 in cluster 42 is 11
Records found in cluster 43 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8024 2.0 1.0000 43
8033 0.0 0.6701 43
8039 1.0 1.0000 43
8046 2.0 1.0000 43
8050 2.0 1.0000 43
8052 0.0 0.7050 43
8055 0.0 1.0000 43
8057 1.0 1.0000 43
8058 2.0 1.0000 43
8059 1.0 1.0000 43
8062 1.0 1.0000 43
8063 1.0 1.0000 43
8065 1.0 0.6688 43
8067 2.0 0.3442 43
8068 1.0 1.0000 43
8070 1.0 0.6698 43
8078 0.0 1.0000 43
8081 2.0 1.0000 43
8085 0.0 1.0000 43
8097 0.0 1.0000 43
16604 1.0 1.0000 43
No. of records with gender 0 in cluster 43 is 6
No. of records with gender 1 in cluster 43 is 9
No. of records with gender 2 in cluster 43 is 6
Records found in cluster 44 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8109 1.0 1.0000 44
8112 0.0 1.0000 44
8113 2.0 0.6675 44
8116 2.0 0.6611 44
8118 1.0 1.0000 44
8122 2.0 0.6623 44
8123 2.0 0.6605 44
8128 0.0 1.0000 44
8132 2.0 0.6665 44
8146 1.0 1.0000 44
8159 2.0 1.0000 44
8165 0.0 1.0000 44
8176 1.0 1.0000 44
8177 2.0 1.0000 44
8178 2.0 1.0000 44
8185 2.0 1.0000 44
8190 2.0 0.6735 44
8191 1.0 0.3568 44
8192 2.0 0.6726 44
8199 2.0 1.0000 44
No. of records with gender 0 in cluster 44 is 3
No. of records with gender 1 in cluster 44 is 5
No. of records with gender 2 in cluster 44 is 12
Records found in cluster 45 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8313 0.0 1.0000 45
8322 1.0 1.0000 45
8327 0.0 0.6763 45
8331 2.0 0.6716 45
8333 2.0 1.0000 45
8337 1.0 1.0000 45
8338 0.0 1.0000 45
8339 0.0 1.0000 45
8340 2.0 0.6707 45
8341 1.0 0.6699 45
8353 2.0 0.6650 45
8356 1.0 0.6517 45
8358 2.0 0.6965 45
8384 0.0 1.0000 45
8385 1.0 1.0000 45
8391 0.0 1.0000 45
No. of records with gender 0 in cluster 45 is 6
No. of records with gender 1 in cluster 45 is 5
No. of records with gender 2 in cluster 45 is 5
Records found in cluster 46 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8401 0.0 0.6732 46
8402 2.0 0.6767 46
8403 2.0 0.6575 46
8407 0.0 0.6763 46
8411 1.0 1.0000 46
8412 1.0 0.6900 46
8429 1.0 1.0000 46
8460 2.0 0.6828 46
8466 0.0 1.0000 46
8470 1.0 1.0000 46
8478 0.0 1.0000 46
8479 2.0 0.3625 46
8487 0.0 0.6806 46
8489 0.0 1.0000 46
8496 0.0 1.0000 46
No. of records with gender 0 in cluster 46 is 7
No. of records with gender 1 in cluster 46 is 4
No. of records with gender 2 in cluster 46 is 4
Records found in cluster 47 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8607 2.0 0.6659 47
8613 2.0 1.0000 47
8616 2.0 1.0000 47
8617 2.0 0.6774 47
8619 0.0 0.6647 47
8620 2.0 0.6975 47
8622 0.0 0.6634 47
8623 2.0 0.6778 47
8624 1.0 1.0000 47
8627 2.0 0.6829 47
8632 2.0 1.0000 47
8638 0.0 1.0000 47
8642 2.0 0.6688 47
8645 2.0 0.6778 47
8647 2.0 1.0000 47
8675 2.0 1.0000 47
8676 1.0 0.6602 47
8677 0.0 0.6772 47
8679 2.0 1.0000 47
8680 2.0 1.0000 47
8681 0.0 0.6507 47
8688 2.0 0.3354 47
8690 2.0 1.0000 47
8691 2.0 0.3595 47
8694 2.0 0.6736 47
8699 0.0 1.0000 47
8749 0.0 0.6548 47
No. of records with gender 0 in cluster 47 is 7
No. of records with gender 1 in cluster 47 is 2
No. of records with gender 2 in cluster 47 is 18
Records found in cluster 48 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8701 1.0 1.0000 48
8711 2.0 1.0000 48
8728 0.0 1.0000 48
8732 2.0 0.6946 48
8739 0.0 1.0000 48
8744 2.0 1.0000 48
8746 2.0 0.6916 48
8764 2.0 0.6674 48
8765 1.0 0.6611 48
8767 0.0 1.0000 48
8769 2.0 1.0000 48
8772 0.0 0.6732 48
8777 0.0 1.0000 48
8779 2.0 1.0000 48
8782 1.0 1.0000 48
8783 2.0 1.0000 48
8784 2.0 1.0000 48
9648 0.0 1.0000 48
10111 2.0 1.0000 48
10551 2.0 0.6362 48
10903 1.0 1.0000 48
11265 1.0 1.0000 48
11650 0.0 1.0000 48
12295 0.0 1.0000 48
12731 2.0 1.0000 48
15770 0.0 0.6808 48
16201 2.0 1.0000 48
No. of records with gender 0 in cluster 48 is 9
No. of records with gender 1 in cluster 48 is 5
No. of records with gender 2 in cluster 48 is 13
Records found in cluster 49 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
8804 2.0 0.6561 49
8834 2.0 1.0000 49
8843 0.0 0.3571 49
8844 2.0 1.0000 49
8849 0.0 0.6906 49
8852 0.0 1.0000 49
8854 0.0 1.0000 49
8855 1.0 0.6440 49
8859 2.0 1.0000 49
8864 0.0 0.3421 49
8865 1.0 1.0000 49
8873 0.0 1.0000 49
8874 1.0 1.0000 49
8878 2.0 0.6640 49
8881 0.0 1.0000 49
8884 1.0 0.6612 49
8886 2.0 0.3536 49
17100 1.0 1.0000 49
No. of records with gender 0 in cluster 49 is 7
No. of records with gender 1 in cluster 49 is 5
No. of records with gender 2 in cluster 49 is 6
Records found in cluster 50 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
9105 2.0 0.6468 50
9109 0.0 0.6553 50
9112 1.0 1.0000 50
9113 0.0 1.0000 50
9115 2.0 0.6771 50
9118 2.0 0.6712 50
9123 2.0 1.0000 50
9125 2.0 1.0000 50
9130 2.0 0.6741 50
9136 2.0 1.0000 50
9144 2.0 1.0000 50
9150 1.0 1.0000 50
9151 1.0 0.6453 50
9152 0.0 1.0000 50
9165 0.0 1.0000 50
9166 2.0 1.0000 50
9178 2.0 0.6698 50
9190 1.0 1.0000 50
9194 2.0 1.0000 50
9195 1.0 1.0000 50
No. of records with gender 0 in cluster 50 is 4
No. of records with gender 1 in cluster 50 is 5
No. of records with gender 2 in cluster 50 is 11
Records found in cluster 51 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
9206 2.0 0.3398 51
9207 2.0 1.0000 51
9212 0.0 1.0000 51
9215 1.0 0.6818 51
9216 2.0 0.6519 51
9217 2.0 0.3376 51
9220 2.0 1.0000 51
9221 2.0 1.0000 51
9225 2.0 1.0000 51
9228 0.0 1.0000 51
9243 0.0 0.3506 51
9249 1.0 0.3542 51
9253 2.0 1.0000 51
9278 1.0 1.0000 51
9280 1.0 1.0000 51
9283 2.0 0.6659 51
9289 2.0 1.0000 51
9293 0.0 1.0000 51
9294 0.0 1.0000 51
11308 2.0 0.6412 51
No. of records with gender 0 in cluster 51 is 5
No. of records with gender 1 in cluster 51 is 4
No. of records with gender 2 in cluster 51 is 11
Records found in cluster 52 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
9515 0.0 0.6648 52
10396 1.0 1.0000 52
10608 1.0 1.0000 52
10796 0.0 0.6912 52
10981 0.0 1.0000 52
11477 2.0 1.0000 52
11770 2.0 1.0000 52
12451 2.0 1.0000 52
12803 1.0 0.6667 52
12996 1.0 1.0000 52
13263 2.0 0.6743 52
13436 0.0 1.0000 52
14141 0.0 1.0000 52
14290 0.0 1.0000 52
14473 0.0 1.0000 52
14878 2.0 0.6502 52
15088 0.0 0.6581 52
15727 2.0 1.0000 52
16605 0.0 0.6578 52
16973 0.0 1.0000 52
17197 1.0 1.0000 52
17330 0.0 1.0000 52
17728 1.0 0.6702 52
18071 2.0 1.0000 52
18531 2.0 1.0000 52
No. of records with gender 0 in cluster 52 is 11
No. of records with gender 1 in cluster 52 is 6
No. of records with gender 2 in cluster 52 is 8
Records found in cluster 53 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
9856 2.0 1.0000 53
10150 1.0 1.0000 53
10237 2.0 1.0000 53
10471 1.0 1.0000 53
10633 2.0 0.6545 53
10849 2.0 1.0000 53
10964 2.0 1.0000 53
11050 0.0 1.0000 53
11251 1.0 0.6715 53
11356 2.0 1.0000 53
11429 2.0 1.0000 53
11653 0.0 1.0000 53
11767 2.0 1.0000 53
11842 1.0 1.0000 53
11930 1.0 1.0000 53
12045 1.0 1.0000 53
12284 1.0 1.0000 53
12397 0.0 1.0000 53
12507 2.0 1.0000 53
12659 2.0 1.0000 53
12754 2.0 0.6615 53
No. of records with gender 0 in cluster 53 is 3
No. of records with gender 1 in cluster 53 is 7
No. of records with gender 2 in cluster 53 is 11
Records found in cluster 54 from DBSCAN in Exp 3
gender gender:confidence Cluster_Label
10812 1.0 0.6827 54
12073 1.0 1.0000 54
13106 1.0 0.6574 54
14855 2.0 1.0000 54
15950 2.0 1.0000 54
16388 2.0 1.0000 54
16854 2.0 1.0000 54
17041 1.0 1.0000 54
17154 1.0 1.0000 54
17297 0.0 1.0000 54
17565 1.0 1.0000 54
17677 1.0 1.0000 54
17868 2.0 0.3354 54
18092 0.0 1.0000 54
18246 1.0 1.0000 54
18302 1.0 1.0000 54
18399 0.0 1.0000 54
18527 1.0 1.0000 54
18646 0.0 1.0000 54
18759 0.0 0.6386 54
No. of records with gender 0 in cluster 54 is 5
No. of records with gender 1 in cluster 54 is 10
No. of records with gender 2 in cluster 54 is 5
Records classified as noise
gender gender:confidence Cluster_Label
599 1.0 1.0000 -1
635 1.0 1.0000 -1
1268 2.0 1.0000 -1
1367 1.0 1.0000 -1
1544 0.0 1.0000 -1
2154 1.0 0.6561 -1
2243 2.0 1.0000 -1
2382 1.0 1.0000 -1
2682 1.0 0.6473 -1
2897 2.0 1.0000 -1
3341 1.0 1.0000 -1
3360 1.0 1.0000 -1
3526 1.0 1.0000 -1
3938 2.0 0.6545 -1
4051 2.0 1.0000 -1
4650 2.0 0.3571 -1
5424 0.0 1.0000 -1
5548 2.0 1.0000 -1
6140 2.0 0.6679 -1
6313 1.0 1.0000 -1
6616 1.0 1.0000 -1
6620 2.0 1.0000 -1
7107 2.0 0.6865 -1
7610 2.0 0.6578 -1
7651 0.0 0.6637 -1
8509 2.0 0.6731 -1
8579 2.0 1.0000 -1
8798 1.0 1.0000 -1
8836 0.0 0.6645 -1
9305 2.0 0.6606 -1
11119 1.0 1.0000 -1
11727 2.0 1.0000 -1
12333 1.0 1.0000 -1
12992 0.0 1.0000 -1
13486 2.0 1.0000 -1
14046 0.0 1.0000 -1
14958 2.0 1.0000 -1
15597 1.0 0.3362 -1
16706 0.0 1.0000 -1
17186 1.0 1.0000 -1
17599 0.0 0.6654 -1
18270 0.0 1.0000 -1
---- VISUALIZE THE METRIC EVALUATION ----
REGRESSION¶
In [4]:
# =============================== REGRESSION ======================================
print()
print()
df_preprocessed_reg = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed_reg = df_preprocessed_reg.drop(['gender', "gender:confidence"], axis=1)
print()
print("=" * 50)
print('Boosted Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed_reg, y, test_size=0.6, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model
boosted_reg.fit(X_train, y_train)
# Make predictions
y_pred = boosted_reg.predict(X_test)
y_pred_train = boosted_reg.predict(X_train)
y_tot_pred = boosted_reg.predict(df_preprocessed_reg)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# FEATURE IMPORTANCE
print()
print("Performing feature importance analysis...")
# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('text_')]
# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]
# Output the results
# print("desc_ column indices:", desc_columns)
# print("text_ column indices:", text_columns)
# print("desc_ array:\n", desc_array)
# print("text_ array:\n", text_array)
# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])
# Create a new DataFrame
new_data = {}
# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
boosted_reg.feature_importances_
# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed_reg.columns)) if i not in desc_columns and i not in text_columns]
for i in other_columns:
col_name = df_preprocessed_reg.columns[i]
new_data[col_name] = [boosted_reg.feature_importances_[i]]
# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)
# Output the results
print(feature_importance)
# Plot feature importance
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')
df_melted = df_melted.sort_values(ascending=False, by="Importance in percentage")
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()
# preprocess dataset for plots with regression results
df_preprocessed_diff = df_preprocessed_reg.copy()
df_preprocessed_diff['difference'] = (y.to_numpy() - y_tot_pred)
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset
# filtering out coloumns that might be false mistaken
misclassified_df_reg = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
misclassified_df = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]
# plotting these columns
def scatterplot_mistaken_points(misclassified_df, X_train, model):
# Edit misclassified_df to include 'in X_train'
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Set the main title
fig.suptitle(f'{model}\nGender Confidence of "Mistaken" Records', fontsize=16)
# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
[df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset')
axes[0].set_ylabel('Predicted')
axes[0].set_title(f'Training Set\nSample Size: {len(df_in_X_train)}')
# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
[df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset')
axes[1].set_ylabel('Predicted')
axes[1].set_title(f'Not Training Set\nSample Size: {len(df_not_in_X_train)}')
plt.tight_layout()
plt.show()
def scatter_plot(y, y_tot_pred, model):
# Plotting more results results
plt.figure(figsize=(10, 8))
plt.scatter(y, y_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset', fontsize=12)
plt.ylabel('Predicted', fontsize=12)
plt.suptitle(model, fontsize=16)
plt.title('Gender Confidence Comparison', fontsize=14)
plt.show()
scatterplot_mistaken_points(misclassified_df, X_train, "Boosted Regression Tree with Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree with Vectorised Text/Desc Features")
# ==============================analyze without text features=============================================
columns_to_drop = [col for col in df_preprocessed_reg.columns if col.startswith(('desc_', 'text_'))]
df_preprocessed_non_text = df_preprocessed_reg.drop(columns=columns_to_drop)
df_preprocessed_non_text2 = df_preprocessed_non_text.copy()
print(df_preprocessed_non_text)
print()
print("=" * 50)
print('Boosted Regression Tree without Vectorised Text/Desc Features')
print("=" * 50)
boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.6, random_state=42)
# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)
y_pred_train = boosted_reg_non_text.predict(X_train_non_text)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test_non_text, y_pred)
mse_train = mean_squared_error(y_train_non_text, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# Get feature importances and plot from the model
print()
print("Performing feature importance analysis...")
feature_importances = boosted_reg_non_text.feature_importances_
column_names = X_train_non_text.columns
feature_importance_df = pd.DataFrame({
'Feature': column_names,
'Importance in percentage': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()
# adding the dataset gender confidence
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset
# Inspecting coulumns that could be suspicous
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred
misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]
scatterplot_mistaken_points(misclassified_df, X_train_non_text, "Boosted Regression Tree without Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree without Vectorised Text/Desc Features")
# ====================================Analyzing with a linear regression (Least Squares Implementation)====================
print()
print("=" * 50)
print('Linear Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)
X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed_reg)
model = sm.OLS(y_train, X_train_lin) # Ordinary least squares (unregularized)
results = model.fit()
# run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Linear Regression Tree with Vectorised Textual Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
# final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred
# identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]
misclassified_df_lin_reg = misclassified_df.copy()
scatter_plot(y, y_lin_tot_pred, "Linear Regression Tree with Vectorised Text/Desc Features")
scatterplot_mistaken_points(misclassified_df, X_train_lin, "Linear Regression Tree with Vectorised Text/Desc Features")
#================================Lin reg without text=======================================================
#================================Linear regression without text features============================
print()
print("=" * 50)
print('Linear Regression Tree without Vectorised Text/Desc Features')
print("=" * 50)
X_train_lin = sm.add_constant(X_train_non_text)
X_test_lin = sm.add_constant(X_test_non_text)
df_preprocessed_lin = sm.add_constant(df_preprocessed_non_text2)
model = sm.OLS(y_train, X_train_lin) # Ordinary least squares (unregularized)
results = model.fit()
#run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)
print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Linear Regression Tree without Vectorised Textual Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()
#final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred
#identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]
scatter_plot(y, y_lin_tot_pred, "Linear Regression Tree without Vectorised Text/Desc Features")
scatterplot_mistaken_points(misclassified_df, X_train_lin, "Linear Regression Tree without Vectorised Text/Desc Features")
# ================================Identity final mistaken samples====================================
common_samples = misclassified_df_reg.index.intersection(misclassified_df.index)
common_df = misclassified_df.loc[common_samples]
scatterplot_mistaken_points(common_df, X_train_lin, "Boosted and Linear Regression Trees (Intersection) with Vectorised Text/Desc Features")
================================================== Boosted Regression Tree with Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0266 Mean Squared Error (Test): 0.0290 Mean Squared Error (Total): 0.0280
Performing feature importance analysis...
desc text favorites_per_day retweets_per_day tweets_per_day \
0 0.308771 0.364314 0.021232 0.0 0.121167
profile_created_year tweet_created_year link_R link_G link_B \
0 0.155415 0.0 0.000336 0.011339 0.000434
sidebar_R sidebar_G sidebar_B
0 0.005375 0.006886 0.00473
favorites_per_day retweets_per_day tweets_per_day \
0 0.000000 0.000000 28.149163
1 0.015554 0.000000 1.708829
2 2.147321 0.000279 1.567243
3 0.036207 0.000000 0.303459
4 9.794751 0.000000 8.257743
... ... ... ...
18831 0.090609 0.000000 0.234923
18832 0.568809 0.000000 3.060887
18833 0.011364 0.000000 6.004318
18834 16.333103 0.000000 12.934948
18835 0.878510 0.000000 0.766728
profile_created_year tweet_created_year link_R link_G link_B \
0 2013 2015 8 194 194
1 2012 2015 0 132 180
2 2014 2015 171 184 194
3 2009 2015 0 132 180
4 2014 2015 59 148 217
... ... ... ... ... ...
18831 2015 2015 0 132 180
18832 2012 2015 207 185 41
18833 2012 2015 0 132 180
18834 2012 2015 146 102 204
18835 2014 2015 0 132 180
sidebar_R sidebar_G sidebar_B
0 255 255 255
1 192 222 237
2 192 222 237
3 192 222 237
4 0 0 0
... ... ... ...
18831 192 222 237
18832 0 0 0
18833 192 222 237
18834 0 0 0
18835 192 222 237
[18836 rows x 11 columns]
==================================================
Boosted Regression Tree without Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0275
Mean Squared Error (Test): 0.0292
Mean Squared Error (Total): 0.0280
Performing feature importance analysis...
================================================== Linear Regression Tree with Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0166 Mean Squared Error (Test): 0.0499 Mean Squared Error (Total): 0.0366
================================================== Linear Regression Tree without Vectorised Text/Desc Features ================================================== Mean Squared Error (Train): 0.0292 Mean Squared Error (Test): 0.0305 Mean Squared Error (Total): 0.0300
CLASSIFICATION¶
In [5]:
# ============================== CLASSIFICATION ==============================
print()
print()
print('---- CLASSIFICATION ----')
# Features and target
X = df_preprocessed.drop(columns=['gender']) # Assuming 'gender' is the target variable
y = df_preprocessed['gender']
# Standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
rf_classifier.fit(X_train, y_train)
# Predict on test data
y_pred_rf = rf_classifier.predict(X_test)
# Evaluate the performance
print("Accuracy Score: ", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
# Train the model
xgb_model.fit(X_train, y_train)
# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
# Evaluate the model
print("\nXGBoost Classifier Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
# Initialize LightGBM classifier
lgb_clf = lgb.LGBMClassifier(n_estimators=100, random_state=42)
# Fit the model
lgb_clf.fit(X_train, y_train)
# Predict
y_pred_lgb = lgb_clf.predict(X_test)
# Evaluation
print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb))
# Helper function to plot confusion matrix
def plot_confusion_matrix(y_test, y_pred, model_name):
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title(f'{model_name} Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
# Helper function to extract and display classification report with model name
def get_classification_report(y_test, y_pred, model_name):
report = classification_report(y_test, y_pred, output_dict=True)
df = pd.DataFrame(report).transpose()
df['model'] = model_name
return df
# Random Forest Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_rf, "Random Forest")
rf_report = get_classification_report(y_test, y_pred_rf, "Random Forest")
# XGBoost Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_xgb, "XGBoost")
xgb_report = get_classification_report(y_test, y_pred_xgb, "XGBoost")
# LightGBM Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_lgb, "LightGBM")
lgb_report = get_classification_report(y_test, y_pred_lgb, "LightGBM")
# Combine all reports
combined_report = pd.concat([rf_report, xgb_report, lgb_report])
# Debugging Step: Check the combined report structure
print("Combined Classification Report:\n", combined_report.head())
# Filter out rows for precision, recall, and f1-score
combined_report_filtered = combined_report[
combined_report.index.isin(['0', '1']) # Filter for the classes
].reset_index()
# Debugging Step: Check the filtered report structure
print("Filtered Report for Precision, Recall, and F1-Score:\n", combined_report_filtered.head())
# Plot Precision, Recall, and F1-Score for each model
metrics = ['precision', 'recall', 'f1-score']
for metric in metrics:
# Debugging Step: Filter for specific metric
print(f"Data for {metric}:")
print(combined_report_filtered[['index', metric, 'model']])
plt.figure(figsize=(10, 6))
sns.barplot(
x="index",
y=metric,
hue="model",
data=combined_report_filtered[['index', metric, 'model']]
)
plt.title(f'{metric.capitalize()} Comparison')
plt.ylabel(metric.capitalize())
plt.xlabel('Class (0 = Human, 1 = Non-Human)')
plt.show()
# Accuracy comparison
accuracies = {
'Random Forest': accuracy_score(y_test, y_pred_rf),
'XGBoost': accuracy_score(y_test, y_pred_xgb),
'LightGBM': accuracy_score(y_test, y_pred_lgb)
}
plt.figure(figsize=(6, 4))
plt.bar(accuracies.keys(), accuracies.values(), color=['blue', 'green', 'red'])
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.show()
---- CLASSIFICATION ----
Accuracy Score: 0.6242038216560509
Confusion Matrix:
[[661 470 136]
[284 932 102]
[250 174 759]]
Classification Report:
precision recall f1-score support
0 0.55 0.52 0.54 1267
1 0.59 0.71 0.64 1318
2 0.76 0.64 0.70 1183
accuracy 0.62 3768
macro avg 0.64 0.62 0.63 3768
weighted avg 0.63 0.62 0.62 3768
XGBoost Classifier Report:
precision recall f1-score support
0 0.56 0.54 0.55 1267
1 0.61 0.65 0.63 1318
2 0.72 0.67 0.69 1183
accuracy 0.62 3768
macro avg 0.63 0.62 0.62 3768
weighted avg 0.62 0.62 0.62 3768
Accuracy: 0.6220806794055201
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36890
[LightGBM] [Info] Number of data points in the train set: 15068, number of used features: 1766
[LightGBM] [Info] Start training from score -1.117843
[LightGBM] [Info] Start training from score -1.029513
[LightGBM] [Info] Start training from score -1.152536
LightGBM Classification Report:
precision recall f1-score support
0 0.57 0.55 0.56 1267
1 0.61 0.65 0.63 1318
2 0.72 0.69 0.70 1183
accuracy 0.63 3768
macro avg 0.63 0.63 0.63 3768
weighted avg 0.63 0.63 0.63 3768
Combined Classification Report:
precision recall f1-score support model
0 0.553138 0.521705 0.536962 1267.000000 Random Forest
1 0.591371 0.707132 0.644091 1318.000000 Random Forest
2 0.761284 0.641589 0.696330 1183.000000 Random Forest
accuracy 0.624204 0.624204 0.624204 0.624204 Random Forest
macro avg 0.635264 0.623475 0.625794 3768.000000 Random Forest
Filtered Report for Precision, Recall, and F1-Score:
index precision recall f1-score support model
0 0 0.553138 0.521705 0.536962 1267.0 Random Forest
1 1 0.591371 0.707132 0.644091 1318.0 Random Forest
2 0 0.556275 0.542226 0.549161 1267.0 XGBoost
3 1 0.605356 0.651745 0.627695 1318.0 XGBoost
4 0 0.573061 0.554065 0.563403 1267.0 LightGBM
Data for precision:
index precision model
0 0 0.553138 Random Forest
1 1 0.591371 Random Forest
2 0 0.556275 XGBoost
3 1 0.605356 XGBoost
4 0 0.573061 LightGBM
5 1 0.609497 LightGBM
Data for recall: index recall model 0 0 0.521705 Random Forest 1 1 0.707132 Random Forest 2 0 0.542226 XGBoost 3 1 0.651745 XGBoost 4 0 0.554065 LightGBM 5 1 0.652504 LightGBM
Data for f1-score: index f1-score model 0 0 0.536962 Random Forest 1 1 0.644091 Random Forest 2 0 0.549161 XGBoost 3 1 0.627695 XGBoost 4 0 0.563403 LightGBM 5 1 0.630267 LightGBM
ASSOCIATION RULES¶
In [6]:
# ============================== ASSOCIATION RULES ==============================
print()
print()
print('---- ASSOCIATION RULES ----')
# Binarize numeric columns
df_asso['high_favorites'] = df_asso['favorites_per_day'] > df_asso['favorites_per_day'].mean()
df_asso['high_retweets'] = df_asso['retweets_per_day'] > df_asso['retweets_per_day'].mean()
df_asso['high_tweets'] = df_asso['tweets_per_day'] > df_asso['tweets_per_day'].mean()
# Binarize year columns (profile_created_year and tweet_created_year)
# Example: Set threshold year as 2015
df_asso['profile_recent'] = df_asso['profile_created_year'] >= 2015
df_asso['tweet_recent'] = df_asso['tweet_created_year'] >= 2015
# Select only the binary columns
df_apriori = df_asso[['high_favorites', 'high_retweets', 'high_tweets',
'profile_recent', 'tweet_recent',
'tweet_location_encoded', 'user_timezone_encoded']]
# Convert all columns to int (0 or 1)
df_apriori = df_apriori.astype(int)
# Apply Apriori
frequent_itemsets = apriori(df_apriori, min_support=0.05, use_colnames=True)
# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Display the rules
print(rules)
top_frequent_itemsets = frequent_itemsets.nlargest(10, 'support')
plt.figure(figsize=(10, 6))
sns.barplot(x='support', y='itemsets', data=top_frequent_itemsets)
plt.title('Top 10 Frequent Itemsets by Support')
plt.xlabel('Support')
plt.ylabel('Itemsets')
plt.show()
# ---------------------------
# Visualization 2: Scatter Plot of Association Rules by Confidence and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.scatterplot(x='confidence', y='lift', size='support', data=rules, hue='antecedents', palette='viridis', sizes=(40, 200))
plt.title('Association Rules: Confidence vs Lift')
plt.xlabel('Confidence')
plt.ylabel('Lift')
plt.legend(title='Antecedents', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
# ---------------------------
# Visualization 3: Heatmap of Support, Confidence, and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.heatmap(rules[['support', 'confidence', 'lift']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between Support, Confidence, and Lift')
plt.show()
---- ASSOCIATION RULES ----
antecedents consequents \
0 (high_tweets) (high_favorites)
1 (high_favorites) (high_tweets)
2 (tweet_recent) (high_favorites)
3 (high_favorites) (tweet_recent)
4 (tweet_recent) (high_tweets)
5 (high_tweets) (tweet_recent)
6 (tweet_recent) (profile_recent)
7 (profile_recent) (tweet_recent)
8 (tweet_recent, high_tweets) (high_favorites)
9 (high_favorites, high_tweets) (tweet_recent)
10 (tweet_recent, high_favorites) (high_tweets)
11 (high_tweets) (tweet_recent, high_favorites)
12 (tweet_recent) (high_favorites, high_tweets)
13 (high_favorites) (tweet_recent, high_tweets)
antecedent support consequent support support confidence lift \
0 0.271767 0.210607 0.066097 0.243212 1.15481
1 0.210607 0.271767 0.066097 0.313839 1.15481
2 1.000000 0.210607 0.210607 0.210607 1.00000
3 0.210607 1.000000 0.210607 1.000000 1.00000
4 1.000000 0.271767 0.271767 0.271767 1.00000
5 0.271767 1.000000 0.271767 1.000000 1.00000
6 1.000000 0.175568 0.175568 0.175568 1.00000
7 0.175568 1.000000 0.175568 1.000000 1.00000
8 0.271767 0.210607 0.066097 0.243212 1.15481
9 0.066097 1.000000 0.066097 1.000000 1.00000
10 0.210607 0.271767 0.066097 0.313839 1.15481
11 0.271767 0.210607 0.066097 0.243212 1.15481
12 1.000000 0.066097 0.066097 0.066097 1.00000
13 0.210607 0.271767 0.066097 0.313839 1.15481
leverage conviction zhangs_metric
0 0.008861 1.043082 0.184085
1 0.008861 1.061316 0.169823
2 0.000000 1.000000 0.000000
3 0.000000 inf 0.000000
4 0.000000 1.000000 0.000000
5 0.000000 inf 0.000000
6 0.000000 1.000000 0.000000
7 0.000000 inf 0.000000
8 0.008861 1.043082 0.184085
9 0.000000 inf 0.000000
10 0.008861 1.061316 0.169823
11 0.008861 1.043082 0.184085
12 0.000000 1.000000 0.000000
13 0.008861 1.061316 0.169823
C:\Users\Owner\uowMaster\subject\946\venv_bda\lib\site-packages\mlxtend\frequent_patterns\fpcommon.py:109: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type warnings.warn(
AMENDMENT¶
In [9]:
# ============================== AMENDMENT ==============================
print()
print()
print('---- AMENDMENT ----')
mistaken_index = misclassified_df_reg.index.union(misclassified_df_lin_reg.index)
df_truth = df_preprocessed.copy()
df_mistaken = df_preprocessed.loc[mistaken_index].copy()
df_amended = df_mistaken.copy()
vectorized_features = [col for col in df_truth.columns if col.startswith('desc_') or col.startswith('text_')]
df_truth_vectors = df_truth[vectorized_features]
df_mistaken_vectors = df_mistaken[vectorized_features]
similarities = cosine_similarity(df_mistaken_vectors, df_truth_vectors)
best_matches_indices = similarities.argmax(axis=1)
df_amended['gender'] = df_truth.loc[best_matches_indices, 'gender'].values
## Comparative Analysis
# Calculate the number of changes made
num_changes = (df_amended['gender'] != df_mistaken['gender']).sum()
# Calculate the percentage of records amended
percent_amended = (num_changes / len(df_amended)) * 100
## Impact on Statistics
# Function to calculate gender distribution
def gender_distribution(df):
return df['gender'].value_counts(normalize=True) * 100
# Calculate gender distributions
original_dist = gender_distribution(df_mistaken)
amended_dist = gender_distribution(df_amended)
# Calculate the difference in distributions
dist_difference = amended_dist - original_dist
## Summary Report
print("Amendment Summary Report")
print("=======================")
print(f"Total records processed: {len(df_amended)}")
print(f"Number of records amended: {num_changes}")
print(f"Percentage of records amended: {percent_amended:.2f}%")
print("\nGender Distribution (%):")
print("------------------------")
print("Category Mistaken Amended")
for category in original_dist.index:
print(f"{category:<12} {original_dist.get(category, 0):.2f} {amended_dist.get(category, 0):.2f}")
print("\nDistribution Changes:")
print("---------------------")
for category in dist_difference.index:
print(f"{category}: {dist_difference[category]:+.2f}%")
## Create a figure with subplots
fig, axs = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle("Comparative Analysis of Gender Amendment", fontsize=20)
## 1. Bar plot: Gender Distribution Comparison
axs[0, 0].bar(original_dist.index, original_dist.values, alpha=0.5, label='Original')
axs[0, 0].bar(amended_dist.index, amended_dist.values, alpha=0.5, label='Amended')
axs[0, 0].set_title("Gender Distribution Comparison")
axs[0, 0].set_ylabel("Percentage")
axs[0, 0].legend()
## 2. Pie charts: Before and After Amendment
def plot_pie(ax, data, title):
ax.pie(data.values, labels=data.index, autopct='%1.1f%%', startangle=90)
ax.set_title(title)
plot_pie(axs[0, 1], original_dist, "Gender Distribution Before Amendment")
plot_pie(axs[1, 0], amended_dist, "Gender Distribution After Amendment")
## 3. Heatmap: Confusion Matrix
cm = confusion_matrix(df_mistaken['gender'], df_amended['gender'], labels=df_mistaken['gender'].unique())
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=df_mistaken['gender'].unique(),
yticklabels=df_mistaken['gender'].unique(), ax=axs[1, 1])
axs[1, 1].set_title("Confusion Matrix: After vs Before")
axs[1, 1].set_xlabel("After")
axs[1, 1].set_ylabel("Before")
## Adjust layout and save
plt.tight_layout()
plt.savefig('gender_amendment_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()
print("Visualizations have been saved as 'gender_amendment_analysis.png'")
---- AMENDMENT ---- Amendment Summary Report ======================= Total records processed: 3682 Number of records amended: 92 Percentage of records amended: 2.50% Gender Distribution (%): ------------------------ Category Mistaken Amended 1 35.99 34.95 2 33.49 34.71 0 30.53 30.34 Distribution Changes: --------------------- 1: -1.03% 2: +1.22% 0: -0.19%
Visualizations have been saved as 'gender_amendment_analysis.png'