import sys
import os
import subprocess
import nltk
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.metrics import silhouette_score, davies_bouldin_score
#from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.metrics import mean_squared_error
# Load the dataset
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')
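# Note: ISO-8859-1 reads every byte without decode errors; some tweets still
# show mojibake (e.g. 'ÛÏ'), suggesting the source bytes used another encoding.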
# Quick view of the dataset
print('The information of the dataset')
print(df.info())
print('The first few rows of the dataset')
print(df.head())
all_features = df.columns
# Find features with a lot of missing data and drop those with >= 90% missing
def find_columns_with_missing(data, columns):
    missing = []
    for col in columns:
        n_missing = data[col].isnull().sum()
        missing.append(n_missing)
        print(f'{col} has {n_missing} values missing')
        print(f'the proportion of missing data to the total is {n_missing / len(data)}')
        if n_missing / len(data) >= 0.9:
            print(f'The feature to be dropped is {col}')
            data = data.drop(columns=col)
    return missing, data
missing_col, df_cleaned = find_columns_with_missing(df, all_features)
print(missing_col)
print('The information of the cleaned dataset')
print(df_cleaned.info())
print('The first few rows of the cleaned dataset')
print(df_cleaned.head())
# Dropping rows where 'gender' is missing
df_cleaned = df_cleaned.dropna(subset=['gender'])
# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])
# Now that the missing data is handled, we can proceed with further analysis
print('The information of the cleaned dataset')
print(df_cleaned.info())
print('The first few rows of the cleaned dataset')
print(df_cleaned.head())
# Exploratory Data Analysis (EDA)
current_num_features = df_cleaned.select_dtypes(include=[np.number]).columns
# Plot distribution of each numerical feature with gender as hue using seaborn
for feature in current_num_features:
    plt.figure(figsize=(8, 6))
    sns.histplot(df_cleaned, x=feature, hue='gender', bins=30, kde=True)
    plt.title(f'Distribution of {feature} by Gender')
    plt.show()
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()
# Plot distribution of 'tweet_count' and 'retweet_count'
for column in ['tweet_count', 'retweet_count']:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df_cleaned, x=column, kde=True, bins=30)
    plt.title(f'Distribution of {column.replace("_", " ").capitalize()}')
    plt.show()
# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
# Ensure 'created' and 'tweet_created' are in datetime format
df_cleaned['created'] = pd.to_datetime(df_cleaned['created'], errors='coerce')
df_cleaned['tweet_created'] = pd.to_datetime(df_cleaned['tweet_created'], errors='coerce')
# Extract the year from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = df_cleaned['created'].dt.year
df_cleaned['tweet_created_year'] = df_cleaned['tweet_created'].dt.year
# Account age in days (assuming the dataset is up to date)
df_cleaned['account_age'] = (pd.Timestamp.now() - df_cleaned['created']).dt.days
df_cleaned['tweets_per_day'] = df_cleaned['tweet_count'] / df_cleaned['account_age']
df_cleaned['retweets_per_day'] = df_cleaned['retweet_count'] / df_cleaned['account_age']
df_cleaned['favorites_per_day'] = df_cleaned['fav_number'] / df_cleaned['account_age']
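# Optional guard (a minimal sketch, not in the original): account_age can be NaN
# (coerced dates) or 0 for brand-new accounts, which makes the per-day rates NaN or inf.
rate_cols = ['tweets_per_day', 'retweets_per_day', 'favorites_per_day']
df_cleaned[rate_cols] = df_cleaned[rate_cols].replace([np.inf, -np.inf], np.nan)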
# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('count')
plt.show()
# Plotting the histogram of tweets per day
plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['tweets_per_day'], bins=50, kde=True)
plt.title('Distribution of Tweets Per Day')
plt.xlabel('Tweets Per Day')
plt.ylabel('Frequency')
plt.show()
#show the relationship between account age and tweets per day
plt.figure(figsize=(10, 6))
sns.scatterplot(x='account_age', y='tweets_per_day', data=df_cleaned)
plt.title('Account Age vs. Tweets Per Day')
plt.xlabel('Account Age (Days)')
plt.ylabel('Tweets Per Day')
plt.show()
# Exploring 'link_color' and 'sidebar_color' features
#Check the number of NaN values in 'link_color' and 'sidebar_color' features
link_color_nan_count = df_cleaned['link_color'].isnull().sum()
sidebar_color_nan_count = df_cleaned['sidebar_color'].isnull().sum()
print(f"Number of NaN values in 'link_color': {link_color_nan_count}")
print(f"Number of NaN values in 'sidebar_color': {sidebar_color_nan_count}")
#Check how many distinct colors appear in 'link_color' and 'sidebar_color'
link_color_count = df_cleaned['link_color'].nunique()
sidebar_color_count = df_cleaned['sidebar_color'].nunique()
print(f'the number of unique link colors is {link_color_count}')
print(f'the number of unique sidebar colors is {sidebar_color_count}')
# Normalize hex color strings and fall back to black for malformed values
df_cleaned['link_color'] = df_cleaned['link_color'].apply(
    lambda x: f'#{x}' if isinstance(x, str) and len(x) == 6 else '#000000')
df_cleaned['sidebar_color'] = df_cleaned['sidebar_color'].apply(
    lambda x: f'#{x}' if isinstance(x, str) and len(x) == 6 else '#000000')
# Drop rows where 'link_color' or 'sidebar_color' is still NaN
df_cleaned = df_cleaned.dropna(subset=['link_color'])
df_cleaned = df_cleaned.dropna(subset=['sidebar_color'])
print(f"Number of NaN values in 'link_color': {df_cleaned['link_color'].isnull().sum()}")
print(f"Number of NaN values in 'sidebar_color': {df_cleaned['sidebar_color'].isnull().sum()}")
#top 15 colors
top_sidebar_colors = df_cleaned['sidebar_color'].value_counts().iloc[:15].index.tolist()
top_link_colors = df_cleaned['link_color'].value_counts().iloc[:15].index.tolist()
#print(top_sidebar_colors)
# Extract top 15 most common sidebar colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=top_sidebar_colors, palette=top_sidebar_colors)
plt.title('Top 15 Most Common Profile sidebar_color')
plt.ylabel('Sidebar Color')
plt.xlabel('count')
plt.grid()
plt.show()
# Extract top 15 most common link colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=top_link_colors, palette=top_link_colors)
plt.title('Top 15 Most Common Profile link_color')
plt.ylabel('Link Color')
plt.xlabel('count')
plt.grid()
plt.show()
# count plot for sidebar_color vs. gender
plt.figure(figsize=(10, 6))
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.countplot(x='sidebar_color', hue='gender', data=df_cleaned, order=top_sidebar_colors)
plt.title('Top 15 Most Common Sidebar Colors by Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()
# count plot for link_color vs. gender
plt.figure(figsize=(10, 6))
sns.countplot(x='link_color', hue='gender', data=df_cleaned, order=top_link_colors)
plt.title('Top 15 Most Common Link Colors by Gender')
plt.xlabel('Link Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()
# Scatter plot for link_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='link_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['link_color'].isin(top_link_colors)],
palette='Set2', s=100, alpha=0.7)
plt.title('Link Colors vs. Tweet count with Gender')
plt.xlabel('Link Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()
# Scatter plot for sidebar_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sidebar_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['sidebar_color'].isin(top_sidebar_colors)],
palette='Set2', s=100, alpha=0.7)
plt.title('Sidebar Colors vs. Tweet count with Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()
# Select columns to be used
col = ['gender', 'gender:confidence', 'description', 'favorites_per_day',
       'link_color', 'retweets_per_day', 'sidebar_color', 'text',
       'tweets_per_day', 'user_timezone', 'tweet_location',
       'profile_created_year', 'tweet_created_year']
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Plot correlation matrix
corr_matrix = df_preprocessed.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
# Drop one feature from highly correlated pairs (correlation > 0.9)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
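# The k=1 upper-triangle mask keeps each correlated pair exactly once, so only
# the later column of any pair with correlation > 0.9 lands in to_drop.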
df_preprocessed = df_preprocessed.drop(columns=to_drop)
# Fill missing values for important categorical features
df_preprocessed['user_timezone'] = df_preprocessed['user_timezone'].fillna('Unknown')
df_preprocessed['tweet_location'] = df_preprocessed['tweet_location'].fillna('Unknown')
categorical_features = ['user_timezone', 'tweet_location']
# Categorize features by type
# Numerical features
df_num = df_preprocessed[['retweets_per_day', 'favorites_per_day', 'tweets_per_day', 'profile_created_year', 'tweet_created_year']].copy()
#categorical features with frequency encoding
freq_encoding_location = df_preprocessed['tweet_location'].value_counts(normalize=True)
df_preprocessed['tweet_location_encoded'] = df_preprocessed['tweet_location'].map(freq_encoding_location)
freq_encoding_timezone = df_preprocessed['user_timezone'].value_counts(normalize=True)
df_preprocessed['user_timezone_encoded'] = df_preprocessed['user_timezone'].map(freq_encoding_timezone)
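# Note (assumption, not in the original): value_counts(normalize=True) is fit on
# this frame only. If the same encoding were applied to new data, unseen
# categories would map to NaN, e.g.:
# new_df['tweet_location_encoded'] = new_df['tweet_location'].map(freq_encoding_location).fillna(0)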
df_cate = df_preprocessed[['tweet_location_encoded', 'user_timezone_encoded']].copy()
#gender features
#encode the 'gender' column to numeric values
df_preprocessed['gender'] = df_preprocessed['gender'].replace({'male': 0, 'female': 1, 'brand': 2})
# Check for unique values in the 'gender' column after replacement
print(df_preprocessed['gender'].unique())
print(df_preprocessed.info())
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()
df_gender = df_preprocessed[['gender', 'gender:confidence']].copy()
# Drop the original categorical columns
df_preprocessed = df_preprocessed.drop(columns=categorical_features)
# Function to convert a '#RRGGBB' hex string to an [R, G, B] list
def hex_to_rgb(hex_color):
    # Remove the '#' if it exists
    hex_color = hex_color.lstrip('#')
    # Convert each two-digit hex pair into an integer RGB component
    return [int(hex_color[i:i+2], 16) for i in (0, 2, 4)]
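# Example: hex_to_rgb('#C0DEED') -> [192, 222, 237]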
# Convert 'link_color' values
df_preprocessed['link_color_rgb'] = df_preprocessed['link_color'].apply(
    lambda x: hex_to_rgb(x) if isinstance(x, str) else [0, 0, 0])
# Convert 'sidebar_color' values
df_preprocessed['sidebar_color_rgb'] = df_preprocessed['sidebar_color'].apply(
    lambda x: hex_to_rgb(x) if isinstance(x, str) else [0, 0, 0])
rgb_df = pd.DataFrame(df_preprocessed['link_color_rgb'].to_list(), columns=['link_R', 'link_G', 'link_B'])
rgb_df = pd.concat([rgb_df, pd.DataFrame(df_preprocessed['sidebar_color_rgb'].to_list(), columns=['sidebar_R', 'sidebar_G', 'sidebar_B'])], axis=1)
#Drop the original color features
df_preprocessed = df_preprocessed.drop(columns=['link_color', 'sidebar_color', 'link_color_rgb', 'sidebar_color_rgb'])
# Keep the preprocessed gender confidence so it can be used in the regression task
preprocessed_gender_conf = df_preprocessed["gender:confidence"].copy()
#Check if all required features are there
print(f'All features that will be used are {df_preprocessed.columns.tolist()}')
# Define the numerical features to scale (filtering for int64 and float64 columns)
numerical_features = df_preprocessed.select_dtypes(include=[np.number])
#print(f'All current numerical features are {numerical_features.columns.tolist()}')
print('After all, here is the information of the dataset')
print(df_preprocessed.info())
# NLP Processing
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
df_preprocessed['description'] = df_preprocessed['description'].fillna('')
df_preprocessed['text'] = df_preprocessed['text'].fillna('')
#df_preprocessed['name'] = df_preprocessed['name'].fillna('')
# Check whether the text features still contain NaN
print(df_preprocessed.select_dtypes(include=[object]))
# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    # Remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    return ' '.join(tokens)
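# Example: preprocess_text('The cats are running!') -> 'cat running'
# ('the'/'are' are stopwords; 'cats' lemmatizes to 'cat')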
# Apply preprocessing to the 'description', 'text', and 'name' columns
df_preprocessed['cleaned_description'] = df_preprocessed['description'].apply(lambda x: preprocess_text(str(x)))
df_preprocessed['cleaned_text'] = df_preprocessed['text'].apply(lambda x: preprocess_text(str(x)))
#df_preprocessed['cleaned_name'] = df_preprocessed['name'].apply(lambda x: preprocess_text(str(x)))
# Check the preprocessed data with preprocessed text features
print(df_preprocessed[['description', 'cleaned_description', 'text', 'cleaned_text']].head())
#Drop the original text features
df_preprocessed = df_preprocessed.drop(columns=['description','text'])
#Check the preprocessed dataset in the present
print('The current information of the pre-processed dataset before TF-IDF vectorization')
print(df_preprocessed.info())
# Initialize TFIDF vectorizer for text features
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
# Apply TF-IDF to the cleaned 'description' and 'text' columns
tfidf_description = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_description']).toarray()
tfidf_text = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_text']).toarray()
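# Note: each fit_transform call learns its own vocabulary, so desc_i and text_i
# index different word lists. A sketch with separate vectorizers (hypothetical
# names) would keep both fitted objects for later vocabulary lookups:
# desc_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
# text_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')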
#tfidf_name = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_name']).toarray()
# Convert TF-IDF into DataFrames and add to df_preprocessed
tfidf_desc_df = pd.DataFrame(tfidf_description, columns=[f'desc_{i}' for i in range(tfidf_description.shape[1])])
tfidf_text_df = pd.DataFrame(tfidf_text, columns=[f'text_{i}' for i in range(tfidf_text.shape[1])])
#tfidf_name_df = pd.DataFrame(tfidf_name, columns=[f'name_{i}' for i in range(tfidf_name.shape[1])])
# Merge with main dataframe
df_preprocessed = pd.concat([df_preprocessed.reset_index(drop=True), tfidf_desc_df, tfidf_text_df], axis=1)
#Drop the cleaned text features
df_preprocessed = df_preprocessed.drop(columns=['cleaned_description', 'cleaned_text'])
df_preprocessed = pd.concat([df_preprocessed, rgb_df], axis=1)
print(df_preprocessed.head())
[Output: raw dataset — 20050 rows x 26 columns. Missing-data report: gender_gold (99.75% missing), profile_yn_gold (99.75%), and tweet_coord (99.2%) exceed the 90% threshold and are dropped; description (18.7%), tweet_location (37.3%), and user_timezone (38.9%) are retained. After also dropping rows with missing 'gender' and the 'profile_yn' column, the cleaned dataset has 19953 rows x 22 columns.]
[Output: pandas UserWarning — datetime format could not be inferred, so elements were parsed individually via dateutil; specifying an explicit format would silence this.]
[Output: no NaN values in 'link_color' or 'sidebar_color'; 2986 unique link colors and 559 unique sidebar colors.]
[Output: gender now encoded as [0 1 2]; df_preprocessed has 18836 rows x 15 columns after removing 'unknown' gender rows.]
[Output: final feature list — gender, gender:confidence, description, favorites_per_day, retweets_per_day, text, tweets_per_day, profile_created_year, tweet_created_year, tweet_location_encoded, user_timezone_encoded; 18836 rows x 11 columns.]
[Output: example cleaned text — description "i sing my own rhythm." -> "sing rhythm"; tweet "Watching Neighbours on Sky+ ..." -> "watching neighbour sky catching neighbs xxx"; after TF-IDF and the RGB merge the frame has 3021 columns.]
[Output: df_preprocessed.head() — 5 rows x 3021 columns: gender, gender:confidence, per-day rates, year features, encoded location/timezone, desc_*/text_* TF-IDF columns, and link/sidebar RGB channels; the duplicated concat produced a second copy of the six RGB columns.]
## Complete py code
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
#finish preprocessing for regression
df_preprocessed_reg = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed_reg = df_preprocessed_reg.drop(['gender', "gender:confidence"], axis=1)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed_reg, y, test_size=0.6, random_state=42)
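# Note: test_size=0.6 holds out 60% of the rows, so only 40% is used for training.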
boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model
boosted_reg.fit(X_train, y_train)
# Make predictions
y_pred = boosted_reg.predict(X_test)
y_pred_train = boosted_reg.predict(X_train)
y_tot_pred = boosted_reg.predict(df_preprocessed_reg)
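# Note: y_tot_pred spans the full dataset, training rows included, so mse_total
# below mixes in-sample and out-of-sample error.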
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error train: {mse_train}")
print(f"Mean Squared Error test: {mse_test}")
print(f"Mean Squared Error total: {mse_total}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.title('Mean Squared Error for Train, Test, and Total for boosted regression tree with vectorized text / desc features')
plt.xlabel('Dataset')
plt.ylabel('Mean Squared Error')
plt.show()
#FEATURE IMPORTANCE
# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('text_')]
# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]
# Output the results
print("desc_ column indices:", desc_columns)
print("text_ column indices:", text_columns)
print("desc_ array:\n", desc_array)
print("text_ array:\n", text_array)
# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])
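# Equivalent aggregation via pandas (a sketch under the same names):
# imp = pd.Series(boosted_reg.feature_importances_, index=df_preprocessed_reg.columns)
# desc_sum = imp.filter(regex='^desc_').sum()
# text_sum = imp.filter(regex='^text_').sum()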
# Create a new DataFrame
new_data = {}
# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed_reg.columns)) if i not in desc_columns and i not in text_columns]
for i in other_columns:
col_name = df_preprocessed_reg.columns[i]
new_data[col_name] = [boosted_reg.feature_importances_[i]]
# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)
# Output the results
print(feature_importance)
#Plot feature importance
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')
df_melted = df_melted.sort_values(ascending=False, by="Importance in percentage")
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')
plt.title('Feature Importances for boosted regression tree with vectorized text / desc features')
plt.show()
#preprocess dataset for plots with regression results
df_preprocessed_diff = df_preprocessed_reg.copy()
df_preprocessed_diff['difference'] = (y.to_numpy() - y_tot_pred)
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset
# Filter rows whose gender:confidence may have been mislabeled
misclassified_df_reg = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
misclassified_df = misclassified_df_reg.copy()
train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
non_train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]
# Plot the potentially mislabeled points, split by train membership
def scatterplot_mistaken_points(misclassified_df, X_train):
    # Flag whether each row was part of the training split (work on a copy)
    misclassified_df = misclassified_df.copy()
    misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
    # Create subsets for the two plots
    df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
    df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
    # Set up the matplotlib figure with subplots
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    # Plot 1: Points in X_train
    sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
    axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
                 [df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
    axes[0].set_xlabel('Dataset Gender Confidence')
    axes[0].set_ylabel('Predicted Gender Confidence')
    axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')
    # Plot 2: Points not in X_train
    sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
    axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
                 [df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
    axes[1].set_xlabel('Dataset Gender Confidence')
    axes[1].set_ylabel('Predicted Gender Confidence')
    axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')
    plt.tight_layout()
    plt.show()
# Scatter plot of predicted vs. dataset gender confidence
def scatter_plot(y, y_tot_pred, model):
    plt.figure(figsize=(8, 6))
    plt.scatter(y, y_tot_pred, alpha=0.5)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
    plt.xlabel('Dataset Gender Confidence')
    plt.ylabel('Predicted Gender Confidence')
    plt.title('Predicted gender confidence vs. dataset gender confidence ' + model)
    plt.show()
scatterplot_mistaken_points(misclassified_df, X_train)
scatter_plot(y, y_tot_pred, "for boosted regression tree with vectorized text / desc features")
#==============================analyze without text features=============================================
columns_to_drop = [col for col in df_preprocessed_reg.columns if col.startswith(('desc_', 'text_'))]
df_preprocessed_non_text = df_preprocessed_reg.drop(columns=columns_to_drop)
print(df_preprocessed_non_text)
boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.6, random_state=42)
# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)
y_pred_train = boosted_reg_non_text.predict(X_train_non_text)
y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test_non_text, y_pred)
mse_train = mean_squared_error(y_train_non_text, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error train: {mse_train}")
print(f"Mean Squared Error test: {mse_test}")
print(f"Mean Squared Error total: {mse_total}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.title('Mean Squared Error for Train, Test, and Total for boosted regression tree without vectorized text / desc features')
plt.xlabel('Dataset')
plt.ylabel('Mean Squared Error')
plt.show()
# Get feature importances and plot from the model
feature_importances = boosted_reg_non_text.feature_importances_
column_names = X_train_non_text.columns
feature_importance_df = pd.DataFrame({
'Feature': column_names,
'Importance in percentage': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importances for boosted regression tree without vectorized text / desc features ')
plt.show()
# Add the predicted and dataset gender confidence for inspection
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset
# Inspect rows whose labels could be suspicious
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred
misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]
train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
non_train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]
scatterplot_mistaken_points(misclassified_df, X_train_non_text)
scatter_plot(y, y_tot_pred, "for boosted regression tree without vectorized text / desc features")
#====================================Analyzing with a linear regression (Least Squares Implementation)====================
X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed_reg)
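# sm.add_constant prepends a 'const' column so each OLS fit includes an intercept.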
model = sm.OLS(y_train, X_train_lin) # Ordinary least squares (unregularized)
results = model.fit()
#run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)
# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)
print(f"Mean Squared Error train: {mse_train}")
print(f"Mean Squared Error test: {mse_test}")
print(f"Mean Squared Error total: {mse_total}")
# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.title('Mean Squared Error for Train, Test, and Total for linear regression with vectorized text / desc features')
plt.xlabel('Dataset')
plt.ylabel('Mean Squared Error')
plt.show()
# Final preprocessing of the results frame (y already has a reset index)
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred
# Identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
non_train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]
scatter_plot(y, y_lin_tot_pred, "for linear regression with vectorized text / description features")
# Same train/non-train breakdown as before, reusing the helper defined above
scatterplot_mistaken_points(misclassified_df, X_train_lin)
#================================Identify final mistaken samples====================================
common_samples = misclassified_df_reg.index.intersection(misclassified_df.index)
common_df = misclassified_df.loc[common_samples]
scatterplot_mistaken_points(common_df, X_train_lin)
[Output: Mean Squared Error — train 0.02651, test 0.02893, total 0.02796.]
[Output: desc_ columns occupy indices 7-1506 and text_ columns 1507-3006; most individual TF-IDF importances are ~0. Aggregated importances: desc 0.302, text 0.355, favorites_per_day 0.022, retweets_per_day 0.0004, tweets_per_day 0.119, profile_created_year 0.154, tweet_created_year 0.0, tweet_location_encoded 0.010, user_timezone_encoded 0.008, link_R ~0, link_G 0.011, link_B 0.0004, sidebar_R 0.0003, sidebar_G 0.008, sidebar_B 0.003.]
(preview of the 19-column preprocessed frame elided: favorites_per_day, retweets_per_day, tweets_per_day, profile_created_year, tweet_created_year, tweet_location_encoded, user_timezone_encoded, and the link/sidebar RGB channels) [18836 rows x 19 columns]
Mean Squared Error train: 0.027405575319738522
Mean Squared Error test: 0.029051941898659762
Mean Squared Error total: 0.027960907302519423
Mean Squared Error train: 0.016637954003891314
Mean Squared Error test: 0.04996836851802664
Mean Squared Error total: 0.03663691051476185
df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.01) & (df_preprocessed_lin["gender_confidence_pred"] < 0.5)]
favorites_per_day | retweets_per_day | tweets_per_day | profile_created_year | tweet_created_year | tweet_location_encoded | user_timezone_encoded | desc_0 | desc_1 | desc_2 | ... | sidebar_G | sidebar_B | link_R | link_G | link_B | sidebar_R | sidebar_G | sidebar_B | difference | gender_confidence_pred | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
50 | 0.003124 | 0.000000 | 37.098412 | 2014 | 2015 | 0.000053 | 0.009397 | 0.0 | 0.0 | 0.0 | ... | 222 | 237 | 0 | 132 | 180 | 192 | 222 | 237 | 0.633508 | 0.366492 |
795 | 0.257622 | 0.000000 | 4.290771 | 2011 | 2015 | 0.000053 | 0.001699 | 0.0 | 0.0 | 0.0 | ... | 255 | 255 | 35 | 8 | 143 | 255 | 255 | 255 | 0.594003 | 0.405997 |
980 | 4.412043 | 0.000215 | 13.829892 | 2011 | 2015 | 0.000053 | 0.002230 | 0.0 | 0.0 | 0.0 | ... | 255 | 255 | 87 | 21 | 21 | 255 | 255 | 255 | 0.610040 | 0.389960 |
1189 | 0.091177 | 0.000000 | 0.173798 | 2015 | 2015 | 0.000053 | 0.127309 | 0.0 | 0.0 | 0.0 | ... | 157 | 94 | 208 | 43 | 85 | 130 | 157 | 94 | 0.505995 | 0.494005 |
1330 | 0.048406 | 0.000000 | 4.700256 | 2012 | 2015 | 0.000053 | 0.065831 | 0.0 | 0.0 | 0.0 | ... | 238 | 238 | 221 | 46 | 68 | 238 | 238 | 238 | 0.637793 | 0.362207 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
18139 | 0.000000 | 0.000000 | 23.618518 | 2013 | 2015 | 0.363294 | 0.000319 | 0.0 | 0.0 | 0.0 | ... | 222 | 237 | 0 | 132 | 180 | 192 | 222 | 237 | 0.532139 | 0.467861 |
18229 | 0.000604 | 0.000000 | 33.945350 | 2015 | 2015 | 0.000106 | 0.105755 | 0.0 | 0.0 | 0.0 | ... | 222 | 237 | 0 | 132 | 180 | 192 | 222 | 237 | 0.193740 | 0.471160 |
18230 | 0.000602 | 0.000000 | 32.000602 | 2015 | 2015 | 0.363294 | 0.381344 | 0.0 | 0.0 | 0.0 | ... | 222 | 237 | 0 | 132 | 180 | 192 | 222 | 237 | 0.210192 | 0.470708 |
18474 | 0.000000 | 0.000000 | 0.240339 | 2015 | 2015 | 0.000053 | 0.381344 | 0.0 | 0.0 | 0.0 | ... | 222 | 237 | 0 | 132 | 180 | 192 | 222 | 237 | 0.537172 | 0.462828 |
18550 | 0.191693 | 0.000000 | 15.421903 | 2009 | 2015 | 0.000053 | 0.024740 | 0.0 | 0.0 | 0.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.551622 | 0.448378 |
98 rows × 3021 columns
Regression Models¶
# Finish preprocessing for regression: the target is the dataset's gender confidence
df_preprocessed = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed = df_preprocessed.drop(['gender', 'gender:confidence'], axis=1)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets (test_size=0.7: only 30% of the rows are used for training)
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed, y, test_size=0.7, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model
boosted_reg.fit(X_train, y_train)
GradientBoostingRegressor(n_estimators=50, random_state=42)
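As an aside, sklearn's gradient boosting also supports built-in early stopping, which is one way to choose n_estimators less arbitrarily. A minimal sketch (the hyperparameter values here are assumptions, not tuned choices):
# Sketch: hold out 10% of the training data internally and stop adding trees
# once the validation loss has not improved for 5 consecutive iterations
boosted_reg_es = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=3,
                                           validation_fraction=0.1, n_iter_no_change=5, random_state=42)
boosted_reg_es.fit(X_train, y_train)
print("Trees actually fitted:", boosted_reg_es.n_estimators_)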
Analysing results¶
from sklearn.metrics import mean_squared_error
# Make predictions
y_pred = boosted_reg.predict(X_test)
# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.028950573206351155
# Predict over the full dataset (train and test) for the later error analysis
y_tot_pred = boosted_reg.predict(df_preprocessed)
mse = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.02811897878896962
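Since MSE is in squared units of the target, RMSE and R² can be easier to read alongside it; a small sketch reusing the test predictions above:
from sklearn.metrics import r2_score
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # error in the same units as gender confidence
r2 = r2_score(y_test, y_pred)                       # fraction of target variance explained
print(f"RMSE: {rmse:.4f}, R^2: {r2:.4f}")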
# Checking feature importance
# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed.columns) if col.startswith('text_')]
# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]
# Output the results
print("desc_ column indices:", desc_columns)
print("text_ column indices:", text_columns)
print("desc_ array:\n", desc_array)
print("text_ array:\n", text_array)
# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])
# Create a new DataFrame
new_data = {}
# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed.columns)) if i not in desc_columns and i not in text_columns]
for i in other_columns:
col_name = df_preprocessed.columns[i]
new_data[col_name] = [boosted_reg.feature_importances_[i]]
# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)
# Output the results
print(feature_importance)
desc_ column indices: [7, 8, 9, ..., 1506] (contiguous; full list elided)
text_ column indices: [1507, 1508, 1509, ..., 3006] (contiguous; full list elided)
desc_ array: [0. 0. 0. ... 0. 0. 0.]
text_ array: [0. 0. 0. ... 0.01125915 0. 0. ]
Feature | Summed importance
---|---
desc | 0.310651
text | 0.362809
favorites_per_day | 0.025296
retweets_per_day | 0.0
tweets_per_day | 0.115279
profile_created_year | 0.130858
tweet_created_year | 0.0
tweet_location_encoded | 0.002709
user_timezone_encoded | 0.009664
link_R | 0.011811
link_G | 0.02111
link_B | 0.000145
sidebar_R | 1.970672e-08
sidebar_G | 0.002433
sidebar_B | 0.000798
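The aggregation above can also be written more compactly by grouping the importances by column-name prefix; a sketch that should be equivalent to the loop-based version:
# Map each column to a group: 'desc', 'text', or the column name itself
imp = pd.Series(boosted_reg.feature_importances_, index=df_preprocessed.columns)
groups = ['desc' if c.startswith('desc_') else 'text' if c.startswith('text_') else c for c in imp.index]
print(imp.groupby(groups).sum().sort_values(ascending=False))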
# Convert DataFrame to a long format suitable for Seaborn
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')
# Create bar plot using Seaborn
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')
# Add a title
plt.title('Feature Importances')
# Show the plot
plt.show()
df_preprocessed_diff = df_preprocessed.copy()
df_preprocessed_diff['abs_difference'] = np.abs(y.to_numpy() - y_tot_pred)
df_preprocessed_diff["abs_difference"].describe()
count    18836.000000
mean         0.138667
std          0.094311
min          0.001569
25%          0.079613
50%          0.094619
75%          0.168580
max          0.600668
Name: abs_difference, dtype: float64
# Find samples where the absolute difference is over 0.2
misclassified_df = df_preprocessed_diff[df_preprocessed_diff["abs_difference"] > 0.2]
# Split the flagged samples by whether their indices fall in the training set
train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
non_train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]
print(train_misclassify["abs_difference"].describe())
print(non_train_misclassify["abs_difference"].describe())
print(misclassified_df["abs_difference"].describe())
count    1160.000000
mean        0.278931
std         0.096486
min         0.200279
25%         0.230187
50%         0.245285
75%         0.262481
max         0.591468
Name: abs_difference, dtype: float64

count    2782.000000
mean        0.288046
std         0.112172
min         0.200085
25%         0.231173
50%         0.246618
75%         0.264480
max         0.600668
Name: abs_difference, dtype: float64

count    3942.000000
mean        0.285364
std         0.107861
min         0.200085
25%         0.230767
50%         0.246089
75%         0.263885
max         0.600668
Name: abs_difference, dtype: float64
Check where the gender confidence in regression is below a threshold¶
Notes¶
Instead of taking the difference, look at every row where the model predicts a much lower gender confidence than the dataset reports.
What we want are the places where the regression confidence is lower than the (non-regression) dataset confidence.
#adding the prediction to the dataset
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred
#adding the dataset gender confidence
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset
print(df_preprocessed_diff["gender_confidence_pred"].describe())
print(y.describe())
count    18836.000000
mean         0.900914
std          0.030007
min          0.649431
25%          0.895038
50%          0.909245
75%          0.920387
max          0.947084
Name: gender_confidence_pred, dtype: float64

count    18836.000000
mean         0.900997
std          0.172980
min          0.314000
25%          0.686475
50%          1.000000
75%          1.000000
max          1.000000
Name: gender:confidence, dtype: float64
df_preprocessed_diff["difference"] = y.to_numpy() - y_tot_pred
df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.15) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
favorites_per_day | retweets_per_day | tweets_per_day | tweet_id | profile_created_year | tweet_created_year | tweet_location_encoded | user_timezone_encoded | desc_0 | desc_1 | ... | text_1494 | text_1495 | text_1496 | text_1497 | text_1498 | text_1499 | abs_difference | gender_confidence_pred | gender:confidence | difference | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
100 | 0.079670 | 0.000000 | 0.269384 | 6.587300e+17 | 2014 | 2015 | 0.363294 | 0.381344 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.163436 | 0.836564 | 1.0 | 0.163436 |
102 | 0.014815 | 0.000000 | 6.075446 | 6.587300e+17 | 2014 | 2015 | 0.000319 | 0.001646 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.153511 | 0.846489 | 1.0 | 0.153511 |
323 | 4.574048 | 0.000000 | 47.035261 | 6.587300e+17 | 2015 | 2015 | 0.000319 | 0.105755 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.233756 | 0.766244 | 1.0 | 0.233756 |
394 | 4.169295 | 0.000000 | 56.391167 | 6.587300e+17 | 2014 | 2015 | 0.000319 | 0.105755 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.162479 | 0.837521 | 1.0 | 0.162479 |
544 | 0.018346 | 0.000000 | 0.279098 | 6.587300e+17 | 2015 | 2015 | 0.000053 | 0.381344 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.151850 | 0.848150 | 1.0 | 0.151850 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
18557 | 0.011814 | 0.000000 | 8.152046 | 6.587400e+17 | 2012 | 2015 | 0.000053 | 0.001911 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.169369 | 0.830631 | 1.0 | 0.169369 |
18593 | 1.414132 | 0.000000 | 1.665921 | 6.587400e+17 | 2012 | 2015 | 0.363294 | 0.030049 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.205650 | 0.794350 | 1.0 | 0.205650 |
18616 | 1.339410 | 0.000000 | 10.553680 | 6.587400e+17 | 2012 | 2015 | 0.363294 | 0.015449 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.179468 | 0.820532 | 1.0 | 0.179468 |
18723 | 0.083287 | 0.000184 | 16.298323 | 6.587400e+17 | 2009 | 2015 | 0.000053 | 0.077033 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.197937 | 0.802063 | 1.0 | 0.197937 |
18825 | 0.061193 | 0.000000 | 0.442944 | 6.587300e+17 | 2012 | 2015 | 0.363294 | 0.381344 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.283758 | 0.716242 | 1.0 | 0.283758 |
667 rows × 3012 columns
misclassified_df = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
non_train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]
print(misclassified_df.info())
print(train_misclassify.info())
print(non_train_misclassify.info())
misclassified_df:        667 rows × 3012 columns (favorites_per_day … difference), dtypes: float64(3010), int32(2), memory usage: 15.3 MB
in the training set:     126 rows × 3012 columns, memory usage: 2.9 MB
outside the training set: 541 rows × 3012 columns, memory usage: 12.4 MB
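A quick sanity check for overfitting is to compare the flag rate inside and outside the 30% training split; a sketch using the frames defined above:
# Flag rates: a much lower rate inside the training set would point to memorization
n_train = len(X_train)
n_rest = len(df_preprocessed_diff) - n_train
print(f"Flag rate in train:     {len(train_misclassify) / n_train:.2%}")
print(f"Flag rate out of train: {len(non_train_misclassify) / n_rest:.2%}")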
# Flag whether each flagged row was in the training set (work on a copy to avoid pandas' SettingWithCopyWarning)
misclassified_df = misclassified_df.copy()
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
[df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset Gender Confidence')
axes[0].set_ylabel('Predicted Gender Confidence')
axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')
# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
[df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset Gender Confidence')
axes[1].set_ylabel('Predicted Gender Confidence')
axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')
# Adjust layout
plt.tight_layout()
# Show the plot
plt.show()
# Check whether there are places where the regression confidence is high but the dataset confidence is very low
df_preprocessed_diff[(df_preprocessed_diff["gender_confidence_pred"] > 0.9) & (df_preprocessed_diff["gender:confidence"] < 0.9)]
favorites_per_day | retweets_per_day | tweets_per_day | tweet_id | profile_created_year | tweet_created_year | tweet_location_encoded | user_timezone_encoded | desc_0 | desc_1 | ... | text_1494 | text_1495 | text_1496 | text_1497 | text_1498 | text_1499 | abs_difference | gender_confidence_pred | gender:confidence | difference | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10 | 0.110045 | 0.0 | 1.104500 | 6.587300e+17 | 2011 | 2015 | 0.003238 | 0.030049 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.224478 | 0.924678 | 0.7002 | -0.224478 |
12 | 0.762325 | 0.0 | 3.072139 | 6.587300e+17 | 2012 | 2015 | 0.363294 | 0.030049 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.258345 | 0.909245 | 0.6509 | -0.258345 |
14 | 3.330464 | 0.0 | 4.416308 | 6.587300e+17 | 2013 | 2015 | 0.000053 | 0.030049 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.259145 | 0.909245 | 0.6501 | -0.259145 |
47 | 0.373984 | 0.0 | 0.464685 | 6.587300e+17 | 2013 | 2015 | 0.000053 | 0.000319 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.254373 | 0.912673 | 0.6583 | -0.254373 |
55 | 0.000000 | 0.0 | 0.839575 | 6.587300e+17 | 2015 | 2015 | 0.000053 | 0.381344 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.236981 | 0.905381 | 0.6684 | -0.236981 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
18802 | 8.853249 | 0.0 | 5.611950 | 6.587300e+17 | 2011 | 2015 | 0.000106 | 0.024740 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.155387 | 0.920387 | 0.7650 | -0.155387 |
18809 | 0.000396 | 0.0 | 0.000990 | 6.587300e+17 | 2010 | 2015 | 0.363294 | 0.381344 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.351268 | 0.927468 | 0.5762 | -0.351268 |
18810 | 0.053395 | 0.0 | 3.333982 | 6.587300e+17 | 2009 | 2015 | 0.000531 | 0.000637 | 0.0 | 0.0 | ... | 0.0 | 0.656343 | 0.0 | 0.0 | 0.0 | 0.0 | 0.362216 | 0.916216 | 0.5540 | -0.362216 |
18824 | 0.000000 | 0.0 | 1.530467 | 6.587400e+17 | 2012 | 2015 | 0.363294 | 0.381344 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.094732 | 0.934432 | 0.8397 | -0.094732 |
18826 | 0.012346 | 0.0 | 0.761481 | 6.587400e+17 | 2013 | 2015 | 0.000425 | 0.077033 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.075179 | 0.922379 | 0.8472 | -0.075179 |
2775 rows × 3012 columns
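For scale, the share of such rows (2775 of 18836 per the output above) can be computed directly; a short sketch:
mask = (df_preprocessed_diff["gender_confidence_pred"] > 0.9) & (df_preprocessed_diff["gender:confidence"] < 0.9)
print(f"{mask.mean():.2%} of rows ({mask.sum()} of {len(mask)})")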
#Plotting the results
plt.figure(figsize=(8, 6))
plt.scatter(y, y_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset Gender Confidence')
plt.ylabel('Predicted Gender Confidence')
plt.title('Predicted gender confidence vs. Dataset gender confidence')
plt.show()
plt.figure(figsize=(10, 6))
sns.histplot(data=df_preprocessed_diff, x='difference', bins=10, kde=False, color='skyblue')
plt.xlabel('Deviation')
plt.ylabel('Frequency')
plt.title('"Histogram of Prediction Errors: Negative Values Indicate dataset gender confidence > predicted gender confidence"')
Text(0.5, 1.0, '"Histogram of Prediction Errors: Negative Values Indicate dataset gender confidence > predicted gender confidence"')
The dataset is strongly biased towards high gender confidence, so the model likely inherits the same bias: it tends to predict a high gender confidence and to treat low gender confidences as outliers. A classification model is therefore better suited to the task.
However, the places where the regression model outputs a significantly lower gender confidence than the dataset (especially where the dataset reports a gender confidence of 1) are of particular interest.
Also worth noting: we only see a large difference where the dataset gender confidence equals 1.
For plots:
- feature importance and Shapley values (see the sketch below)
- some boosted-tree plots? Plot the results and some distributions of the suspicious columns?
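For the Shapley values, a minimal sketch with the shap package (an assumption: shap is installed; this was not run as part of the analysis):
import shap
explainer = shap.TreeExplainer(boosted_reg)             # tree explainer supports GradientBoostingRegressor
shap_values = explainer.shap_values(X_test)             # per-sample, per-feature contributions
shap.summary_plot(shap_values, X_test, max_display=15)  # global importance with direction of effect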
Redo without including text / desc features¶
# Identify columns to drop
columns_to_drop = [col for col in df_preprocessed.columns if col.startswith(('desc_', 'text_'))]
# Drop the identified columns
df_preprocessed_non_text = df_preprocessed.drop(columns=columns_to_drop)
# Output the result
print(df_preprocessed_non_text)
(preview elided: favorites_per_day, retweets_per_day, tweets_per_day, tweet_id, profile_created_year, tweet_created_year, tweet_location_encoded, user_timezone_encoded) [18836 rows x 8 columns]
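Equivalently, the prefix-based drop can be done with pandas' vectorized string matching (a sketch; passing a tuple to str.startswith assumes a reasonably recent pandas):
# Keep only columns whose name does not start with 'desc_' or 'text_'
df_preprocessed_non_text = df_preprocessed.loc[:, ~df_preprocessed.columns.str.startswith(('desc_', 'text_'))]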
boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.7, random_state=42)
# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
GradientBoostingRegressor(n_estimators=50, random_state=42)
from sklearn.metrics import mean_squared_error
# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)
# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test_non_text, y_pred)
print(f"Mean Squared Error: {mse}")
y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)
mse = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.02902462759816569
Mean Squared Error: 0.028510429352024164
# Get feature importances from the model
feature_importances = boosted_reg_non_text.feature_importances_
# Get column names from the non-text training set
column_names = X_train_non_text.columns
# Create a DataFrame with feature importances and corresponding column names
feature_importance_df = pd.DataFrame({
'Feature': column_names,
'Importance in percentage': feature_importances
})
# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)
# Output the DataFrame
print(feature_importance_df)
Feature | Importance in percentage
---|---
tweets_per_day | 0.355385
favorites_per_day | 0.237008
profile_created_year | 0.221512
user_timezone_encoded | 0.120650
tweet_location_encoded | 0.046550
tweet_id | 0.012980
retweets_per_day | 0.005914
tweet_created_year | 0.000000
# Create bar plot using Seaborn (the sorted DataFrame is already in a plottable long format)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')
# Add a title
plt.title('Feature Importances')
# Show the plot
plt.show()
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred
#adding the dataset gender confidence
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset
print(df_preprocessed_non_text["gender_confidence_pred"].describe())
count    18836.000000
mean         0.901057
std          0.036071
min          0.531954
25%          0.884174
50%          0.907729
75%          0.927289
max          0.983469
Name: gender_confidence_pred, dtype: float64
# Inspecting columns that could be suspicious
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred
misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]
train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
non_train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]
print(misclassified_df.info())
print(train_misclassify.info())
print(non_train_misclassify.info())
misclassified_df:        1059 rows × 11 columns (favorites_per_day, retweets_per_day, tweets_per_day, tweet_id, profile_created_year, tweet_created_year, tweet_location_encoded, user_timezone_encoded, gender_confidence_pred, gender:confidence, difference), dtypes: float64(9), int32(2), memory usage: 91.0 KB
in the training set:      256 rows × 11 columns, memory usage: 22.0 KB
outside the training set: 803 rows × 11 columns, memory usage: 69.0 KB
# Flag whether each flagged row was in the training set (work on a copy to avoid pandas' SettingWithCopyWarning)
misclassified_df = misclassified_df.copy()
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train_non_text.index)
# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
[df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset Gender Confidence')
axes[0].set_ylabel('Predicted Gender Confidence')
axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')
# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
[df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset Gender Confidence')
axes[1].set_ylabel('Predicted Gender Confidence')
axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')
# Adjust layout
plt.tight_layout()
# Show the plot
plt.show()
#Plotting the results
plt.figure(figsize=(8, 6))
plt.scatter(y, y_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset Gender Confidence')
plt.ylabel('Predicted Gender Confidence')
plt.title('Predicted gender confidence vs. Dataset gender confidence')
plt.show()
plt.figure(figsize=(10, 6))
sns.histplot(data=df_preprocessed_non_text, x='difference', bins=10, kde=True, color='skyblue')
plt.xlabel('Deviation')
plt.ylabel('Frequency')
plt.title('"Histogram of Prediction Errors: Negative Values Indicate dataset gender confidence > predicted gender confidence"')
Text(0.5, 1.0, '"Histogram of Prediction Errors: Negative Values Indicate dataset gender confidence > predicted gender confidence"')
Using linear regression¶
A simple linear regression technique is now employed.
With roughly 3000 TF-IDF features, regularization would be a sensible safeguard against overfitting; the fit below is plain (unregularized) OLS, and a ridge-regularized variant is sketched after the evaluation.
import statsmodels.api as sm
X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed)
model = sm.OLS(y_train, X_train_lin) # Ordinary least squares (unregularized)
results = model.fit()
#print(results.summary())
#run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_lin_pred)
print(f"Mean Squared Error: {mse}")
mse = mean_squared_error(y, y_lin_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.029905412647208366
Mean Squared Error: 0.02992069210622919
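For the regularized variant mentioned above, an L2 (ridge) penalty is a natural choice given the roughly 3000 sparse TF-IDF columns. A sketch with scikit-learn's Ridge (alpha=1.0 is an assumption, not a tuned value):
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)   # L2 penalty strength; would normally be chosen by cross-validation
ridge.fit(X_train, y_train)
ridge_mse = mean_squared_error(y_test, ridge.predict(X_test))
print(f"Ridge test Mean Squared Error: {ridge_mse}")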
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
non_train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]
print(misclassified_df.info())
print(train_misclassify.info())
print(non_train_misclassify.info())
misclassified_df and both subsets are empty: 0 rows × 3011 columns (favorites_per_day … gender_confidence_pred), dtypes: float64(3009), int32(2).
# Flag whether each flagged row was in the training set (work on a copy to avoid pandas' SettingWithCopyWarning)
misclassified_df = misclassified_df.copy()
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train_lin.index)
# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
[df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset Gender Confidence')
axes[0].set_ylabel('Predicted Gender Confidence')
axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')
# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
[df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset Gender Confidence')
axes[1].set_ylabel('Predicted Gender Confidence')
axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')
# Adjust layout
plt.tight_layout()
# Show the plot
plt.show()
#Plotting the results
plt.figure(figsize=(8, 6))
plt.scatter(y.to_numpy(), y_lin_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset Gender Confidence')
plt.ylabel('Predicted Gender Confidence')
plt.title('Predicted gender confidence vs. Dataset gender confidence')
plt.show()
Without text features¶
# Reuse the non-text split from above; drop the columns added during the error analysis before predicting
X_train_lin = sm.add_constant(X_train_non_text)
X_test_lin = sm.add_constant(X_test_non_text)
df_preprocessed_lin = sm.add_constant(df_preprocessed_non_text.drop(columns=["gender_confidence_pred", "gender:confidence", "difference"]))
model = sm.OLS(y_train_non_text, X_train_lin) # Ordinary least squares (unregularized)
results = model.fit()
# Run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test_non_text, y_lin_pred)
print(f"Mean Squared Error: {mse}")
mse = mean_squared_error(y, y_lin_tot_pred)
print(f"Mean Squared Error: {mse}")