In [41]:
import sys
import os
import subprocess
import nltk
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.metrics import silhouette_score, davies_bouldin_score
#from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.metrics import mean_squared_error
In [54]:
# Load the dataset
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')

# Quick view of the dataset
print('The information of the dataset')
print(df.info())
print('The first few rows of the dataset')
print(df.head())

all_features = df.columns
#Finding features that have a lot of missing data
def find_columns_with_missing(data, columns):
    missing = []
    i = 0
    for col in columns:
        missing.append(data[col].isnull().sum())
        print(f'the {col} has {missing[i]} data missing')
        print(f'the proportion of missing data to the total is {missing[i]/len(data)}')
        if missing[i]/len(data) >= 0.9:
            print(f'The feature to be dropped is {col}')
            data = data.drop(columns=col)
            data_cleaned = data
        i += 1
    return missing, data_cleaned

missing_col, df_cleaned = find_columns_with_missing(df, all_features)
missing_col
print('The information of the cleaned dataset')
print(df_cleaned.info())
print('The first few rows of the cleaned dataset')
print(df_cleaned.head())

# Dropping rows where 'gender' is missing
df_cleaned = df_cleaned.dropna(subset=['gender'])

# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])

# Now that we have handled the missing data, you can proceed with further analysis
print('The information of the cleaned dataset')
print(df_cleaned.info())
print('The first few rows of the cleaned dataset')
print(df_cleaned.head())

# Exploratory Data Analysis (EDA)
current_num_features = df.select_dtypes(include=[np.number])

# Plot distribution of each numerical feature with gender as hue using seaborn
for feature in current_num_features:
    plt.figure(figsize=(8, 6))
    sns.histplot(df_cleaned, x=feature, hue='gender', bins=30, kde=True)
    plt.title(f'Distribution of {feature} by Gender')
    plt.show()

# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()

# Plot distribution of 'tweet_count' and 'retweet_count'
for column in ['tweet_count', 'retweet_count']:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df_cleaned, x=column, kde=True, bins=30)
    plt.title(f'Distribution of {column.replace("_", " ").capitalize()}')
    plt.show()

# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year

# Ensure 'created' and tweet_created are in datetime format
df_cleaned['created'] = pd.to_datetime(df_cleaned['created'], errors='coerce')
df_cleaned['tweet_created'] = pd.to_datetime(df_cleaned['tweet_created'], errors='coerce')

#assuming the data was up-to-date
df_cleaned['account_age'] = (pd.Timestamp.now() - df_cleaned['created']).dt.days

df_cleaned['tweets_per_day'] = df_cleaned['tweet_count'] / df_cleaned['account_age']
df_cleaned['retweets_per_day'] = df_cleaned['retweet_count'] / df_cleaned['account_age']
df_cleaned['favorites_per_day'] = df_cleaned['fav_number'] / df_cleaned['account_age']

# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('count')
plt.show()

# Plotting the histogram of tweets per day
plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['tweets_per_day'], bins=50, kde=True)
plt.title('Distribution of Tweets Per Day')
plt.xlabel('Tweets Per Day')
plt.ylabel('Frequency')
plt.show()

#show the relationship between account age and tweets per day
plt.figure(figsize=(10, 6))
sns.scatterplot(x='account_age', y='tweets_per_day', data=df_cleaned)
plt.title('Account Age vs. Tweets Per Day')
plt.xlabel('Account Age (Days)')
plt.ylabel('Tweets Per Day')
plt.show()

# Exploring 'link_color' and 'sidebar_color' features

#Check number of NaN value in  'link_color' and 'sidebar_color' features
link_color_nan_count = df_cleaned['link_color'].isnull().sum()
sidebar_color_nan_count = df_cleaned['sidebar_color'].isnull().sum()

print(f"Number of NaN values in 'link_color': {link_color_nan_count}")
print(f"Number of NaN values in 'sidebar_color': {sidebar_color_nan_count}")

#Check how many available colors in 'link_color' and 'sidebar_color' features
link_color_count = len(df_cleaned['link_color'].unique())
sidebar_color_count = len(df_cleaned['sidebar_color'].unique())
print(f'the number of link color is {link_color_count}')
print(f'the number of side bar color is {sidebar_color_count}')

# Apply the function to 'link_color' and 'sidebar_color'
df_cleaned['link_color'] = df_cleaned['link_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
df_cleaned['sidebar_color'] = df_cleaned['sidebar_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')

# Drop rows where 'sidebar_color' is still NaN
df_cleaned = df_cleaned.dropna(subset=['link_color'])
df_cleaned = df_cleaned.dropna(subset=['sidebar_color'])
print(f"Number of NaN values in 'link_color': {df_cleaned['link_color'].isnull().sum()}")
print(f"Number of NaN values in 'sidebar_color': {df_cleaned['sidebar_color'].isnull().sum()}")

#top 15 colors
top_sidebar_colors = df_cleaned['sidebar_color'].value_counts().iloc[:15].index.tolist()
top_link_colors = df_cleaned['link_color'].value_counts().iloc[:15].index.tolist()
#print(top_sidebar_colors)

# Extract top 10 most common sidebar colors 
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index, palette=top_sidebar_colors)
plt.title('Top 15 Most Common Profile sidebar_color')
plt.ylabel('Sidebar Color')
plt.xlabel('count')
plt.grid()
plt.show()

# Extract top 10 most common link colors 
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:15].index, palette=top_link_colors)
plt.title('Top 15 Most Common Profile link_color')
plt.ylabel('Link Color')
plt.xlabel('count')
plt.grid()
plt.show()

# count plot for sidebar_color vs. gender
plt.figure(figsize=(10, 6))
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.countplot(x='sidebar_color', hue='gender', data=df_cleaned, 
              order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common Sidebar Colors by Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()

# count plot for link_color vs. gender
plt.figure(figsize=(10, 6))
sns.countplot(x='link_color', hue='gender', data=df_cleaned, 
              order=df_cleaned['link_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common link Colors by Gender')
plt.xlabel('Link Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()

# Scatter plot for link_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='link_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['link_color'].isin(top_link_colors)], 
                palette='Set2', s=100, alpha=0.7)
plt.title('Link Colors vs. Tweet count with Gender')
plt.xlabel('Link Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()

# Scatter plot for sidebar_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sidebar_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['sidebar_color'].isin(top_sidebar_colors)], 
                palette='Set2', s=100, alpha=0.7)
plt.title('Sidebar Colors vs. Tweet count with Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()

# Select columns to be used
col = ['gender', 'gender:confidence', 'description', 'favorites_per_day','link_color',
       'retweets_per_day', 'sidebar_color', 'text', 'tweets_per_day','user_timezone', 'tweet_location', 'profile_created_year', 'tweet_created_year'
       ]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']

# Plot correlation matrix
corr_matrix = df_preprocessed.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

# Drop one feature from highly correlated pairs (correlation > 0.9)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_preprocessed = df_preprocessed.drop(columns=to_drop)

# Filling missing values for important features
df_preprocessed['user_timezone'].fillna('Unknown', inplace=True)
df_preprocessed['tweet_location'].fillna('Unknown', inplace=True)
categorical_features = ['user_timezone', 'tweet_location']

#categorise types of features

#numerical features
df_num = df_preprocessed[['retweets_per_day', 'favorites_per_day', 'tweets_per_day', 'profile_created_year', 'tweet_created_year']].copy()

#categorical features with frequency encoding
freq_encoding_location = df_preprocessed['tweet_location'].value_counts(normalize=True)
df_preprocessed['tweet_location_encoded'] = df_preprocessed['tweet_location'].map(freq_encoding_location)

freq_encoding_timezone = df_preprocessed['user_timezone'].value_counts(normalize=True)
df_preprocessed['user_timezone_encoded'] = df_preprocessed['user_timezone'].map(freq_encoding_timezone)

df_cate = df_preprocessed[['tweet_location_encoded', 'user_timezone_encoded']].copy()

#gender features
#encode the 'gender' column to numeric values
df_preprocessed['gender'] = df_preprocessed['gender'].replace({'male': 0, 'female': 1, 'brand': 2})

# Check for unique values in the 'gender' column after replacement
print(df_preprocessed['gender'].unique())
print(df_preprocessed.info())

# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()

df_gender = df_preprocessed[['gender', 'gender:confidence']].copy()

# Drop the original categorical columns
df_preprocessed = df_preprocessed.drop(columns=categorical_features)

# Function to convert hex to RGB
def hex_to_rgb(hex_color):
    # Remove the '#' if it exists
    hex_color = hex_color.lstrip('#')
    
    # Convert hex to integer and split into RGB components
    return [int(hex_color[i:i+2], 16) for i in (0, 2, 4)]

# Convert 'link_color' values
df_preprocessed['link_color_rgb'] = df_preprocessed['link_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
# Convert 'sidebar_color' values
df_preprocessed['sidebar_color_rgb'] = df_preprocessed['sidebar_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))

rgb_df = pd.DataFrame(df_preprocessed['link_color_rgb'].to_list(), columns=['link_R', 'link_G', 'link_B'])
rgb_df = pd.concat([rgb_df, pd.DataFrame(df_preprocessed['sidebar_color_rgb'].to_list(), columns=['sidebar_R', 'sidebar_G', 'sidebar_B'])], axis=1)

#Drop the original color features
df_preprocessed = df_preprocessed.drop(columns=['link_color', 'sidebar_color', 'link_color_rgb', 'sidebar_color_rgb'])

#keep the gender confidence preprocessed to be able to use it in regression task
preprocessed_gender_conf  = df_preprocessed["gender:confidence"].copy()

#Check if all required features are there
print(f'All features that will be used are {df_preprocessed.columns.tolist()}')

# Define the numerical features to scale (filtering for int64 and float64 columns)
numerical_features = df_preprocessed.select_dtypes(include=[np.number])
#print(f'All current numerical features are {numerical_features.columns.tolist()}')

print('After all, here is the information of the dataset')
print(df_preprocessed.info())

# NLP Processing
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

df_preprocessed['description'].fillna('', inplace=True)
df_preprocessed['text'].fillna('', inplace=True)
#df_preprocessed['name'].fillna('', inplace=True)

#Check the text features if they still contain NaN
print(df_preprocessed.select_dtypes(include=[object]))


# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    #Remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))  # Removes punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)  
    #Tokenize the text
    tokens = word_tokenize(text)
    #Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    #Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    #Join tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the 'description', 'text', and 'name' columns
df_preprocessed['cleaned_description'] = df_preprocessed['description'].apply(lambda x: preprocess_text(str(x)))
df_preprocessed['cleaned_text'] = df_preprocessed['text'].apply(lambda x: preprocess_text(str(x)))
#df_preprocessed['cleaned_name'] = df_preprocessed['name'].apply(lambda x: preprocess_text(str(x)))

# Check the preprocessed data with preprocessed text features
print(df_preprocessed[['description', 'cleaned_description', 'text', 'cleaned_text']].head())

#Drop the original text features
df_preprocessed = df_preprocessed.drop(columns=['description','text'])

#Check the preprocessed dataset in the present
print('The current information of pre-processed dataset before text preprocessing')
print(df_preprocessed.info())


# Initialize TFIDF vectorizer for text features
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')

# Apply TF-IDF on 'description', 'text', 'name' columns

tfidf_description = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_description']).toarray()
tfidf_text = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_text']).toarray()
#tfidf_name = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_name']).toarray()

# Convert TF-IDF into DataFrames and add to df_preprocessed
tfidf_desc_df = pd.DataFrame(tfidf_description, columns=[f'desc_{i}' for i in range(tfidf_description.shape[1])])
tfidf_text_df = pd.DataFrame(tfidf_text, columns=[f'text_{i}' for i in range(tfidf_text.shape[1])])
#tfidf_name_df = pd.DataFrame(tfidf_name, columns=[f'name_{i}' for i in range(tfidf_name.shape[1])])

# Merge with main dataframe
df_preprocessed = pd.concat([df_preprocessed.reset_index(drop=True), tfidf_desc_df, tfidf_text_df], axis=1)

#Drop the cleaned text features
df_preprocessed = df_preprocessed.drop(columns=['cleaned_description', 'cleaned_text'])

df_preprocessed = pd.concat([df_preprocessed, rgb_df], axis=1)

df_preprocessed = pd.concat([df_preprocessed, rgb_df], axis=1)

print(df_preprocessed.head())
The information of the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  profile_yn_gold        50 non-null     object 
 16  profileimage           20050 non-null  object 
 17  retweet_count          20050 non-null  int64  
 18  sidebar_color          20050 non-null  object 
 19  text                   20050 non-null  object 
 20  tweet_coord            159 non-null    object 
 21  tweet_count            20050 non-null  int64  
 22  tweet_created          20050 non-null  object 
 23  tweet_id               20050 non-null  float64
 24  tweet_location         12565 non-null  object 
 25  user_timezone          12252 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(17)
memory usage: 3.8+ MB
None
The first few rows of the dataset
    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  815719226    False   finalized                   3    10/26/15 23:24   
1  815719227    False   finalized                   3    10/26/15 23:30   
2  815719228    False   finalized                   3    10/26/15 23:33   
3  815719229    False   finalized                   3    10/26/15 23:10   
4  815719230    False   finalized                   3     10/27/15 1:15   

   gender  gender:confidence profile_yn  profile_yn:confidence  \
0    male             1.0000        yes                    1.0   
1    male             1.0000        yes                    1.0   
2    male             0.6625        yes                    1.0   
3    male             1.0000        yes                    1.0   
4  female             1.0000        yes                    1.0   

          created  ...                                       profileimage  \
0    12/5/13 1:48  ...  https://pbs.twimg.com/profile_images/414342229...   
1   10/1/12 13:51  ...  https://pbs.twimg.com/profile_images/539604221...   
2  11/28/14 11:30  ...  https://pbs.twimg.com/profile_images/657330418...   
3   6/11/09 22:39  ...  https://pbs.twimg.com/profile_images/259703936...   
4   4/16/14 13:23  ...  https://pbs.twimg.com/profile_images/564094871...   

   retweet_count sidebar_color  \
0              0        FFFFFF   
1              0        C0DEED   
2              1        C0DEED   
3              0        C0DEED   
4              0             0   

                                                text tweet_coord tweet_count  \
0  Robbie E Responds To Critics After Win Against...         NaN      110964   
1  ‰ÛÏIt felt like they were my friends and I was...         NaN        7471   
2  i absolutely adore when louis starts the songs...         NaN        5617   
3  Hi @JordanSpieth - Looking at the url - do you...         NaN        1693   
4  Watching Neighbours on Sky+ catching up with t...         NaN       31462   

    tweet_created      tweet_id   tweet_location               user_timezone  
0  10/26/15 12:40  6.587300e+17  main; @Kan1shk3                     Chennai  
1  10/26/15 12:40  6.587300e+17              NaN  Eastern Time (US & Canada)  
2  10/26/15 12:40  6.587300e+17           clcncl                    Belgrade  
3  10/26/15 12:40  6.587300e+17    Palo Alto, CA  Pacific Time (US & Canada)  
4  10/26/15 12:40  6.587300e+17              NaN                         NaN  

[5 rows x 26 columns]
the _unit_id has 0 data missing
the proportion of missing data to the total is 0.0
the _golden has 0 data missing
the proportion of missing data to the total is 0.0
the _unit_state has 0 data missing
the proportion of missing data to the total is 0.0
the _trusted_judgments has 0 data missing
the proportion of missing data to the total is 0.0
the _last_judgment_at has 50 data missing
the proportion of missing data to the total is 0.0024937655860349127
the gender has 97 data missing
the proportion of missing data to the total is 0.00483790523690773
the gender:confidence has 26 data missing
the proportion of missing data to the total is 0.0012967581047381546
the profile_yn has 0 data missing
the proportion of missing data to the total is 0.0
the profile_yn:confidence has 0 data missing
the proportion of missing data to the total is 0.0
the created has 0 data missing
the proportion of missing data to the total is 0.0
the description has 3744 data missing
the proportion of missing data to the total is 0.18673316708229426
the fav_number has 0 data missing
the proportion of missing data to the total is 0.0
the gender_gold has 20000 data missing
the proportion of missing data to the total is 0.9975062344139651
The feature to be dropped is gender_gold
the link_color has 0 data missing
the proportion of missing data to the total is 0.0
the name has 0 data missing
the proportion of missing data to the total is 0.0
the profile_yn_gold has 20000 data missing
the proportion of missing data to the total is 0.9975062344139651
The feature to be dropped is profile_yn_gold
the profileimage has 0 data missing
the proportion of missing data to the total is 0.0
the retweet_count has 0 data missing
the proportion of missing data to the total is 0.0
the sidebar_color has 0 data missing
the proportion of missing data to the total is 0.0
the text has 0 data missing
the proportion of missing data to the total is 0.0
the tweet_coord has 19891 data missing
the proportion of missing data to the total is 0.992069825436409
The feature to be dropped is tweet_coord
the tweet_count has 0 data missing
the proportion of missing data to the total is 0.0
the tweet_created has 0 data missing
the proportion of missing data to the total is 0.0
the tweet_id has 0 data missing
the proportion of missing data to the total is 0.0
the tweet_location has 7485 data missing
the proportion of missing data to the total is 0.3733167082294264
the user_timezone has 7798 data missing
the proportion of missing data to the total is 0.388927680798005
The information of the cleaned dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  link_color             20050 non-null  object 
 13  name                   20050 non-null  object 
 14  profileimage           20050 non-null  object 
 15  retweet_count          20050 non-null  int64  
 16  sidebar_color          20050 non-null  object 
 17  text                   20050 non-null  object 
 18  tweet_count            20050 non-null  int64  
 19  tweet_created          20050 non-null  object 
 20  tweet_id               20050 non-null  float64
 21  tweet_location         12565 non-null  object 
 22  user_timezone          12252 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(14)
memory usage: 3.4+ MB
None
The first few rows of the cleaned dataset
    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  815719226    False   finalized                   3    10/26/15 23:24   
1  815719227    False   finalized                   3    10/26/15 23:30   
2  815719228    False   finalized                   3    10/26/15 23:33   
3  815719229    False   finalized                   3    10/26/15 23:10   
4  815719230    False   finalized                   3     10/27/15 1:15   

   gender  gender:confidence profile_yn  profile_yn:confidence  \
0    male             1.0000        yes                    1.0   
1    male             1.0000        yes                    1.0   
2    male             0.6625        yes                    1.0   
3    male             1.0000        yes                    1.0   
4  female             1.0000        yes                    1.0   

          created  ...            name  \
0    12/5/13 1:48  ...         sheezy0   
1   10/1/12 13:51  ...     DavdBurnett   
2  11/28/14 11:30  ...  lwtprettylaugh   
3   6/11/09 22:39  ...     douggarland   
4   4/16/14 13:23  ...    WilfordGemma   

                                        profileimage retweet_count  \
0  https://pbs.twimg.com/profile_images/414342229...             0   
1  https://pbs.twimg.com/profile_images/539604221...             0   
2  https://pbs.twimg.com/profile_images/657330418...             1   
3  https://pbs.twimg.com/profile_images/259703936...             0   
4  https://pbs.twimg.com/profile_images/564094871...             0   

  sidebar_color                                               text  \
0        FFFFFF  Robbie E Responds To Critics After Win Against...   
1        C0DEED  ‰ÛÏIt felt like they were my friends and I was...   
2        C0DEED  i absolutely adore when louis starts the songs...   
3        C0DEED  Hi @JordanSpieth - Looking at the url - do you...   
4             0  Watching Neighbours on Sky+ catching up with t...   

   tweet_count   tweet_created      tweet_id   tweet_location  \
0       110964  10/26/15 12:40  6.587300e+17  main; @Kan1shk3   
1         7471  10/26/15 12:40  6.587300e+17              NaN   
2         5617  10/26/15 12:40  6.587300e+17           clcncl   
3         1693  10/26/15 12:40  6.587300e+17    Palo Alto, CA   
4        31462  10/26/15 12:40  6.587300e+17              NaN   

                user_timezone  
0                     Chennai  
1  Eastern Time (US & Canada)  
2                    Belgrade  
3  Pacific Time (US & Canada)  
4                         NaN  

[5 rows x 23 columns]
The information of the cleaned dataset
<class 'pandas.core.frame.DataFrame'>
Index: 19953 entries, 0 to 20049
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               19953 non-null  int64  
 1   _golden                19953 non-null  bool   
 2   _unit_state            19953 non-null  object 
 3   _trusted_judgments     19953 non-null  int64  
 4   _last_judgment_at      19903 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      19953 non-null  float64
 7   profile_yn:confidence  19953 non-null  float64
 8   created                19953 non-null  object 
 9   description            16224 non-null  object 
 10  fav_number             19953 non-null  int64  
 11  link_color             19953 non-null  object 
 12  name                   19953 non-null  object 
 13  profileimage           19953 non-null  object 
 14  retweet_count          19953 non-null  int64  
 15  sidebar_color          19953 non-null  object 
 16  text                   19953 non-null  object 
 17  tweet_count            19953 non-null  int64  
 18  tweet_created          19953 non-null  object 
 19  tweet_id               19953 non-null  float64
 20  tweet_location         12510 non-null  object 
 21  user_timezone          12185 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(13)
memory usage: 3.4+ MB
None
The first few rows of the cleaned dataset
    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  815719226    False   finalized                   3    10/26/15 23:24   
1  815719227    False   finalized                   3    10/26/15 23:30   
2  815719228    False   finalized                   3    10/26/15 23:33   
3  815719229    False   finalized                   3    10/26/15 23:10   
4  815719230    False   finalized                   3     10/27/15 1:15   

   gender  gender:confidence  profile_yn:confidence         created  \
0    male             1.0000                    1.0    12/5/13 1:48   
1    male             1.0000                    1.0   10/1/12 13:51   
2    male             0.6625                    1.0  11/28/14 11:30   
3    male             1.0000                    1.0   6/11/09 22:39   
4  female             1.0000                    1.0   4/16/14 13:23   

                                         description  ...            name  \
0                              i sing my own rhythm.  ...         sheezy0   
1  I'm the author of novels filled with family dr...  ...     DavdBurnett   
2                louis whining and squealing and all  ...  lwtprettylaugh   
3  Mobile guy.  49ers, Shazam, Google, Kleiner Pe...  ...     douggarland   
4  Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...  ...    WilfordGemma   

                                        profileimage retweet_count  \
0  https://pbs.twimg.com/profile_images/414342229...             0   
1  https://pbs.twimg.com/profile_images/539604221...             0   
2  https://pbs.twimg.com/profile_images/657330418...             1   
3  https://pbs.twimg.com/profile_images/259703936...             0   
4  https://pbs.twimg.com/profile_images/564094871...             0   

  sidebar_color                                               text  \
0        FFFFFF  Robbie E Responds To Critics After Win Against...   
1        C0DEED  ‰ÛÏIt felt like they were my friends and I was...   
2        C0DEED  i absolutely adore when louis starts the songs...   
3        C0DEED  Hi @JordanSpieth - Looking at the url - do you...   
4             0  Watching Neighbours on Sky+ catching up with t...   

  tweet_count   tweet_created      tweet_id   tweet_location  \
0      110964  10/26/15 12:40  6.587300e+17  main; @Kan1shk3   
1        7471  10/26/15 12:40  6.587300e+17              NaN   
2        5617  10/26/15 12:40  6.587300e+17           clcncl   
3        1693  10/26/15 12:40  6.587300e+17    Palo Alto, CA   
4       31462  10/26/15 12:40  6.587300e+17              NaN   

                user_timezone  
0                     Chennai  
1  Eastern Time (US & Canada)  
2                    Belgrade  
3  Pacific Time (US & Canada)  
4                         NaN  

[5 rows x 22 columns]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
C:\Users\Shahl\AppData\Local\Temp\ipykernel_19484\2151091598.py:77: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
C:\Users\Shahl\AppData\Local\Temp\ipykernel_19484\2151091598.py:78: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
C:\Users\Shahl\AppData\Local\Temp\ipykernel_19484\2151091598.py:81: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['created'] = pd.to_datetime(df_cleaned['created'], errors='coerce')
C:\Users\Shahl\AppData\Local\Temp\ipykernel_19484\2151091598.py:82: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['tweet_created'] = pd.to_datetime(df_cleaned['tweet_created'], errors='coerce')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Number of NaN values in 'link_color': 0
Number of NaN values in 'sidebar_color': 0
the number of link color is 2986
the number of side bar color is 559
Number of NaN values in 'link_color': 0
Number of NaN values in 'sidebar_color': 0
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
[0 1 2]
<class 'pandas.core.frame.DataFrame'>
Index: 18836 entries, 0 to 20049
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  18836 non-null  int64  
 1   gender:confidence       18836 non-null  float64
 2   description             15522 non-null  object 
 3   favorites_per_day       18836 non-null  float64
 4   link_color              18836 non-null  object 
 5   retweets_per_day        18836 non-null  float64
 6   sidebar_color           18836 non-null  object 
 7   text                    18836 non-null  object 
 8   tweets_per_day          18836 non-null  float64
 9   user_timezone           18836 non-null  object 
 10  tweet_location          18836 non-null  object 
 11  profile_created_year    18836 non-null  int32  
 12  tweet_created_year      18836 non-null  int32  
 13  tweet_location_encoded  18836 non-null  float64
 14  user_timezone_encoded   18836 non-null  float64
dtypes: float64(6), int32(2), int64(1), object(6)
memory usage: 2.2+ MB
None
No description has been provided for this image
All features that will be used are ['gender', 'gender:confidence', 'description', 'favorites_per_day', 'retweets_per_day', 'text', 'tweets_per_day', 'profile_created_year', 'tweet_created_year', 'tweet_location_encoded', 'user_timezone_encoded']
After all, here is the information of the dataset
<class 'pandas.core.frame.DataFrame'>
Index: 18836 entries, 0 to 20049
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  18836 non-null  int64  
 1   gender:confidence       18836 non-null  float64
 2   description             15522 non-null  object 
 3   favorites_per_day       18836 non-null  float64
 4   retweets_per_day        18836 non-null  float64
 5   text                    18836 non-null  object 
 6   tweets_per_day          18836 non-null  float64
 7   profile_created_year    18836 non-null  int32  
 8   tweet_created_year      18836 non-null  int32  
 9   tweet_location_encoded  18836 non-null  float64
 10  user_timezone_encoded   18836 non-null  float64
dtypes: float64(6), int32(2), int64(1), object(2)
memory usage: 1.6+ MB
None
                                             description  \
0                                  i sing my own rhythm.   
1      I'm the author of novels filled with family dr...   
2                    louis whining and squealing and all   
3      Mobile guy.  49ers, Shazam, Google, Kleiner Pe...   
4      Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...   
...                                                  ...   
20045                                               (rp)   
20046  Whatever you like, it's not a problem at all. ...   
20047  #TeamBarcelona ..You look lost so you should f...   
20048  Anti-statist; I homeschool my kids. Aspiring t...   
20049                     Teamwork makes the dream work.   

                                                    text  
0      Robbie E Responds To Critics After Win Against...  
1      ‰ÛÏIt felt like they were my friends and I was...  
2      i absolutely adore when louis starts the songs...  
3      Hi @JordanSpieth - Looking at the url - do you...  
4      Watching Neighbours on Sky+ catching up with t...  
...                                                  ...  
20045  @lookupondeath ...Fine, and I'll drink tea too...  
20046  Greg Hardy you a good player and all but don't...  
20047  You can miss people and still never want to se...  
20048  @bitemyapp i had noticed your tendency to pee ...  
20049  I think for my APUSH creative project I'm goin...  

[18836 rows x 2 columns]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shahl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shahl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Shahl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shahl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
                                         description  \
0                              i sing my own rhythm.   
1  I'm the author of novels filled with family dr...   
2                louis whining and squealing and all   
3  Mobile guy.  49ers, Shazam, Google, Kleiner Pe...   
4  Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...   

                                 cleaned_description  \
0                                        sing rhythm   
1        im author novel filled family drama romance   
2                            louis whining squealing   
3  mobile guy er shazam google kleiner perkins ya...   
4  ricky wilson best frontmankaiser chief best ba...   

                                                text  \
0  Robbie E Responds To Critics After Win Against...   
1  ‰ÛÏIt felt like they were my friends and I was...   
2  i absolutely adore when louis starts the songs...   
3  Hi @JordanSpieth - Looking at the url - do you...   
4  Watching Neighbours on Sky+ catching up with t...   

                                        cleaned_text  
0  robbie e responds critic win eddie edward worl...  
1  felt like friend living story httpstcoarngeyhn...  
2  absolutely adore louis start song hit hard fee...  
3  hi jordanspieth looking url use ifttt dont typ...  
4    watching neighbour sky catching neighbs xxx xxx  
The current information of pre-processed dataset before text preprocessing
<class 'pandas.core.frame.DataFrame'>
Index: 18836 entries, 0 to 20049
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  18836 non-null  int64  
 1   gender:confidence       18836 non-null  float64
 2   favorites_per_day       18836 non-null  float64
 3   retweets_per_day        18836 non-null  float64
 4   tweets_per_day          18836 non-null  float64
 5   profile_created_year    18836 non-null  int32  
 6   tweet_created_year      18836 non-null  int32  
 7   tweet_location_encoded  18836 non-null  float64
 8   user_timezone_encoded   18836 non-null  float64
 9   cleaned_description     18836 non-null  object 
 10  cleaned_text            18836 non-null  object 
dtypes: float64(6), int32(2), int64(1), object(2)
memory usage: 1.6+ MB
None
   gender  gender:confidence  favorites_per_day  retweets_per_day  \
0       0             1.0000           0.000000          0.000000   
1       0             1.0000           0.015561          0.000000   
2       0             0.6625           2.148520          0.000279   
3       0             1.0000           0.036220          0.000000   
4       1             1.0000           9.799895          0.000000   

   tweets_per_day  profile_created_year  tweet_created_year  \
0       28.163452                  2013                2015   
1        1.709611                  2012                2015   
2        1.568118                  2014                2015   
3        0.303568                  2009                2015   
4        8.262080                  2014                2015   

   tweet_location_encoded  user_timezone_encoded  desc_0  ...  link_B  \
0                0.000053               0.001699     0.0  ...     194   
1                0.363294               0.127309     0.0  ...     180   
2                0.000053               0.002071     0.0  ...     194   
3                0.000159               0.105755     0.0  ...     180   
4                0.363294               0.381344     0.0  ...     217   

   sidebar_R  sidebar_G  sidebar_B  link_R  link_G  link_B  sidebar_R  \
0        255        255        255       8     194     194        255   
1        192        222        237       0     132     180        192   
2        192        222        237     171     184     194        192   
3        192        222        237       0     132     180        192   
4          0          0          0      59     148     217          0   

   sidebar_G  sidebar_B  
0        255        255  
1        222        237  
2        222        237  
3        222        237  
4          0          0  

[5 rows x 3021 columns]
In [44]:
df_preprocessed.head()
Out[44]:
gender gender:confidence favorites_per_day retweets_per_day tweets_per_day profile_created_year tweet_created_year tweet_location_encoded user_timezone_encoded desc_0 ... link_B sidebar_R sidebar_G sidebar_B link_R link_G link_B sidebar_R sidebar_G sidebar_B
0 0 1.0000 0.000000 0.000000 28.163452 2013 2015 0.000053 0.001699 0.0 ... 194 255 255 255 8 194 194 255 255 255
1 0 1.0000 0.015561 0.000000 1.709611 2012 2015 0.363294 0.127309 0.0 ... 180 192 222 237 0 132 180 192 222 237
2 0 0.6625 2.148520 0.000279 1.568118 2014 2015 0.000053 0.002071 0.0 ... 194 192 222 237 171 184 194 192 222 237
3 0 1.0000 0.036220 0.000000 0.303568 2009 2015 0.000159 0.105755 0.0 ... 180 192 222 237 0 132 180 192 222 237
4 1 1.0000 9.799895 0.000000 8.262080 2014 2015 0.363294 0.381344 0.0 ... 217 0 0 0 59 148 217 0 0 0

5 rows × 3021 columns

In [85]:
## Complete py code ¨
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

#finish preprocessing for regression
df_preprocessed_reg = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed_reg = df_preprocessed_reg.drop(['gender', "gender:confidence"], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed_reg, y, test_size=0.6, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model
boosted_reg.fit(X_train, y_train)

# Make predictions
y_pred = boosted_reg.predict(X_test)
y_pred_train = boosted_reg.predict(X_train)
y_tot_pred = boosted_reg.predict(df_preprocessed_reg)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)

print(f"Mean Squared Error train: {mse_train}")
print(f"Mean Squared Error test: {mse_test}")
print(f"Mean Squared Error total: {mse_total}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.title('Mean Squared Error for Train, Test, and Total for boosted regression tree with vectorized text / desc features')
plt.xlabel('Dataset')
plt.ylabel('Mean Squared Error')
plt.show()

#FEATURE IMPORTANCE
# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('text_')]
# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]
# Output the results
print("desc_ column indices:", desc_columns)
print("text_ column indices:", text_columns)
print("desc_ array:\n", desc_array)
print("text_ array:\n", text_array)
# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])
# Create a new DataFrame
new_data = {}
# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
boosted_reg.feature_importances_
# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed_reg.columns)) if i not in desc_columns and i not in text_columns]
for i in other_columns:
    col_name = df_preprocessed_reg.columns[i]
    new_data[col_name] = [boosted_reg.feature_importances_[i]]
# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)
# Output the results
print(feature_importance)

#Plot feature importance
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')
df_melted = df_melted.sort_values(ascending=False, by="Importance in percentage")
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')
plt.title('Feature Importances for boosted regression tree with vectorized text / desc features')
plt.show()


#preprocess dataset for plots with regression results
df_preprocessed_diff = df_preprocessed_reg.copy()
df_preprocessed_diff['difference'] = (y.to_numpy() - y_tot_pred)
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset

#filtering out coloumns that might be false mistaken
misclassified_df_reg = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
misclassified_df = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]

#plotting these columns

def scatterplot_mistaken_points(misclassified_df, X_train):
    # Edit misclassified_df to include 'in X_train'
    misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
    # Create subsets for the two plots
    df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
    df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
    # Set up the matplotlib figure with subplots
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    # Plot 1: Points in X_train
    sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
    axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
                [df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
    axes[0].set_xlabel('Dataset Gender Confidence')
    axes[0].set_ylabel('Predicted Gender Confidence')
    axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')
    # Plot 2: Points not in X_train
    sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
    axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
                [df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
    axes[1].set_xlabel('Dataset Gender Confidence')
    axes[1].set_ylabel('Predicted Gender Confidence')
    axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')
    plt.tight_layout()
    plt.show()

def scatter_plot(y, y_tot_pred, model):
    #Plotting more results results
    plt.figure(figsize=(8, 6))
    plt.scatter(y, y_tot_pred, alpha=0.5)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
    plt.xlabel('Dataset Gender Confidence')
    plt.ylabel('Predicted Gender Confidence')
    plt.title('Predicted gender confidence vs. Dataset gender confidence' + model)
    plt.show()

scatterplot_mistaken_points(misclassified_df, X_train)
scatter_plot(y, y_tot_pred, "for boosted regression tree with vectorized text / desc features")

#==============================analyze without text features=============================================
columns_to_drop = [col for col in df_preprocessed_reg.columns if col.startswith(('desc_', 'text_'))]
df_preprocessed_non_text = df_preprocessed_reg.drop(columns=columns_to_drop)
print(df_preprocessed_non_text)

boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.6, random_state=42)
# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)
y_pred_train = boosted_reg_non_text.predict(X_train_non_text)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test_non_text, y_pred)
mse_train = mean_squared_error(y_train_non_text, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)

print(f"Mean Squared Error train: {mse_train}")
print(f"Mean Squared Error test: {mse_test}")
print(f"Mean Squared Error total: {mse_total}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.title('Mean Squared Error for Train, Test, and Total for boosted regression tree with vectorized text / desc features')
plt.xlabel('Dataset')
plt.ylabel('Mean Squared Error')
plt.show()

# Get feature importances and plot from the model
feature_importances = boosted_reg_non_text.feature_importances_
column_names = X_train_non_text.columns
feature_importance_df = pd.DataFrame({
    'Feature': column_names,
    'Importance in percentage': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importances for boosted regression tree without vectorized text / desc features ')
plt.show()

#adding the dataset gender confidence
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset

#Inspecting coulumns that could be suspicous
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred
misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]
scatterplot_mistaken_points(misclassified_df, X_train_non_text)
scatter_plot(y, y_tot_pred, "for boosted regression tree without vectorized text / desc features")

#====================================Analyzing with a linear regression (Least Squares Implementation)====================
X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed_reg)
model = sm.OLS(y_train, X_train_lin)  # Ordinary least squares (unregularized)
results = model.fit()

#run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)

print(f"Mean Squared Error train: {mse_train}")
print(f"Mean Squared Error test: {mse_test}")
print(f"Mean Squared Error total: {mse_total}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.title('Mean Squared Error for Train, Test, and Total for linear regression tree with vectorized text / desc features')
plt.xlabel('Dataset')
plt.ylabel('Mean Squared Error')
plt.show()

#final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred


#identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]





scatter_plot(y, y_lin_tot_pred, "for linear regression with vectorized text / description features")

# Edit misclassified_df to include 'in X_train'
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train_lin.index)
# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]

# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
             [df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset Gender Confidence')
axes[0].set_ylabel('Predicted Gender Confidence')
axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')

# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
             [df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset Gender Confidence')
axes[1].set_ylabel('Predicted Gender Confidence')
axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')

# Adjust layout
plt.tight_layout()


#================================Identity final mistaken samples====================================
common_samples = misclassified_df_reg.index.intersection(misclassified_df.index)
common_df = misclassified_df.loc[common_samples]

scatterplot_mistaken_points(common_df, X_train_lin)
Mean Squared Error train: 0.026508391442518143
Mean Squared Error test: 0.028929165530200335
Mean Squared Error total: 0.027960907302519423
No description has been provided for this image
desc_ column indices: [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1346, 1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381, 1382, 1383, 1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1410, 1411, 1412, 1413, 1414, 1415, 1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423, 1424, 1425, 1426, 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436, 1437, 1438, 1439, 1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458, 1459, 1460, 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471, 1472, 1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480, 1481, 1482, 1483, 1484, 1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503, 1504, 1505, 1506]
text_ column indices: [1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1519, 1520, 1521, 1522, 1523, 1524, 1525, 1526, 1527, 1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1536, 1537, 1538, 1539, 1540, 1541, 1542, 1543, 1544, 1545, 1546, 1547, 1548, 1549, 1550, 1551, 1552, 1553, 1554, 1555, 1556, 1557, 1558, 1559, 1560, 1561, 1562, 1563, 1564, 1565, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1573, 1574, 1575, 1576, 1577, 1578, 1579, 1580, 1581, 1582, 1583, 1584, 1585, 1586, 1587, 1588, 1589, 1590, 1591, 1592, 1593, 1594, 1595, 1596, 1597, 1598, 1599, 1600, 1601, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1643, 1644, 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 1663, 1664, 1665, 1666, 1667, 1668, 1669, 1670, 1671, 1672, 1673, 1674, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 1682, 1683, 1684, 1685, 1686, 1687, 1688, 1689, 1690, 1691, 1692, 1693, 1694, 1695, 1696, 1697, 1698, 1699, 1700, 1701, 1702, 1703, 1704, 1705, 1706, 1707, 1708, 1709, 1710, 1711, 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728, 1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736, 1737, 1738, 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1746, 1747, 1748, 1749, 1750, 1751, 1752, 1753, 1754, 1755, 1756, 1757, 1758, 1759, 1760, 1761, 1762, 1763, 1764, 1765, 1766, 1767, 1768, 1769, 1770, 1771, 1772, 1773, 1774, 1775, 1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1784, 1785, 1786, 1787, 1788, 1789, 1790, 1791, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035, 2036, 2037, 2038, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046, 2047, 2048, 2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2058, 2059, 2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069, 2070, 2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 2079, 2080, 2081, 2082, 2083, 2084, 2085, 2086, 2087, 2088, 2089, 2090, 2091, 2092, 2093, 2094, 2095, 2096, 2097, 2098, 2099, 2100, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2111, 2112, 2113, 2114, 2115, 2116, 2117, 2118, 2119, 2120, 2121, 2122, 2123, 2124, 2125, 2126, 2127, 2128, 2129, 2130, 2131, 2132, 2133, 2134, 2135, 2136, 2137, 2138, 2139, 2140, 2141, 2142, 2143, 2144, 2145, 2146, 2147, 2148, 2149, 2150, 2151, 2152, 2153, 2154, 2155, 2156, 2157, 2158, 2159, 2160, 2161, 2162, 2163, 2164, 2165, 2166, 2167, 2168, 2169, 2170, 2171, 2172, 2173, 2174, 2175, 2176, 2177, 2178, 2179, 2180, 2181, 2182, 2183, 2184, 2185, 2186, 2187, 2188, 2189, 2190, 2191, 2192, 2193, 2194, 2195, 2196, 2197, 2198, 2199, 2200, 2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208, 2209, 2210, 2211, 2212, 2213, 2214, 2215, 2216, 2217, 2218, 2219, 2220, 2221, 2222, 2223, 2224, 2225, 2226, 2227, 2228, 2229, 2230, 2231, 2232, 2233, 2234, 2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250, 2251, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265, 2266, 2267, 2268, 2269, 2270, 2271, 2272, 2273, 2274, 2275, 2276, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2285, 2286, 2287, 2288, 2289, 2290, 2291, 2292, 2293, 2294, 2295, 2296, 2297, 2298, 2299, 2300, 2301, 2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2314, 2315, 2316, 2317, 2318, 2319, 2320, 2321, 2322, 2323, 2324, 2325, 2326, 2327, 2328, 2329, 2330, 2331, 2332, 2333, 2334, 2335, 2336, 2337, 2338, 2339, 2340, 2341, 2342, 2343, 2344, 2345, 2346, 2347, 2348, 2349, 2350, 2351, 2352, 2353, 2354, 2355, 2356, 2357, 2358, 2359, 2360, 2361, 2362, 2363, 2364, 2365, 2366, 2367, 2368, 2369, 2370, 2371, 2372, 2373, 2374, 2375, 2376, 2377, 2378, 2379, 2380, 2381, 2382, 2383, 2384, 2385, 2386, 2387, 2388, 2389, 2390, 2391, 2392, 2393, 2394, 2395, 2396, 2397, 2398, 2399, 2400, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 2413, 2414, 2415, 2416, 2417, 2418, 2419, 2420, 2421, 2422, 2423, 2424, 2425, 2426, 2427, 2428, 2429, 2430, 2431, 2432, 2433, 2434, 2435, 2436, 2437, 2438, 2439, 2440, 2441, 2442, 2443, 2444, 2445, 2446, 2447, 2448, 2449, 2450, 2451, 2452, 2453, 2454, 2455, 2456, 2457, 2458, 2459, 2460, 2461, 2462, 2463, 2464, 2465, 2466, 2467, 2468, 2469, 2470, 2471, 2472, 2473, 2474, 2475, 2476, 2477, 2478, 2479, 2480, 2481, 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2491, 2492, 2493, 2494, 2495, 2496, 2497, 2498, 2499, 2500, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 2511, 2512, 2513, 2514, 2515, 2516, 2517, 2518, 2519, 2520, 2521, 2522, 2523, 2524, 2525, 2526, 2527, 2528, 2529, 2530, 2531, 2532, 2533, 2534, 2535, 2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 2544, 2545, 2546, 2547, 2548, 2549, 2550, 2551, 2552, 2553, 2554, 2555, 2556, 2557, 2558, 2559, 2560, 2561, 2562, 2563, 2564, 2565, 2566, 2567, 2568, 2569, 2570, 2571, 2572, 2573, 2574, 2575, 2576, 2577, 2578, 2579, 2580, 2581, 2582, 2583, 2584, 2585, 2586, 2587, 2588, 2589, 2590, 2591, 2592, 2593, 2594, 2595, 2596, 2597, 2598, 2599, 2600, 2601, 2602, 2603, 2604, 2605, 2606, 2607, 2608, 2609, 2610, 2611, 2612, 2613, 2614, 2615, 2616, 2617, 2618, 2619, 2620, 2621, 2622, 2623, 2624, 2625, 2626, 2627, 2628, 2629, 2630, 2631, 2632, 2633, 2634, 2635, 2636, 2637, 2638, 2639, 2640, 2641, 2642, 2643, 2644, 2645, 2646, 2647, 2648, 2649, 2650, 2651, 2652, 2653, 2654, 2655, 2656, 2657, 2658, 2659, 2660, 2661, 2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671, 2672, 2673, 2674, 2675, 2676, 2677, 2678, 2679, 2680, 2681, 2682, 2683, 2684, 2685, 2686, 2687, 2688, 2689, 2690, 2691, 2692, 2693, 2694, 2695, 2696, 2697, 2698, 2699, 2700, 2701, 2702, 2703, 2704, 2705, 2706, 2707, 2708, 2709, 2710, 2711, 2712, 2713, 2714, 2715, 2716, 2717, 2718, 2719, 2720, 2721, 2722, 2723, 2724, 2725, 2726, 2727, 2728, 2729, 2730, 2731, 2732, 2733, 2734, 2735, 2736, 2737, 2738, 2739, 2740, 2741, 2742, 2743, 2744, 2745, 2746, 2747, 2748, 2749, 2750, 2751, 2752, 2753, 2754, 2755, 2756, 2757, 2758, 2759, 2760, 2761, 2762, 2763, 2764, 2765, 2766, 2767, 2768, 2769, 2770, 2771, 2772, 2773, 2774, 2775, 2776, 2777, 2778, 2779, 2780, 2781, 2782, 2783, 2784, 2785, 2786, 2787, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799, 2800, 2801, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, 2811, 2812, 2813, 2814, 2815, 2816, 2817, 2818, 2819, 2820, 2821, 2822, 2823, 2824, 2825, 2826, 2827, 2828, 2829, 2830, 2831, 2832, 2833, 2834, 2835, 2836, 2837, 2838, 2839, 2840, 2841, 2842, 2843, 2844, 2845, 2846, 2847, 2848, 2849, 2850, 2851, 2852, 2853, 2854, 2855, 2856, 2857, 2858, 2859, 2860, 2861, 2862, 2863, 2864, 2865, 2866, 2867, 2868, 2869, 2870, 2871, 2872, 2873, 2874, 2875, 2876, 2877, 2878, 2879, 2880, 2881, 2882, 2883, 2884, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923, 2924, 2925, 2926, 2927, 2928, 2929, 2930, 2931, 2932, 2933, 2934, 2935, 2936, 2937, 2938, 2939, 2940, 2941, 2942, 2943, 2944, 2945, 2946, 2947, 2948, 2949, 2950, 2951, 2952, 2953, 2954, 2955, 2956, 2957, 2958, 2959, 2960, 2961, 2962, 2963, 2964, 2965, 2966, 2967, 2968, 2969, 2970, 2971, 2972, 2973, 2974, 2975, 2976, 2977, 2978, 2979, 2980, 2981, 2982, 2983, 2984, 2985, 2986, 2987, 2988, 2989, 2990, 2991, 2992, 2993, 2994, 2995, 2996, 2997, 2998, 2999, 3000, 3001, 3002, 3003, 3004, 3005, 3006]
desc_ array:
 [0. 0. 0. ... 0. 0. 0.]
text_ array:
 [0.         0.         0.         ... 0.00592981 0.         0.        ]
       desc      text  favorites_per_day  retweets_per_day  tweets_per_day  \
0  0.301743  0.354858            0.02181          0.000411        0.118526   

   profile_created_year  tweet_created_year  tweet_location_encoded  \
0              0.153808                 0.0                0.010426   

   user_timezone_encoded        link_R    link_G    link_B  sidebar_R  \
0                0.00819  2.804551e-08  0.010901  0.000417   0.000347   

   sidebar_G  sidebar_B  
0   0.008443   0.002519  
No description has been provided for this image
C:\Users\Shahl\AppData\Local\Temp\ipykernel_19484\2331525771.py:101: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
No description has been provided for this image
No description has been provided for this image
       favorites_per_day  retweets_per_day  tweets_per_day  \
0               0.000000          0.000000       28.163452   
1               0.015561          0.000000        1.709611   
2               2.148520          0.000279        1.568118   
3               0.036220          0.000000        0.303568   
4               9.799895          0.000000        8.262080   
...                  ...               ...             ...   
18831           0.090663          0.000000        0.235065   
18832           0.569067          0.000000        3.062274   
18833           0.011369          0.000000        6.007049   
18834          16.340642          0.000000       12.940919   
18835           0.878971          0.000000        0.767130   

       profile_created_year  tweet_created_year  tweet_location_encoded  \
0                      2013                2015                0.000053   
1                      2012                2015                0.363294   
2                      2014                2015                0.000053   
3                      2009                2015                0.000159   
4                      2014                2015                0.363294   
...                     ...                 ...                     ...   
18831                  2015                2015                0.000106   
18832                  2012                2015                0.000531   
18833                  2012                2015                0.000106   
18834                  2012                2015                0.000106   
18835                  2014                2015                0.363294   

       user_timezone_encoded  link_R  link_G  link_B  sidebar_R  sidebar_G  \
0                   0.001699       8     194     194        255        255   
1                   0.127309       0     132     180        192        222   
2                   0.002071     171     184     194        192        222   
3                   0.105755       0     132     180        192        222   
4                   0.381344      59     148     217          0          0   
...                      ...     ...     ...     ...        ...        ...   
18831               0.381344       0     132     180        192        222   
18832               0.381344     207     185      41          0          0   
18833               0.381344       0     132     180        192        222   
18834               0.381344     146     102     204          0          0   
18835               0.381344       0     132     180        192        222   

       sidebar_B  link_R  link_G  link_B  sidebar_R  sidebar_G  sidebar_B  
0            255       8     194     194        255        255        255  
1            237       0     132     180        192        222        237  
2            237     171     184     194        192        222        237  
3            237       0     132     180        192        222        237  
4              0      59     148     217          0          0          0  
...          ...     ...     ...     ...        ...        ...        ...  
18831        237       0     132     180        192        222        237  
18832          0     207     185      41          0          0          0  
18833        237       0     132     180        192        222        237  
18834          0     146     102     204          0          0          0  
18835        237       0     132     180        192        222        237  

[18836 rows x 19 columns]
Mean Squared Error train: 0.027405575319738522
Mean Squared Error test: 0.029051941898659762
Mean Squared Error total: 0.027960907302519423
No description has been provided for this image
No description has been provided for this image
C:\Users\Shahl\AppData\Local\Temp\ipykernel_19484\2331525771.py:101: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
No description has been provided for this image
No description has been provided for this image
Mean Squared Error train: 0.016637954003891314
Mean Squared Error test: 0.04996836851802664
Mean Squared Error total: 0.03663691051476185
No description has been provided for this image
No description has been provided for this image
C:\Users\Shahl\AppData\Local\Temp\ipykernel_19484\2331525771.py:247: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified_df["in X_train"] = misclassified_df.index.isin(X_train_lin.index)
No description has been provided for this image
No description has been provided for this image
In [78]:
df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.01) & (df_preprocessed_lin["gender_confidence_pred"] < 0.5)]
Out[78]:
favorites_per_day retweets_per_day tweets_per_day profile_created_year tweet_created_year tweet_location_encoded user_timezone_encoded desc_0 desc_1 desc_2 ... sidebar_G sidebar_B link_R link_G link_B sidebar_R sidebar_G sidebar_B difference gender_confidence_pred
50 0.003124 0.000000 37.098412 2014 2015 0.000053 0.009397 0.0 0.0 0.0 ... 222 237 0 132 180 192 222 237 0.633508 0.366492
795 0.257622 0.000000 4.290771 2011 2015 0.000053 0.001699 0.0 0.0 0.0 ... 255 255 35 8 143 255 255 255 0.594003 0.405997
980 4.412043 0.000215 13.829892 2011 2015 0.000053 0.002230 0.0 0.0 0.0 ... 255 255 87 21 21 255 255 255 0.610040 0.389960
1189 0.091177 0.000000 0.173798 2015 2015 0.000053 0.127309 0.0 0.0 0.0 ... 157 94 208 43 85 130 157 94 0.505995 0.494005
1330 0.048406 0.000000 4.700256 2012 2015 0.000053 0.065831 0.0 0.0 0.0 ... 238 238 221 46 68 238 238 238 0.637793 0.362207
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18139 0.000000 0.000000 23.618518 2013 2015 0.363294 0.000319 0.0 0.0 0.0 ... 222 237 0 132 180 192 222 237 0.532139 0.467861
18229 0.000604 0.000000 33.945350 2015 2015 0.000106 0.105755 0.0 0.0 0.0 ... 222 237 0 132 180 192 222 237 0.193740 0.471160
18230 0.000602 0.000000 32.000602 2015 2015 0.363294 0.381344 0.0 0.0 0.0 ... 222 237 0 132 180 192 222 237 0.210192 0.470708
18474 0.000000 0.000000 0.240339 2015 2015 0.000053 0.381344 0.0 0.0 0.0 ... 222 237 0 132 180 192 222 237 0.537172 0.462828
18550 0.191693 0.000000 15.421903 2009 2015 0.000053 0.024740 0.0 0.0 0.0 ... 0 0 0 0 0 0 0 0 0.551622 0.448378

98 rows × 3021 columns

Regression Models¶

In [46]:
#finish preprocessing for regression
df_preprocessed = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed = df_preprocessed.drop(['gender', "gender:confidence"], axis=1)
In [6]:
#set our regression target values
y = preprocessed_gender_conf.reset_index(drop=True)
In [47]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed, y, test_size=0.7, random_state=42)

boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model
boosted_reg.fit(X_train, y_train)
Out[47]:
GradientBoostingRegressor(n_estimators=50, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor(n_estimators=50, random_state=42)

Analysing results¶

In [48]:
from sklearn.metrics import mean_squared_error

# Make predictions
y_pred = boosted_reg.predict(X_test)

# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.028950573206351155
In [49]:
y_tot_pred = boosted_reg.predict(df_preprocessed)
mse = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.02811897878896962
In [50]:
#checking feature importance

# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed.columns) if col.startswith('text_')]

# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]

# Output the results
print("desc_ column indices:", desc_columns)
print("text_ column indices:", text_columns)
print("desc_ array:\n", desc_array)
print("text_ array:\n", text_array)

# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])

# Create a new DataFrame
new_data = {}

# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
boosted_reg.feature_importances_

# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed.columns)) if i not in desc_columns and i not in text_columns]

for i in other_columns:
    col_name = df_preprocessed.columns[i]
    new_data[col_name] = [boosted_reg.feature_importances_[i]]

# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)

# Output the results
print(feature_importance)
desc_ column indices: [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1346, 1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381, 1382, 1383, 1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1410, 1411, 1412, 1413, 1414, 1415, 1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423, 1424, 1425, 1426, 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436, 1437, 1438, 1439, 1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458, 1459, 1460, 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471, 1472, 1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480, 1481, 1482, 1483, 1484, 1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503, 1504, 1505, 1506]
text_ column indices: [1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1519, 1520, 1521, 1522, 1523, 1524, 1525, 1526, 1527, 1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1536, 1537, 1538, 1539, 1540, 1541, 1542, 1543, 1544, 1545, 1546, 1547, 1548, 1549, 1550, 1551, 1552, 1553, 1554, 1555, 1556, 1557, 1558, 1559, 1560, 1561, 1562, 1563, 1564, 1565, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1573, 1574, 1575, 1576, 1577, 1578, 1579, 1580, 1581, 1582, 1583, 1584, 1585, 1586, 1587, 1588, 1589, 1590, 1591, 1592, 1593, 1594, 1595, 1596, 1597, 1598, 1599, 1600, 1601, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1643, 1644, 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 1663, 1664, 1665, 1666, 1667, 1668, 1669, 1670, 1671, 1672, 1673, 1674, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 1682, 1683, 1684, 1685, 1686, 1687, 1688, 1689, 1690, 1691, 1692, 1693, 1694, 1695, 1696, 1697, 1698, 1699, 1700, 1701, 1702, 1703, 1704, 1705, 1706, 1707, 1708, 1709, 1710, 1711, 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728, 1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736, 1737, 1738, 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1746, 1747, 1748, 1749, 1750, 1751, 1752, 1753, 1754, 1755, 1756, 1757, 1758, 1759, 1760, 1761, 1762, 1763, 1764, 1765, 1766, 1767, 1768, 1769, 1770, 1771, 1772, 1773, 1774, 1775, 1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1784, 1785, 1786, 1787, 1788, 1789, 1790, 1791, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035, 2036, 2037, 2038, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046, 2047, 2048, 2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2058, 2059, 2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069, 2070, 2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 2079, 2080, 2081, 2082, 2083, 2084, 2085, 2086, 2087, 2088, 2089, 2090, 2091, 2092, 2093, 2094, 2095, 2096, 2097, 2098, 2099, 2100, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2111, 2112, 2113, 2114, 2115, 2116, 2117, 2118, 2119, 2120, 2121, 2122, 2123, 2124, 2125, 2126, 2127, 2128, 2129, 2130, 2131, 2132, 2133, 2134, 2135, 2136, 2137, 2138, 2139, 2140, 2141, 2142, 2143, 2144, 2145, 2146, 2147, 2148, 2149, 2150, 2151, 2152, 2153, 2154, 2155, 2156, 2157, 2158, 2159, 2160, 2161, 2162, 2163, 2164, 2165, 2166, 2167, 2168, 2169, 2170, 2171, 2172, 2173, 2174, 2175, 2176, 2177, 2178, 2179, 2180, 2181, 2182, 2183, 2184, 2185, 2186, 2187, 2188, 2189, 2190, 2191, 2192, 2193, 2194, 2195, 2196, 2197, 2198, 2199, 2200, 2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208, 2209, 2210, 2211, 2212, 2213, 2214, 2215, 2216, 2217, 2218, 2219, 2220, 2221, 2222, 2223, 2224, 2225, 2226, 2227, 2228, 2229, 2230, 2231, 2232, 2233, 2234, 2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250, 2251, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265, 2266, 2267, 2268, 2269, 2270, 2271, 2272, 2273, 2274, 2275, 2276, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2285, 2286, 2287, 2288, 2289, 2290, 2291, 2292, 2293, 2294, 2295, 2296, 2297, 2298, 2299, 2300, 2301, 2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2314, 2315, 2316, 2317, 2318, 2319, 2320, 2321, 2322, 2323, 2324, 2325, 2326, 2327, 2328, 2329, 2330, 2331, 2332, 2333, 2334, 2335, 2336, 2337, 2338, 2339, 2340, 2341, 2342, 2343, 2344, 2345, 2346, 2347, 2348, 2349, 2350, 2351, 2352, 2353, 2354, 2355, 2356, 2357, 2358, 2359, 2360, 2361, 2362, 2363, 2364, 2365, 2366, 2367, 2368, 2369, 2370, 2371, 2372, 2373, 2374, 2375, 2376, 2377, 2378, 2379, 2380, 2381, 2382, 2383, 2384, 2385, 2386, 2387, 2388, 2389, 2390, 2391, 2392, 2393, 2394, 2395, 2396, 2397, 2398, 2399, 2400, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 2413, 2414, 2415, 2416, 2417, 2418, 2419, 2420, 2421, 2422, 2423, 2424, 2425, 2426, 2427, 2428, 2429, 2430, 2431, 2432, 2433, 2434, 2435, 2436, 2437, 2438, 2439, 2440, 2441, 2442, 2443, 2444, 2445, 2446, 2447, 2448, 2449, 2450, 2451, 2452, 2453, 2454, 2455, 2456, 2457, 2458, 2459, 2460, 2461, 2462, 2463, 2464, 2465, 2466, 2467, 2468, 2469, 2470, 2471, 2472, 2473, 2474, 2475, 2476, 2477, 2478, 2479, 2480, 2481, 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2491, 2492, 2493, 2494, 2495, 2496, 2497, 2498, 2499, 2500, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 2511, 2512, 2513, 2514, 2515, 2516, 2517, 2518, 2519, 2520, 2521, 2522, 2523, 2524, 2525, 2526, 2527, 2528, 2529, 2530, 2531, 2532, 2533, 2534, 2535, 2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 2544, 2545, 2546, 2547, 2548, 2549, 2550, 2551, 2552, 2553, 2554, 2555, 2556, 2557, 2558, 2559, 2560, 2561, 2562, 2563, 2564, 2565, 2566, 2567, 2568, 2569, 2570, 2571, 2572, 2573, 2574, 2575, 2576, 2577, 2578, 2579, 2580, 2581, 2582, 2583, 2584, 2585, 2586, 2587, 2588, 2589, 2590, 2591, 2592, 2593, 2594, 2595, 2596, 2597, 2598, 2599, 2600, 2601, 2602, 2603, 2604, 2605, 2606, 2607, 2608, 2609, 2610, 2611, 2612, 2613, 2614, 2615, 2616, 2617, 2618, 2619, 2620, 2621, 2622, 2623, 2624, 2625, 2626, 2627, 2628, 2629, 2630, 2631, 2632, 2633, 2634, 2635, 2636, 2637, 2638, 2639, 2640, 2641, 2642, 2643, 2644, 2645, 2646, 2647, 2648, 2649, 2650, 2651, 2652, 2653, 2654, 2655, 2656, 2657, 2658, 2659, 2660, 2661, 2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671, 2672, 2673, 2674, 2675, 2676, 2677, 2678, 2679, 2680, 2681, 2682, 2683, 2684, 2685, 2686, 2687, 2688, 2689, 2690, 2691, 2692, 2693, 2694, 2695, 2696, 2697, 2698, 2699, 2700, 2701, 2702, 2703, 2704, 2705, 2706, 2707, 2708, 2709, 2710, 2711, 2712, 2713, 2714, 2715, 2716, 2717, 2718, 2719, 2720, 2721, 2722, 2723, 2724, 2725, 2726, 2727, 2728, 2729, 2730, 2731, 2732, 2733, 2734, 2735, 2736, 2737, 2738, 2739, 2740, 2741, 2742, 2743, 2744, 2745, 2746, 2747, 2748, 2749, 2750, 2751, 2752, 2753, 2754, 2755, 2756, 2757, 2758, 2759, 2760, 2761, 2762, 2763, 2764, 2765, 2766, 2767, 2768, 2769, 2770, 2771, 2772, 2773, 2774, 2775, 2776, 2777, 2778, 2779, 2780, 2781, 2782, 2783, 2784, 2785, 2786, 2787, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799, 2800, 2801, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, 2811, 2812, 2813, 2814, 2815, 2816, 2817, 2818, 2819, 2820, 2821, 2822, 2823, 2824, 2825, 2826, 2827, 2828, 2829, 2830, 2831, 2832, 2833, 2834, 2835, 2836, 2837, 2838, 2839, 2840, 2841, 2842, 2843, 2844, 2845, 2846, 2847, 2848, 2849, 2850, 2851, 2852, 2853, 2854, 2855, 2856, 2857, 2858, 2859, 2860, 2861, 2862, 2863, 2864, 2865, 2866, 2867, 2868, 2869, 2870, 2871, 2872, 2873, 2874, 2875, 2876, 2877, 2878, 2879, 2880, 2881, 2882, 2883, 2884, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923, 2924, 2925, 2926, 2927, 2928, 2929, 2930, 2931, 2932, 2933, 2934, 2935, 2936, 2937, 2938, 2939, 2940, 2941, 2942, 2943, 2944, 2945, 2946, 2947, 2948, 2949, 2950, 2951, 2952, 2953, 2954, 2955, 2956, 2957, 2958, 2959, 2960, 2961, 2962, 2963, 2964, 2965, 2966, 2967, 2968, 2969, 2970, 2971, 2972, 2973, 2974, 2975, 2976, 2977, 2978, 2979, 2980, 2981, 2982, 2983, 2984, 2985, 2986, 2987, 2988, 2989, 2990, 2991, 2992, 2993, 2994, 2995, 2996, 2997, 2998, 2999, 3000, 3001, 3002, 3003, 3004, 3005, 3006]
desc_ array:
 [0. 0. 0. ... 0. 0. 0.]
text_ array:
 [0.         0.         0.         ... 0.01125915 0.         0.        ]
       desc      text  favorites_per_day  retweets_per_day  tweets_per_day  \
0  0.310651  0.362809           0.025296               0.0        0.115279   

   profile_created_year  tweet_created_year  tweet_location_encoded  \
0              0.130858                 0.0                0.002709   

   user_timezone_encoded    link_R   link_G    link_B     sidebar_R  \
0               0.009664  0.011811  0.02111  0.000145  1.970672e-08   

   sidebar_G  sidebar_B  
0   0.002433   0.000798  
In [51]:
# Convert DataFrame to a long format suitable for Seaborn
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')

# Create bar plot using Seaborn
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')

# Add a title
plt.title('Feature Importances')

# Show the plot
plt.show()
No description has been provided for this image
In [12]:
df_preprocessed_diff = df_preprocessed.copy()
df_preprocessed_diff['abs_difference'] = (abs(y.to_numpy() - y_tot_pred))
In [13]:
df_preprocessed_diff["abs_difference"].describe()
Out[13]:
count    18836.000000
mean         0.138667
std          0.094311
min          0.001569
25%          0.079613
50%          0.094619
75%          0.168580
max          0.600668
Name: abs_difference, dtype: float64
In [14]:
# Find samples where the abs difference os over .2
misclassified_df = df_preprocessed_diff[df_preprocessed_diff["abs_difference"] > 0.2]

#Further filter based on indices in X_train.index
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]

print(non_train_misclassify["abs_difference"].describe())
print(train_misclassify["abs_difference"].describe())
print(misclassified_df["abs_difference"].describe())
count    1160.000000
mean        0.278931
std         0.096486
min         0.200279
25%         0.230187
50%         0.245285
75%         0.262481
max         0.591468
Name: abs_difference, dtype: float64
count    2782.000000
mean        0.288046
std         0.112172
min         0.200085
25%         0.231173
50%         0.246618
75%         0.264480
max         0.600668
Name: abs_difference, dtype: float64
count    3942.000000
mean        0.285364
std         0.107861
min         0.200085
25%         0.230767
50%         0.246089
75%         0.263885
max         0.600668
Name: abs_difference, dtype: float64

Check where the gender confidence in regression is below a threshold¶

Notes¶

Instead of taking the difference take everywhere where the model has much lower gender confidence then the dataset

WHAT WE WANT IS PLACES WHERE REGRESSION CONF IS LOWER THEN NON REGRESSION CONDFIDENCE

In [15]:
#adding the prediction to the dataset
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred

#adding the dataset gender confidence
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset

print(df_preprocessed_diff["gender_confidence_pred"].describe())
print(y.describe())
count    18836.000000
mean         0.900914
std          0.030007
min          0.649431
25%          0.895038
50%          0.909245
75%          0.920387
max          0.947084
Name: gender_confidence_pred, dtype: float64
count    18836.000000
mean         0.900997
std          0.172980
min          0.314000
25%          0.686475
50%          1.000000
75%          1.000000
max          1.000000
Name: gender:confidence, dtype: float64
In [16]:
df_preprocessed_diff["difference"] = y.to_numpy() - y_tot_pred
df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.15) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
Out[16]:
favorites_per_day retweets_per_day tweets_per_day tweet_id profile_created_year tweet_created_year tweet_location_encoded user_timezone_encoded desc_0 desc_1 ... text_1494 text_1495 text_1496 text_1497 text_1498 text_1499 abs_difference gender_confidence_pred gender:confidence difference
100 0.079670 0.000000 0.269384 6.587300e+17 2014 2015 0.363294 0.381344 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.163436 0.836564 1.0 0.163436
102 0.014815 0.000000 6.075446 6.587300e+17 2014 2015 0.000319 0.001646 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.153511 0.846489 1.0 0.153511
323 4.574048 0.000000 47.035261 6.587300e+17 2015 2015 0.000319 0.105755 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.233756 0.766244 1.0 0.233756
394 4.169295 0.000000 56.391167 6.587300e+17 2014 2015 0.000319 0.105755 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.162479 0.837521 1.0 0.162479
544 0.018346 0.000000 0.279098 6.587300e+17 2015 2015 0.000053 0.381344 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.151850 0.848150 1.0 0.151850
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18557 0.011814 0.000000 8.152046 6.587400e+17 2012 2015 0.000053 0.001911 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.169369 0.830631 1.0 0.169369
18593 1.414132 0.000000 1.665921 6.587400e+17 2012 2015 0.363294 0.030049 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.205650 0.794350 1.0 0.205650
18616 1.339410 0.000000 10.553680 6.587400e+17 2012 2015 0.363294 0.015449 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.179468 0.820532 1.0 0.179468
18723 0.083287 0.000184 16.298323 6.587400e+17 2009 2015 0.000053 0.077033 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.197937 0.802063 1.0 0.197937
18825 0.061193 0.000000 0.442944 6.587300e+17 2012 2015 0.363294 0.381344 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.283758 0.716242 1.0 0.283758

667 rows × 3012 columns

In [17]:
misclassified_df = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]

non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]


print(misclassified_df.info())
print(non_train_misclassify.info())
print(train_misclassify.info())
<class 'pandas.core.frame.DataFrame'>
Index: 667 entries, 100 to 18825
Columns: 3012 entries, favorites_per_day to difference
dtypes: float64(3010), int32(2)
memory usage: 15.3 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 126 entries, 2243 to 17766
Columns: 3012 entries, favorites_per_day to difference
dtypes: float64(3010), int32(2)
memory usage: 2.9 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 541 entries, 100 to 18825
Columns: 3012 entries, favorites_per_day to difference
dtypes: float64(3010), int32(2)
memory usage: 12.4 MB
None
In [18]:
# Edit misclassified_df to include 'in X_train'
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)

# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]

# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
             [df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset Gender Confidence')
axes[0].set_ylabel('Predicted Gender Confidence')
axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')

# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
             [df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset Gender Confidence')
axes[1].set_ylabel('Predicted Gender Confidence')
axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()
C:\Users\Shahl\AppData\Local\Temp\ipykernel_25768\3000832750.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
No description has been provided for this image
In [19]:
#Check if there are any places where there the regression is high and data is very low
df_preprocessed_diff[(df_preprocessed_diff["gender_confidence_pred"] > 0.9) & (df_preprocessed_diff["gender:confidence"] < 0.9)]
Out[19]:
favorites_per_day retweets_per_day tweets_per_day tweet_id profile_created_year tweet_created_year tweet_location_encoded user_timezone_encoded desc_0 desc_1 ... text_1494 text_1495 text_1496 text_1497 text_1498 text_1499 abs_difference gender_confidence_pred gender:confidence difference
10 0.110045 0.0 1.104500 6.587300e+17 2011 2015 0.003238 0.030049 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.224478 0.924678 0.7002 -0.224478
12 0.762325 0.0 3.072139 6.587300e+17 2012 2015 0.363294 0.030049 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.258345 0.909245 0.6509 -0.258345
14 3.330464 0.0 4.416308 6.587300e+17 2013 2015 0.000053 0.030049 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.259145 0.909245 0.6501 -0.259145
47 0.373984 0.0 0.464685 6.587300e+17 2013 2015 0.000053 0.000319 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.254373 0.912673 0.6583 -0.254373
55 0.000000 0.0 0.839575 6.587300e+17 2015 2015 0.000053 0.381344 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.236981 0.905381 0.6684 -0.236981
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18802 8.853249 0.0 5.611950 6.587300e+17 2011 2015 0.000106 0.024740 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.155387 0.920387 0.7650 -0.155387
18809 0.000396 0.0 0.000990 6.587300e+17 2010 2015 0.363294 0.381344 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.351268 0.927468 0.5762 -0.351268
18810 0.053395 0.0 3.333982 6.587300e+17 2009 2015 0.000531 0.000637 0.0 0.0 ... 0.0 0.656343 0.0 0.0 0.0 0.0 0.362216 0.916216 0.5540 -0.362216
18824 0.000000 0.0 1.530467 6.587400e+17 2012 2015 0.363294 0.381344 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.094732 0.934432 0.8397 -0.094732
18826 0.012346 0.0 0.761481 6.587400e+17 2013 2015 0.000425 0.077033 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.075179 0.922379 0.8472 -0.075179

2775 rows × 3012 columns

In [20]:
#Plotting the results
plt.figure(figsize=(8, 6))
plt.scatter(y, y_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset Gender Confidence')
plt.ylabel('Predicted Gender Confidence')
plt.title('Predicted gender confidence vs. Dataset gender confidence')
plt.show()
No description has been provided for this image
In [21]:
plt.figure(figsize=(10, 6))
sns.histplot(data=df_preprocessed_diff, x='difference', bins=10, kde=False, color='skyblue')
plt.xlabel('Deviation')
plt.ylabel('Frequency')
plt.title('"Histogram of Prediction Errors: Negative Values Indicate dataset gender confidence > predicted gender confidence"')
Out[21]:
Text(0.5, 1.0, '"Histogram of Prediction Errors: Negative Values Indicate dataset gender confidence > predicted gender confidence"')
No description has been provided for this image

The dataset seems to be strongly biased towards a high gender confidence meaning that the model likly retains this same bias. Meaning that the model has a tendency to predict a high gender confidence and treat low gender confidences as an outlier. -> A classification model is more suited for the task.

However places where the regression model outputs a significantly lower gender confidence then the dataset (especially when the dataset has a gender confidence of 1) are of particulare interest.

ALSO; worth noting that we only have a large differnce for when the dataset gender conf = 1

FOR PLOTS:

  1. FEATURE IMPORTANCE AND SHAPLEY VALUES
  2. SOME BOOSTED TREE PLOTS? PLOT THE RESULTS AND SOME DIST OF THE SUSPICIOUS COLUMNS?

Redo without including text / desc features¶

In [22]:
# Identify columns to drop
columns_to_drop = [col for col in df_preprocessed.columns if col.startswith(('desc_', 'text_'))]

# Drop the identified columns
df_preprocessed_non_text = df_preprocessed.drop(columns=columns_to_drop)

# Output the result
print(df_preprocessed_non_text)
       favorites_per_day  retweets_per_day  tweets_per_day      tweet_id  \
0               0.000000          0.000000       28.163452  6.587300e+17   
1               0.015564          0.000000        1.710002  6.587300e+17   
2               2.148520          0.000279        1.568118  6.587300e+17   
3               0.036220          0.000000        0.303568  6.587300e+17   
4               9.802469          0.000000        8.264250  6.587300e+17   
...                  ...               ...             ...           ...   
18831           0.090663          0.000000        0.235065  6.587400e+17   
18832           0.569067          0.000000        3.062274  6.587300e+17   
18833           0.011369          0.000000        6.007049  6.587400e+17   
18834          16.340642          0.000000       12.940919  6.587300e+17   
18835           0.878971          0.000000        0.767130  6.587400e+17   

       profile_created_year  tweet_created_year  tweet_location_encoded  \
0                      2013                2015                0.000053   
1                      2012                2015                0.363294   
2                      2014                2015                0.000053   
3                      2009                2015                0.000159   
4                      2014                2015                0.363294   
...                     ...                 ...                     ...   
18831                  2015                2015                0.000106   
18832                  2012                2015                0.000531   
18833                  2012                2015                0.000106   
18834                  2012                2015                0.000106   
18835                  2014                2015                0.363294   

       user_timezone_encoded  
0                   0.001699  
1                   0.127309  
2                   0.002071  
3                   0.105755  
4                   0.381344  
...                      ...  
18831               0.381344  
18832               0.381344  
18833               0.381344  
18834               0.381344  
18835               0.381344  

[18836 rows x 8 columns]
In [23]:
boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.7, random_state=42)

# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
Out[23]:
GradientBoostingRegressor(n_estimators=50, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor(n_estimators=50, random_state=42)
In [24]:
from sklearn.metrics import mean_squared_error

# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)

# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test_non_text, y_pred)
print(f"Mean Squared Error: {mse}")

y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)
mse = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.02902462759816569
Mean Squared Error: 0.028510429352024164
In [25]:
# Get feature importances from the model
feature_importances = boosted_reg_non_text.feature_importances_

# Get column names from X_train
column_names = X_train.columns

# Create a DataFrame with feature importances and corresponding column names
feature_importance_df = pd.DataFrame({
    'Feature': column_names,
    'Importance in percentage': feature_importances
})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)

# Output the DataFrame
print(feature_importance_df)
                  Feature  Importance in percentage
2          tweets_per_day                  0.355385
0       favorites_per_day                  0.237008
4    profile_created_year                  0.221512
7   user_timezone_encoded                  0.120650
6  tweet_location_encoded                  0.046550
3                tweet_id                  0.012980
1        retweets_per_day                  0.005914
5      tweet_created_year                  0.000000
In [26]:
# Convert DataFrame to a long format suitable for Seaborn

# Create bar plot using Seaborn
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')

# Add a title
plt.title('Feature Importances')

# Show the plot
plt.show()
No description has been provided for this image
In [27]:
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred

#adding the dataset gender confidence
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset

print(df_preprocessed_non_text["gender_confidence_pred"].describe())
count    18836.000000
mean         0.901057
std          0.036071
min          0.531954
25%          0.884174
50%          0.907729
75%          0.927289
max          0.983469
Name: gender_confidence_pred, dtype: float64
In [28]:
#Inspecting coulumns that could be suspicous
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred

misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]

non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]


print(misclassified_df.info())
print(non_train_misclassify.info())
print(train_misclassify.info())
<class 'pandas.core.frame.DataFrame'>
Index: 1059 entries, 11 to 18738
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   favorites_per_day       1059 non-null   float64
 1   retweets_per_day        1059 non-null   float64
 2   tweets_per_day          1059 non-null   float64
 3   tweet_id                1059 non-null   float64
 4   profile_created_year    1059 non-null   int32  
 5   tweet_created_year      1059 non-null   int32  
 6   tweet_location_encoded  1059 non-null   float64
 7   user_timezone_encoded   1059 non-null   float64
 8   gender_confidence_pred  1059 non-null   float64
 9   gender:confidence       1059 non-null   float64
 10  difference              1059 non-null   float64
dtypes: float64(9), int32(2)
memory usage: 91.0 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 256 entries, 11 to 18330
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   favorites_per_day       256 non-null    float64
 1   retweets_per_day        256 non-null    float64
 2   tweets_per_day          256 non-null    float64
 3   tweet_id                256 non-null    float64
 4   profile_created_year    256 non-null    int32  
 5   tweet_created_year      256 non-null    int32  
 6   tweet_location_encoded  256 non-null    float64
 7   user_timezone_encoded   256 non-null    float64
 8   gender_confidence_pred  256 non-null    float64
 9   gender:confidence       256 non-null    float64
 10  difference              256 non-null    float64
dtypes: float64(9), int32(2)
memory usage: 22.0 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 803 entries, 15 to 18738
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   favorites_per_day       803 non-null    float64
 1   retweets_per_day        803 non-null    float64
 2   tweets_per_day          803 non-null    float64
 3   tweet_id                803 non-null    float64
 4   profile_created_year    803 non-null    int32  
 5   tweet_created_year      803 non-null    int32  
 6   tweet_location_encoded  803 non-null    float64
 7   user_timezone_encoded   803 non-null    float64
 8   gender_confidence_pred  803 non-null    float64
 9   gender:confidence       803 non-null    float64
 10  difference              803 non-null    float64
dtypes: float64(9), int32(2)
memory usage: 69.0 KB
None
In [29]:
# Edit misclassified_df to include 'in X_train'
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train_non_text.index)

# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]

# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
             [df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset Gender Confidence')
axes[0].set_ylabel('Predicted Gender Confidence')
axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')

# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
             [df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset Gender Confidence')
axes[1].set_ylabel('Predicted Gender Confidence')
axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()
C:\Users\Shahl\AppData\Local\Temp\ipykernel_25768\3000832750.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
No description has been provided for this image
In [30]:
#Plotting the results
plt.figure(figsize=(8, 6))
plt.scatter(y, y_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset Gender Confidence')
plt.ylabel('Predicted Gender Confidence')
plt.title('Predicted gender confidence vs. Dataset gender confidence')
plt.show()
No description has been provided for this image
In [31]:
plt.figure(figsize=(10, 6))
sns.histplot(data=df_preprocessed_non_text, x='difference', bins=10, kde=True, color='skyblue')
plt.xlabel('Deviation')
plt.ylabel('Frequency')
plt.title('"Histogram of Prediction Errors: Negative Values Indicate dataset gender confidence > predicted gender confidence"')
Out[31]:
Text(0.5, 1.0, '"Histogram of Prediction Errors: Negative Values Indicate dataset gender confidence > predicted gender confidence"')
No description has been provided for this image

Using linear regression¶

A simple linear regression technique is now employed

Regularization is employed (even though we have a very simple low bias model) in order to ensure we do not overfit

In [8]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed)

model = sm.OLS(y_train, X_train_lin)  # Ordinary least squares (unregularized)
results = model.fit()

#print(results.summary())
In [12]:
#run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)

# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_lin_pred)
print(f"Mean Squared Error: {mse}")

mse = mean_squared_error(y, y_lin_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.029905412647208366
Mean Squared Error: 0.02992069210622919
In [18]:
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred


misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]

non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]


print(misclassified_df.info())
print(non_train_misclassify.info())
print(train_misclassify.info())
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Columns: 3011 entries, favorites_per_day to gender_confidence_pred
dtypes: float64(3009), int32(2)
memory usage: 0.0 bytes
None
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Columns: 3011 entries, favorites_per_day to gender_confidence_pred
dtypes: float64(3009), int32(2)
memory usage: 0.0 bytes
None
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Columns: 3011 entries, favorites_per_day to gender_confidence_pred
dtypes: float64(3009), int32(2)
memory usage: 0.0 bytes
None
In [20]:
# Edit misclassified_df to include 'in X_train'
misclassified_df["in X_train"] = misclassified_df.index.isin(X_train_lin.index)

# Create subsets for the two plots
df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]

# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Points in X_train
sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
             [df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[0].set_xlabel('Dataset Gender Confidence')
axes[0].set_ylabel('Predicted Gender Confidence')
axes[0].set_title(f'In X_train\nTotal Samples: {len(df_in_X_train)}')

# Plot 2: Points not in X_train
sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
             [df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
axes[1].set_xlabel('Dataset Gender Confidence')
axes[1].set_ylabel('Predicted Gender Confidence')
axes[1].set_title(f'Not in X_train\nTotal Samples: {len(df_not_in_X_train)}')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image
In [35]:
#Plotting the results
plt.figure(figsize=(8, 6))
plt.scatter(y.to_numpy(), y_lin_tot_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Dataset Gender Confidence')
plt.ylabel('Predicted Gender Confidence')
plt.title('Predicted gender confidence vs. Dataset gender confidence')
plt.show()
No description has been provided for this image

Without text features¶

In [ ]:
X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed)

model = sm.OLS(y_train, X_train_lin)  # Ordinary least squares (unregularized)
results = model.fit()

#run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)

# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_lin_pred)
print(f"Mean Squared Error: {mse}")

mse = mean_squared_error(y, y_lin_tot_pred)
print(f"Mean Squared Error: {mse}")