In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import zipfile as zp
import os
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
In [ ]:
# Read the data
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')

# Display the data
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  profile_yn_gold        50 non-null     object 
 16  profileimage           20050 non-null  object 
 17  retweet_count          20050 non-null  int64  
 18  sidebar_color          20050 non-null  object 
 19  text                   20050 non-null  object 
 20  tweet_coord            159 non-null    object 
 21  tweet_count            20050 non-null  int64  
 22  tweet_created          20050 non-null  object 
 23  tweet_id               20050 non-null  float64
 24  tweet_location         12565 non-null  object 
 25  user_timezone          12252 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(17)
memory usage: 3.8+ MB
Out[ ]:
_unit_id _golden _unit_state _trusted_judgments _last_judgment_at gender gender:confidence profile_yn profile_yn:confidence created ... profileimage retweet_count sidebar_color text tweet_coord tweet_count tweet_created tweet_id tweet_location user_timezone
0 815719226 False finalized 3 10/26/15 23:24 male 1.0000 yes 1.0 12/5/13 1:48 ... https://pbs.twimg.com/profile_images/414342229... 0 FFFFFF Robbie E Responds To Critics After Win Against... NaN 110964 10/26/15 12:40 6.587300e+17 main; @Kan1shk3 Chennai
1 815719227 False finalized 3 10/26/15 23:30 male 1.0000 yes 1.0 10/1/12 13:51 ... https://pbs.twimg.com/profile_images/539604221... 0 C0DEED ‰ÛÏIt felt like they were my friends and I was... NaN 7471 10/26/15 12:40 6.587300e+17 NaN Eastern Time (US & Canada)
2 815719228 False finalized 3 10/26/15 23:33 male 0.6625 yes 1.0 11/28/14 11:30 ... https://pbs.twimg.com/profile_images/657330418... 1 C0DEED i absolutely adore when louis starts the songs... NaN 5617 10/26/15 12:40 6.587300e+17 clcncl Belgrade
3 815719229 False finalized 3 10/26/15 23:10 male 1.0000 yes 1.0 6/11/09 22:39 ... https://pbs.twimg.com/profile_images/259703936... 0 C0DEED Hi @JordanSpieth - Looking at the url - do you... NaN 1693 10/26/15 12:40 6.587300e+17 Palo Alto, CA Pacific Time (US & Canada)
4 815719230 False finalized 3 10/27/15 1:15 female 1.0000 yes 1.0 4/16/14 13:23 ... https://pbs.twimg.com/profile_images/564094871... 0 0 Watching Neighbours on Sky+ catching up with t... NaN 31462 10/26/15 12:40 6.587300e+17 NaN NaN

5 rows × 26 columns

Handling Missing Data¶

In [ ]:
# Dropping columns with more than 90% missing values
df_cleaned = df.drop(columns=['gender_gold', 'profile_yn_gold', 'tweet_coord'])

# Filling missing values in 'description' and 'tweet_location' with the placeholder 'Unknown'
df_cleaned['description'].fillna('Unknown', inplace=True)
df_cleaned['tweet_location'].fillna('Unknown', inplace=True)

# Dropping rows where 'gender' is missing (as it's a small percentage of rows with missing data)
df_cleaned = df_cleaned.dropna(subset=['gender'])

# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])

# With the missing data handled, we can proceed with further analysis
df_cleaned.info()  # Display the structure of the cleaned dataset
df_cleaned.head()  # Display the first few rows of the cleaned dataset
<class 'pandas.core.frame.DataFrame'>
Index: 19953 entries, 0 to 20049
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               19953 non-null  int64  
 1   _golden                19953 non-null  bool   
 2   _unit_state            19953 non-null  object 
 3   _trusted_judgments     19953 non-null  int64  
 4   _last_judgment_at      19903 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      19953 non-null  float64
 7   profile_yn:confidence  19953 non-null  float64
 8   created                19953 non-null  object 
 9   description            19953 non-null  object 
 10  fav_number             19953 non-null  int64  
 11  link_color             19953 non-null  object 
 12  name                   19953 non-null  object 
 13  profileimage           19953 non-null  object 
 14  retweet_count          19953 non-null  int64  
 15  sidebar_color          19953 non-null  object 
 16  text                   19953 non-null  object 
 17  tweet_count            19953 non-null  int64  
 18  tweet_created          19953 non-null  object 
 19  tweet_id               19953 non-null  float64
 20  tweet_location         19953 non-null  object 
 21  user_timezone          12185 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(13)
memory usage: 3.4+ MB
Out[ ]:
_unit_id _golden _unit_state _trusted_judgments _last_judgment_at gender gender:confidence profile_yn:confidence created description ... name profileimage retweet_count sidebar_color text tweet_count tweet_created tweet_id tweet_location user_timezone
0 815719226 False finalized 3 10/26/15 23:24 male 1.0000 1.0 12/5/13 1:48 i sing my own rhythm. ... sheezy0 https://pbs.twimg.com/profile_images/414342229... 0 FFFFFF Robbie E Responds To Critics After Win Against... 110964 10/26/15 12:40 6.587300e+17 main; @Kan1shk3 Chennai
1 815719227 False finalized 3 10/26/15 23:30 male 1.0000 1.0 10/1/12 13:51 I'm the author of novels filled with family dr... ... DavdBurnett https://pbs.twimg.com/profile_images/539604221... 0 C0DEED ‰ÛÏIt felt like they were my friends and I was... 7471 10/26/15 12:40 6.587300e+17 Unknown Eastern Time (US & Canada)
2 815719228 False finalized 3 10/26/15 23:33 male 0.6625 1.0 11/28/14 11:30 louis whining and squealing and all ... lwtprettylaugh https://pbs.twimg.com/profile_images/657330418... 1 C0DEED i absolutely adore when louis starts the songs... 5617 10/26/15 12:40 6.587300e+17 clcncl Belgrade
3 815719229 False finalized 3 10/26/15 23:10 male 1.0000 1.0 6/11/09 22:39 Mobile guy. 49ers, Shazam, Google, Kleiner Pe... ... douggarland https://pbs.twimg.com/profile_images/259703936... 0 C0DEED Hi @JordanSpieth - Looking at the url - do you... 1693 10/26/15 12:40 6.587300e+17 Palo Alto, CA Pacific Time (US & Canada)
4 815719230 False finalized 3 10/27/15 1:15 female 1.0000 1.0 4/16/14 13:23 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... ... WilfordGemma https://pbs.twimg.com/profile_images/564094871... 0 0 Watching Neighbours on Sky+ catching up with t... 31462 10/26/15 12:40 6.587300e+17 Unknown NaN

5 rows × 22 columns

Exploratory Data Analysis (EDA)¶

In [ ]:
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()
[Figure: Distribution of Gender]
In [ ]:
# Distribution of tweet count
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['tweet_count'], kde=True, bins=30)
plt.title('Distribution of Tweet Count')
plt.xlabel('Tweet Count')
plt.ylabel('Count')
plt.show()
[Figure: Distribution of Tweet Count]
In [ ]:
# Distribution of retweet count
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['retweet_count'], kde=True, bins=30)
plt.title('Distribution of Retweet Count')
plt.xlabel('Retweet Count')
plt.ylabel('Count')
plt.show()
[Figure: Distribution of Retweet Count]
In [ ]:
# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
[Figure: Correlation Matrix of Numerical Features]
In [ ]:
# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
<ipython-input-9-329074fae944>:2: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
<ipython-input-9-329074fae944>:3: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
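As the warning suggests, parsing is faster and more predictable when an explicit format is given. A minimal sketch, assuming every timestamp follows the month/day/two-digit-year pattern visible in the sample rows (rows that deviate would need errors='coerce'):

# Hypothetical variant: parse with an explicit format instead of falling back to dateutil
date_fmt = '%m/%d/%y %H:%M'
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created'], format=date_fmt).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created'], format=date_fmt).dt.year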
In [ ]:
# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('Count')
plt.show()
[Figure: Distribution of Profile Creation Years]
In [ ]:
# Exploring 'link_color' and 'sidebar_color' features
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:10].index)
plt.title('Top 10 Most Common Profile Link Colors')
plt.ylabel('Link Color')
plt.xlabel('Count')
plt.show()

plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:10].index)
plt.title('Top 10 Most Common Sidebar Colors')
plt.ylabel('Sidebar Color')
plt.xlabel('Count')
plt.show()
[Figure: Top 10 Most Common Profile Link Colors]
[Figure: Top 10 Most Common Sidebar Colors]

Preprocessing¶

In [ ]:
# Drop profiles whose gender label is 'unknown'; .copy() avoids chained-assignment warnings below
df_cleaned = df_cleaned[df_cleaned['gender'] != 'unknown'].copy()

# Scaling numerical features
scaler = StandardScaler()
df_cleaned[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']])

# Binarize the target for human/non-human classification: male/female => 0 (human), brand => 1 (brand)
df_cleaned.loc[df_cleaned['gender'] == 'male', 'gender'] = 0
df_cleaned.loc[df_cleaned['gender'] == 'female', 'gender'] = 0
df_cleaned.loc[df_cleaned['gender'] == 'brand', 'gender'] = 1

# Check the first few rows of the preprocessed data
df_cleaned.head()
Out[ ]:
_unit_id _golden _unit_state _trusted_judgments _last_judgment_at gender gender:confidence profile_yn:confidence created description ... retweet_count sidebar_color text tweet_count tweet_created tweet_id tweet_location user_timezone profile_created_year tweet_created_year
0 815719226 False finalized 3 10/26/15 23:24 0 1.0000 1.0 12/5/13 1:48 i sing my own rhythm. ... -0.030196 FFFFFF Robbie E Responds To Critics After Win Against... 0.602953 10/26/15 12:40 6.587300e+17 main; @Kan1shk3 Chennai 2013 2015
1 815719227 False finalized 3 10/26/15 23:30 0 1.0000 1.0 10/1/12 13:51 I'm the author of novels filled with family dr... ... -0.030196 C0DEED ‰ÛÏIt felt like they were my friends and I was... -0.265805 10/26/15 12:40 6.587300e+17 Unknown Eastern Time (US & Canada) 2012 2015
2 815719228 False finalized 3 10/26/15 23:33 0 0.6625 1.0 11/28/14 11:30 louis whining and squealing and all ... 0.335804 C0DEED i absolutely adore when louis starts the songs... -0.281368 10/26/15 12:40 6.587300e+17 clcncl Belgrade 2014 2015
3 815719229 False finalized 3 10/26/15 23:10 0 1.0000 1.0 6/11/09 22:39 Mobile guy. 49ers, Shazam, Google, Kleiner Pe... ... -0.030196 C0DEED Hi @JordanSpieth - Looking at the url - do you... -0.314308 10/26/15 12:40 6.587300e+17 Palo Alto, CA Pacific Time (US & Canada) 2009 2015
4 815719230 False finalized 3 10/27/15 1:15 0 1.0000 1.0 4/16/14 13:23 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... ... -0.030196 0 Watching Neighbours on Sky+ catching up with t... -0.064416 10/26/15 12:40 6.587300e+17 Unknown NaN 2014 2015

5 rows × 24 columns

In [ ]:
import pandas as pd

# 'gender' was already binarized above (male/female => 0, brand => 1);
# the replace below is a safeguard in case any string labels remain
df_cleaned['gender'] = df_cleaned['gender'].replace({'male': 0, 'female': 0, 'brand': 1})

# Select numerical columns
numerical_columns = ['gender:confidence', 'profile_yn:confidence', 'fav_number', 'retweet_count', 'tweet_count', 'tweet_id']

# Calculate the Pearson correlation with the target variable 'gender'
correlations = df_cleaned[numerical_columns].corrwith(df_cleaned['gender'])

print("Correlations with target (gender):")
print(correlations)
Correlations with target (gender):
gender:confidence       -0.129078
profile_yn:confidence   -0.007602
fav_number              -0.125455
retweet_count            0.008353
tweet_count              0.119731
tweet_id                -0.122541
dtype: float64
In [ ]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# List of categorical columns
categorical_columns = ['_unit_state', 'link_color', 'sidebar_color', 'tweet_location', 'user_timezone']  # Add other categorical columns if necessary

# Encode the categorical variables
label_encoder = LabelEncoder()
df_encoded = df_cleaned.copy()
for col in categorical_columns:
    df_encoded[col] = label_encoder.fit_transform(df_encoded[col].astype(str))

# Apply chi-squared test
X = df_encoded[categorical_columns]
y = df_encoded['gender']

chi_scores = chi2(X, y)

print("Chi-square scores:", chi_scores)
Chi-square scores: (array([2.91137794e-01, 2.91763576e+05, 4.20948757e+02, 4.20571655e+04,
       5.80444260e+03]), array([5.89492326e-01, 0.00000000e+00, 1.51683635e-93, 0.00000000e+00,
       0.00000000e+00]))
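The raw tuple returned by chi2 is hard to read; below is a small sketch that pairs each categorical feature with its chi-square statistic and p-value, assuming the categorical_columns and chi_scores variables from the cell above:

# chi2 returns (statistics, p-values) in the same order as the input columns
chi_results = pd.DataFrame({
    'feature': categorical_columns,
    'chi2_score': chi_scores[0],
    'p_value': chi_scores[1],
}).sort_values('chi2_score', ascending=False)
print(chi_results)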
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compute the correlation matrix for numerical columns including 'gender'
corr_matrix = df_cleaned[numerical_columns + ['gender']].corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numerical Features and Target")
plt.show()
[Figure: Correlation Heatmap of Numerical Features and Target]
In [ ]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Keep only the columns relevant to the classification task
col = ['gender', 'gender:confidence', 'description', 'fav_number', 'link_color',
       'retweet_count', 'sidebar_color', 'text', 'tweet_count', 'tweet_id',
       'tweet_location', 'user_timezone']

df_preprocessed = df_cleaned[col].copy()

# Remove any rows where gender is still 'unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']

# Scaling numerical features
scaler = StandardScaler()
num_cols = ['tweet_count', 'retweet_count', 'fav_number', 'gender:confidence']
df_preprocessed[num_cols] = scaler.fit_transform(df_preprocessed[num_cols])

# List of categorical columns to one-hot encode
categorical_columns = ['link_color', 'sidebar_color', 'tweet_location', 'user_timezone']

# Initialize OneHotEncoder (drop='first' avoids the dummy-variable trap by removing the first category)
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Apply OneHotEncoder using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_columns)
    ],
    remainder='passthrough'  # keeps the rest of the columns (numerical and text features) as they are
)

# Fit and transform the data
transformed = preprocessor.fit_transform(df_preprocessed)

# Convert the transformed array back to a DataFrame; take the feature names from the
# fitted ColumnTransformer (the standalone encoder object is never fitted itself)
feature_names = preprocessor.get_feature_names_out()
df_preprocessed = pd.DataFrame(transformed, columns=feature_names)

# The gender target was already binarized above (male/female => 0, brand => 1) and is
# passed through by the ColumnTransformer as 'remainder__gender'

# Check the first few rows of the preprocessed data
df_preprocessed.head()
In [ ]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Keep a reduced set of relevant columns (the high-cardinality categorical columns are dropped here)

col = ['gender', 'gender:confidence', 'description', 'fav_number','link_color',
       'retweet_count', 'sidebar_color', 'text', 'tweet_count','tweet_id'
       ]

df_preprocessed = df_cleaned[col].copy()

# Remove any rows where gender is still 'unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']

# Scaling numerical features
scaler = StandardScaler()
df_preprocessed[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_preprocessed[['tweet_count', 'retweet_count', 'fav_number']])

# Binarize the target (male/female => 0, brand => 1), matching the original string labels in df
df_preprocessed.loc[df['gender'] == 'male', 'gender'] = 0
df_preprocessed.loc[df['gender'] == 'female', 'gender'] = 0
df_preprocessed.loc[df['gender'] == 'brand', 'gender'] = 1

# Check the first few rows of the preprocessed data
df_preprocessed.head()
Out[ ]:
gender gender:confidence description fav_number link_color retweet_count sidebar_color text tweet_count tweet_id
0 0 1.0000 i sing my own rhythm. -0.353977 08C2C2 -0.030196 FFFFFF Robbie E Responds To Critics After Win Against... 0.602953 6.587300e+17
1 0 1.0000 I'm the author of novels filled with family dr... -0.348524 0084B4 -0.030196 C0DEED ‰ÛÏIt felt like they were my friends and I was... -0.265805 6.587300e+17
2 0 0.6625 louis whining and squealing and all 0.263273 ABB8C2 0.335804 C0DEED i absolutely adore when louis starts the songs... -0.281368 6.587300e+17
3 0 1.0000 Mobile guy. 49ers, Shazam, Google, Kleiner Pe... -0.337776 0084B4 -0.030196 C0DEED Hi @JordanSpieth - Looking at the url - do you... -0.314308 6.587300e+17
4 0 1.0000 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... 2.639077 3B94D9 -0.030196 0 Watching Neighbours on Sky+ catching up with t... -0.064416 6.587300e+17
In [ ]:
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()

# The class imbalance can be handled either with the classifier's class_weight parameter or with resampling techniques.
[Figure: Distribution of Gender (after preprocessing)]
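A hedged sketch of the two options mentioned in the comment above: class weighting with any classifier that supports it, or oversampling the minority class. The RandomOverSampler import assumes the separate imbalanced-learn package is installed, and X_train/y_train are placeholders for a later split:

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

# Option 1: weight classes inversely to their frequency inside the model
clf = LogisticRegression(class_weight='balanced', max_iter=1000)

# Option 2: resample the training data so both classes are equally represented
ros = RandomOverSampler(random_state=42)
# X_train_balanced, y_train_balanced = ros.fit_resample(X_train, y_train)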

NLP Processing¶

In [ ]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Out[ ]:
True
In [ ]:
df_status = df_preprocessed.copy()
df_status = pd.concat([df_status['gender'], df_status['description']], axis=1)

df_status
Out[ ]:
gender description
0 0 i sing my own rhythm.
1 0 I'm the author of novels filled with family dr...
2 0 louis whining and squealing and all
3 0 Mobile guy. 49ers, Shazam, Google, Kleiner Pe...
4 0 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...
... ... ...
20045 0 (rp)
20046 0 Whatever you like, it's not a problem at all. ...
20047 0 #TeamBarcelona ..You look lost so you should f...
20048 0 Anti-statist; I homeschool my kids. Aspiring t...
20049 0 Teamwork makes the dream work.

18836 rows × 2 columns

In [ ]:
# Keep only letters and lowercase everything: "Run" and "run" would otherwise be treated as different tokens
import re

description = []

for x in df_status['description']:
    desc = re.sub("[^a-zA-Z]"," ",x)
    desc = desc.lower()
    description.append(desc)

df_status['description'] = description
df_status
Out[ ]:
gender description
0 0 i sing my own rhythm
1 0 i m the author of novels filled with family dr...
2 0 louis whining and squealing and all
3 0 mobile guy ers shazam google kleiner pe...
4 0 ricky wilson the best frontman kaiser chiefs t...
... ... ...
20045 0 rp
20046 0 whatever you like it s not a problem at all ...
20047 0 teambarcelona you look lost so you should f...
20048 0 anti statist i homeschool my kids aspiring t...
20049 0 teamwork makes the dream work

18836 rows × 2 columns

In [ ]:
# Tokenize each description and remove English stopwords (e.g. 'i', 'a', 'the', 'an', 'and', 'me')
def remove_stopwords(text):
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words


df_status['tokenized'] = df_status['description'].apply(remove_stopwords)
df_status
Out[ ]:
gender description tokenized
0 0 i sing my own rhythm [sing, rhythm]
1 0 i m the author of novels filled with family dr... [author, novels, filled, family, drama, romance]
2 0 louis whining and squealing and all [louis, whining, squealing]
3 0 mobile guy ers shazam google kleiner pe... [mobile, guy, ers, shazam, google, kleiner, pe...
4 0 ricky wilson the best frontman kaiser chiefs t... [ricky, wilson, best, frontman, kaiser, chiefs...
... ... ... ...
20045 0 rp [rp]
20046 0 whatever you like it s not a problem at all ... [whatever, like, problem, chargernation, forev...
20047 0 teambarcelona you look lost so you should f... [teambarcelona, look, lost, follow, follow, he...
20048 0 anti statist i homeschool my kids aspiring t... [anti, statist, homeschool, kids, aspiring, th...
20049 0 teamwork makes the dream work [teamwork, makes, dream, work]

18836 rows × 3 columns

In [ ]:
# Convert the tokenized descriptions into bag-of-words count features
# CountVectorizer expects an iterable of strings, so the token lists are joined back into sentences

from sklearn.feature_extraction.text import CountVectorizer

max_features = 1500
corpus = [' '.join(words) for words in df_status['tokenized']]

vectorizer = CountVectorizer(max_features = max_features, stop_words = "english")
X = vectorizer.fit_transform(corpus).toarray()

# let's see X in dataframe
df_ = pd.DataFrame(X, columns=vectorizer.get_feature_names_out(), index=df_status.index)

df_
Out[ ]:
academy account achieve act action active activist actor actress actually ... yes yo yoga york young youth youtube youtuber yrs zayn
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20045 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20046 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20047 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20048 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20049 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

18836 rows × 1500 columns

In [ ]:
y = df_preprocessed['gender'].values # Create an array
y # gender ==> our target in the model
Out[ ]:
array([0, 0, 0, ..., 0, 0, 0])
In [ ]:
# Now drop the processed columns ('description', 'text', and categorical) from the original dataset
df_preprocessed = df_preprocessed.drop(columns=['description', 'text', 'link_color', 'sidebar_color'])

#WITH TARGET INFORMATION REMOVED
df_preprocessed_X = df_preprocessed.drop(columns=['gender', 'gender:confidence'])

# Combine the text features with the other preprocessed features
X_combined = np.hstack((df_preprocessed_X.values, X))
In [ ]:
df_preprocessed.head()
Out[ ]:
gender gender:confidence fav_number retweet_count tweet_count tweet_id
0 0 1.0000 -0.353977 -0.030196 0.602953 6.587300e+17
1 0 1.0000 -0.348524 -0.030196 -0.265805 6.587300e+17
2 0 0.6625 0.263273 0.335804 -0.281368 6.587300e+17
3 0 1.0000 -0.337776 -0.030196 -0.314308 6.587300e+17
4 0 1.0000 2.639077 -0.030196 -0.064416 6.587300e+17

Regression Tasks¶

The chosen regression model is a gradient-boosted decision tree regressor (GradientBoostingRegressor), used here to predict gender:confidence.

In [ ]:
y = df_preprocessed["gender:confidence"]
In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

boosted_reg = GradientBoostingRegressor(n_estimators=10, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model
boosted_reg.fit(X_train, y_train)
Out[ ]:
GradientBoostingRegressor(n_estimators=10, random_state=42)
In [ ]:
from sklearn.metrics import mean_squared_error

# Make predictions
y_pred = boosted_reg.predict(X_test)

# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.027876325391586427
In [ ]:
y_tot_pred = boosted_reg.predict(X_combined)
mse = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.02896527615100315
In [ ]:
boosted_reg.feature_importances_
Out[ ]:
array([0.31033774, 0.        , 0.09989529, ..., 0.        , 0.        ,
       0.        ])

Find the rows with the largest difference in gender confidence

In [ ]:
 
Out[ ]:
gender:confidence
0 1.0000
1 1.0000
2 0.6625
3 1.0000
4 1.0000
... ...
20045 1.0000
20046 1.0000
20047 1.0000
20048 0.8489
20049 1.0000

18836 rows × 1 columns
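A minimal sketch of one way to surface those rows, assuming y (the labelled gender:confidence values) and y_tot_pred (the model's predictions over the full dataset) from the regression cells above:

# Pair actual and predicted confidence, then rank by absolute difference
conf_diff = pd.DataFrame({
    'actual_confidence': y,
    'predicted_confidence': y_tot_pred,
})
conf_diff['abs_difference'] = (conf_diff['actual_confidence'] - conf_diff['predicted_confidence']).abs()
conf_diff.nlargest(10, 'abs_difference')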


Example Usage¶

In [ ]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Classification target: the binarized gender labels (0 = human, 1 = brand)
y = df_preprocessed['gender'].astype(int).values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Convert data into DMatrix format, which is the format that XGBoost expects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define XGBoost parameters
params = {
    'objective': 'multi:softmax',  # Specify the objective for multi-class classification
    'num_class': len(np.unique(y_train)),  # Number of classes
    'max_depth': 3,  # Maximum tree depth
    'eta': 0.1,  # Learning rate
    'subsample': 0.8,  # Fraction of samples used for training each tree
    'colsample_bytree': 0.8,  # Fraction of features used for training each tree
    'eval_metric': 'mlogloss'  # Evaluation metric
}

# Train the XGBoost model
num_round = 100  # Number of boosting rounds
bst = xgb.train(params, dtrain, num_round)

# Make predictions on the test set
y_pred = bst.predict(dtest)
In [ ]:
# Calculate accuracy
accuracy = accuracy_score(y_test.tolist(), y_pred.tolist())

print(f"Accuracy: {accuracy:.2f}")
Accuracy: 1.00
In [ ]:
# Generate the classification report
report = classification_report(y_test.tolist(), y_pred.tolist())

# Print the classification report
print("XGBoost Classification Report:\n", report)
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2585
           1       1.00      1.00      1.00      1183

    accuracy                           1.00      3768
   macro avg       1.00      1.00      1.00      3768
weighted avg       1.00      1.00      1.00      3768

In [ ]:
from sklearn.naive_bayes import GaussianNB

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Ensure the labels have integer dtype
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)

# Create a Naive Bayes classifier (Gaussian Naive Bayes)
nb_classifier = GaussianNB()

# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = nb_classifier.predict(X_test)
In [ ]:
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Naive Bayes Classification Report:\n", report)
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.69      1.00      0.81      2585
           1       0.00      0.00      0.00      1183

    accuracy                           0.69      3768
   macro avg       0.34      0.50      0.41      3768
weighted avg       0.47      0.69      0.56      3768

/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
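As the warnings note, precision for class 1 is undefined because the classifier never predicts it; passing zero_division makes the convention explicit and silences the warning (a sketch using the same variables as above):

report = classification_report(y_test, y_pred, zero_division=0)
print("Naive Bayes Classification Report:\n", report)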
In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
data = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')  # Replace with actual file path

# Step 1: Data Cleaning
# Drop identifiers, timestamps, free-text, and mostly-empty columns that cannot be fed to Ridge directly
data_cleaned = data.drop(['_unit_id', '_last_judgment_at', 'profileimage', 'tweet_id', 'tweet_created',
                          'created', 'name', 'description', 'text', 'link_color', 'sidebar_color',
                          'gender_gold', 'profile_yn_gold', 'tweet_coord', 'user_timezone'], axis=1)

# Fill missing values (numerical columns with the mean, categorical columns with the mode or a placeholder)
data_cleaned['fav_number'].fillna(data_cleaned['fav_number'].mean(), inplace=True)
data_cleaned['gender:confidence'].fillna(data_cleaned['gender:confidence'].mean(), inplace=True)
data_cleaned['gender'].fillna(data_cleaned['gender'].mode()[0], inplace=True)
data_cleaned['tweet_location'].fillna('Unknown', inplace=True)

# Step 2: Handle Categorical Variables
# 'tweet_location' has thousands of distinct values, so reduce it to a known/unknown flag
data_cleaned['tweet_location_known'] = (data_cleaned['tweet_location'] != 'Unknown').astype(int)
data_cleaned = data_cleaned.drop('tweet_location', axis=1)

# Convert the remaining categorical columns to numeric using get_dummies
categorical_columns = ['_unit_state', 'gender', 'profile_yn']
data_cleaned = pd.get_dummies(data_cleaned, columns=categorical_columns, drop_first=True)

# Step 3: Define Target and Features
# Target: 'fav_number', Features: all other columns
X = data_cleaned.drop('fav_number', axis=1)
y = data_cleaned['fav_number']

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Scale the Data (now that all features are numeric)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 6: Fit Ridge Regression Model
ridge_reg = Ridge(alpha=1.0)  # alpha is the regularization strength
ridge_reg.fit(X_train_scaled, y_train)

# Step 7: Predict and Evaluate
y_pred = ridge_reg.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Step 8: Apriori Algorithm (looking for co-occurrence patterns in simple user-behaviour flags)
# Apriori expects boolean indicators, so build them from the raw data
transactions = pd.DataFrame({
    'has_retweets': data['retweet_count'] > 0,
    'profile_exists': data['profile_yn'] == 'yes',
    'location_known': data['tweet_location'].notna(),
})

# Apply the apriori algorithm
frequent_itemsets = apriori(transactions, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

print(frequent_itemsets)
print(rules)