In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import zipfile as zp
import os
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
In [ ]:
# Read the data
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')
# Display the data
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  profile_yn_gold        50 non-null     object 
 16  profileimage           20050 non-null  object 
 17  retweet_count          20050 non-null  int64  
 18  sidebar_color          20050 non-null  object 
 19  text                   20050 non-null  object 
 20  tweet_coord            159 non-null    object 
 21  tweet_count            20050 non-null  int64  
 22  tweet_created          20050 non-null  object 
 23  tweet_id               20050 non-null  float64
 24  tweet_location         12565 non-null  object 
 25  user_timezone          12252 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(17)
memory usage: 3.8+ MB
Out[ ]:
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn | profile_yn:confidence | created | ... | profileimage | retweet_count | sidebar_color | text | tweet_coord | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | male | 1.0000 | yes | 1.0 | 12/5/13 1:48 | ... | https://pbs.twimg.com/profile_images/414342229... | 0 | FFFFFF | Robbie E Responds To Critics After Win Against... | NaN | 110964 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | male | 1.0000 | yes | 1.0 | 10/1/12 13:51 | ... | https://pbs.twimg.com/profile_images/539604221... | 0 | C0DEED | ÛÏIt felt like they were my friends and I was... | NaN | 7471 | 10/26/15 12:40 | 6.587300e+17 | NaN | Eastern Time (US & Canada) |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | male | 0.6625 | yes | 1.0 | 11/28/14 11:30 | ... | https://pbs.twimg.com/profile_images/657330418... | 1 | C0DEED | i absolutely adore when louis starts the songs... | NaN | 5617 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | male | 1.0000 | yes | 1.0 | 6/11/09 22:39 | ... | https://pbs.twimg.com/profile_images/259703936... | 0 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | NaN | 1693 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | female | 1.0000 | yes | 1.0 | 4/16/14 13:23 | ... | https://pbs.twimg.com/profile_images/564094871... | 0 | 0 | Watching Neighbours on Sky+ catching up with t... | NaN | 31462 | 10/26/15 12:40 | 6.587300e+17 | NaN | NaN |
5 rows × 26 columns
Handling Missing Data¶
In [ ]:
# Dropping columns with more than 90% missing values
df_cleaned = df.drop(columns=['gender_gold', 'profile_yn_gold', 'tweet_coord'])
# Fill missing values in 'description' and 'tweet_location' with the placeholder 'Unknown'
df_cleaned['description'] = df_cleaned['description'].fillna('Unknown')
df_cleaned['tweet_location'] = df_cleaned['tweet_location'].fillna('Unknown')
# Drop rows where 'gender' is missing (only a small fraction of the data)
df_cleaned = df_cleaned.dropna(subset=['gender'])
# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])
# With the missing data handled, we can proceed with further analysis
df_cleaned.info() # Display the structure of the cleaned dataset
df_cleaned.head() # Display the first few rows of the cleaned dataset
<class 'pandas.core.frame.DataFrame'>
Index: 19953 entries, 0 to 20049
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               19953 non-null  int64  
 1   _golden                19953 non-null  bool   
 2   _unit_state            19953 non-null  object 
 3   _trusted_judgments     19953 non-null  int64  
 4   _last_judgment_at      19903 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      19953 non-null  float64
 7   profile_yn:confidence  19953 non-null  float64
 8   created                19953 non-null  object 
 9   description            19953 non-null  object 
 10  fav_number             19953 non-null  int64  
 11  link_color             19953 non-null  object 
 12  name                   19953 non-null  object 
 13  profileimage           19953 non-null  object 
 14  retweet_count          19953 non-null  int64  
 15  sidebar_color          19953 non-null  object 
 16  text                   19953 non-null  object 
 17  tweet_count            19953 non-null  int64  
 18  tweet_created          19953 non-null  object 
 19  tweet_id               19953 non-null  float64
 20  tweet_location         19953 non-null  object 
 21  user_timezone          12185 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(13)
memory usage: 3.4+ MB
Out[ ]:
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn:confidence | created | description | ... | name | profileimage | retweet_count | sidebar_color | text | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | male | 1.0000 | 1.0 | 12/5/13 1:48 | i sing my own rhythm. | ... | sheezy0 | https://pbs.twimg.com/profile_images/414342229... | 0 | FFFFFF | Robbie E Responds To Critics After Win Against... | 110964 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | male | 1.0000 | 1.0 | 10/1/12 13:51 | I'm the author of novels filled with family dr... | ... | DavdBurnett | https://pbs.twimg.com/profile_images/539604221... | 0 | C0DEED | ÛÏIt felt like they were my friends and I was... | 7471 | 10/26/15 12:40 | 6.587300e+17 | Unknown | Eastern Time (US & Canada) |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | male | 0.6625 | 1.0 | 11/28/14 11:30 | louis whining and squealing and all | ... | lwtprettylaugh | https://pbs.twimg.com/profile_images/657330418... | 1 | C0DEED | i absolutely adore when louis starts the songs... | 5617 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | male | 1.0000 | 1.0 | 6/11/09 22:39 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | ... | douggarland | https://pbs.twimg.com/profile_images/259703936... | 0 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | 1693 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | female | 1.0000 | 1.0 | 4/16/14 13:23 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | ... | WilfordGemma | https://pbs.twimg.com/profile_images/564094871... | 0 | 0 | Watching Neighbours on Sky+ catching up with t... | 31462 | 10/26/15 12:40 | 6.587300e+17 | Unknown | NaN |
5 rows × 22 columns
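As an optional sanity check (not in the original notebook), the remaining missing values per column can be listed before moving on:

In [ ]:
# Count remaining nulls per column, largest first
df_cleaned.isna().sum().sort_values(ascending=False).head()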
Exploratory Data Analysis (EDA)¶
In [ ]:
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()
In [ ]:
# Distribution of tweet count
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['tweet_count'], kde=True, bins=30)
plt.title('Distribution of Tweet Count')
plt.xlabel('Tweet Count')
plt.ylabel('Count')
plt.show()
In [ ]:
# Distribution of retweet count
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['retweet_count'], kde=True, bins=30)
plt.title('Distribution of Retweet Count')
plt.xlabel('Retweet Count')
plt.ylabel('Count')
plt.show()
In [ ]:
# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
In [ ]:
# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
<ipython-input-9-329074fae944>:2: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
<ipython-input-9-329074fae944>:3: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
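The warning above can be silenced by passing an explicit format; a minimal sketch, assuming every timestamp follows the 'month/day/2-digit-year hour:minute' pattern visible in the sample rows (unparseable values become NaT instead of raising):

In [ ]:
# Assumes all 'created' / 'tweet_created' values look like '12/5/13 1:48'
date_fmt = '%m/%d/%y %H:%M'
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created'], format=date_fmt, errors='coerce').dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created'], format=date_fmt, errors='coerce').dt.year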
In [ ]:
# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('Count')
plt.show()
In [ ]:
# Exploring 'link_color' and 'sidebar_color' features
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:10].index)
plt.title('Top 10 Most Common Profile Link Colors')
plt.ylabel('Link Color')
plt.xlabel('Count')
plt.show()
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:10].index)
plt.title('Top 10 Most Common Sidebar Colors')
plt.ylabel('Sidebar Color')
plt.xlabel('Count')
plt.show()
Preprocessing¶
In [ ]:
# Remove rows with gender 'unknown'; .copy() avoids the SettingWithCopyWarning on later assignments
df_cleaned = df_cleaned[df_cleaned['gender'] != 'unknown'].copy()
# Scaling numerical features
scaler = StandardScaler()
df_cleaned[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']])
# Encode the target for human/non-human classification: male and female => 0 (human), brand => 1 (non-human)
df_cleaned.loc[df_cleaned['gender'] == 'male', 'gender'] = 0
df_cleaned.loc[df_cleaned['gender'] == 'female', 'gender'] = 0
df_cleaned.loc[df_cleaned['gender'] == 'brand', 'gender'] = 1
# Check the first few rows of the preprocessed data
df_cleaned.head()
Out[ ]:
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn:confidence | created | description | ... | retweet_count | sidebar_color | text | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | profile_created_year | tweet_created_year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | 0 | 1.0000 | 1.0 | 12/5/13 1:48 | i sing my own rhythm. | ... | -0.030196 | FFFFFF | Robbie E Responds To Critics After Win Against... | 0.602953 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai | 2013 | 2015 |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | 0 | 1.0000 | 1.0 | 10/1/12 13:51 | I'm the author of novels filled with family dr... | ... | -0.030196 | C0DEED | ÛÏIt felt like they were my friends and I was... | -0.265805 | 10/26/15 12:40 | 6.587300e+17 | Unknown | Eastern Time (US & Canada) | 2012 | 2015 |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | 0 | 0.6625 | 1.0 | 11/28/14 11:30 | louis whining and squealing and all | ... | 0.335804 | C0DEED | i absolutely adore when louis starts the songs... | -0.281368 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade | 2014 | 2015 |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | 0 | 1.0000 | 1.0 | 6/11/09 22:39 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | ... | -0.030196 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | -0.314308 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) | 2009 | 2015 |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | 0 | 1.0000 | 1.0 | 4/16/14 13:23 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | ... | -0.030196 | 0 | Watching Neighbours on Sky+ catching up with t... | -0.064416 | 10/26/15 12:40 | 6.587300e+17 | Unknown | NaN | 2014 | 2015 |
5 rows × 24 columns
In [ ]:
import pandas as pd
# 'gender' was already encoded in the previous cell (male/female => 0, brand => 1);
# the replace below is a safeguard in case the column still holds string labels
df_cleaned['gender'] = df_cleaned['gender'].replace({'male': 0, 'female': 0, 'brand': 1})
# Select numerical columns
numerical_columns = ['gender:confidence', 'profile_yn:confidence', 'fav_number', 'retweet_count', 'tweet_count', 'tweet_id']
# Calculate the Pearson correlation with the target variable 'gender'
correlations = df_cleaned[numerical_columns].corrwith(df_cleaned['gender'])
print("Correlations with target (gender):")
print(correlations)
Correlations with target (gender):
gender:confidence       -0.129078
profile_yn:confidence   -0.007602
fav_number              -0.125455
retweet_count            0.008353
tweet_count              0.119731
tweet_id                -0.122541
dtype: float64
In [ ]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
# List of categorical columns
categorical_columns = ['_unit_state', 'link_color', 'sidebar_color', 'tweet_location', 'user_timezone'] # Add other categorical columns if necessary
# Encode the categorical variables
label_encoder = LabelEncoder()
df_encoded = df_cleaned.copy()
for col in categorical_columns:
    df_encoded[col] = label_encoder.fit_transform(df_encoded[col].astype(str))
# Apply chi-squared test
X = df_encoded[categorical_columns]
y = df_encoded['gender']
chi_scores = chi2(X, y)
print("Chi-square scores:", chi_scores)
Chi-square scores: (array([2.91137794e-01, 2.91763576e+05, 4.20948757e+02, 4.20571655e+04, 5.80444260e+03]), array([5.89492326e-01, 0.00000000e+00, 1.51683635e-93, 0.00000000e+00, 0.00000000e+00]))
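For readability, the two arrays returned by chi2 (scores and p-values) can be paired with their column names; this is an optional addition, not part of the original notebook:

In [ ]:
# Pair each categorical feature with its chi-square score and p-value
chi2_results = pd.DataFrame({'feature': categorical_columns,
                             'chi2': chi_scores[0],
                             'p_value': chi_scores[1]}).sort_values('chi2', ascending=False)
chi2_results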
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
# Compute the correlation matrix for numerical columns including 'gender'
corr_matrix = df_cleaned[numerical_columns + ['gender']].corr()
# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numerical Features and Target")
plt.show()
In [ ]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Keep only the columns of interest
col = ['gender', 'gender:confidence', 'description', 'fav_number', 'link_color',
       'retweet_count', 'sidebar_color', 'text', 'tweet_count', 'tweet_id', 'tweet_location', 'user_timezone'
       ]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Scaling numerical features
scaler = StandardScaler()
df_preprocessed[['tweet_count', 'retweet_count', 'fav_number', 'gender:confidence']] = scaler.fit_transform(df_preprocessed[['tweet_count', 'retweet_count', 'fav_number', 'gender:confidence']])
# List of categorical columns
categorical_columns = ['link_color', 'sidebar_color', 'tweet_location', 'user_timezone']
# Initialize OneHotEncoder (drop='first' prevents the dummy variable trap; sparse_output=False returns a dense array)
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
# Apply OneHotEncoder using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_columns)
    ],
    remainder='passthrough'  # Keeps the rest of the columns as they are
)
# Fit and transform the data
transformed = preprocessor.fit_transform(df_preprocessed)
# Convert the transformed array back to a DataFrame; feature names must come from the
# *fitted* ColumnTransformer, not from the standalone (unfitted) OneHotEncoder instance
encoded_feature_names = preprocessor.get_feature_names_out()
df_preprocessed = pd.DataFrame(transformed, columns=encoded_feature_names, index=df_preprocessed.index)
# 'gender' (passed through as 'remainder__gender') was already encoded earlier: 0 = human, 1 = brand
# Check the first few rows of the preprocessed data
df_preprocessed.head()
In [ ]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Keep only the columns relevant to this preprocessing step
col = ['gender', 'gender:confidence', 'description', 'fav_number','link_color',
'retweet_count', 'sidebar_color', 'text', 'tweet_count','tweet_id'
]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Scaling numerical features
scaler = StandardScaler()
df_preprocessed[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_preprocessed[['tweet_count', 'retweet_count', 'fav_number']])
# 'gender' was already encoded above (male/female => 0 for human, brand => 1 for non-human); re-apply defensively
df_preprocessed.loc[df_preprocessed['gender'] == 'male', 'gender'] = 0
df_preprocessed.loc[df_preprocessed['gender'] == 'female', 'gender'] = 0
df_preprocessed.loc[df_preprocessed['gender'] == 'brand', 'gender'] = 1
# Check the first few rows of the preprocessed data
df_preprocessed.head()
Out[ ]:
gender | gender:confidence | description | fav_number | link_color | retweet_count | sidebar_color | text | tweet_count | tweet_id | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0000 | i sing my own rhythm. | -0.353977 | 08C2C2 | -0.030196 | FFFFFF | Robbie E Responds To Critics After Win Against... | 0.602953 | 6.587300e+17 |
1 | 0 | 1.0000 | I'm the author of novels filled with family dr... | -0.348524 | 0084B4 | -0.030196 | C0DEED | ÛÏIt felt like they were my friends and I was... | -0.265805 | 6.587300e+17 |
2 | 0 | 0.6625 | louis whining and squealing and all | 0.263273 | ABB8C2 | 0.335804 | C0DEED | i absolutely adore when louis starts the songs... | -0.281368 | 6.587300e+17 |
3 | 0 | 1.0000 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | -0.337776 | 0084B4 | -0.030196 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | -0.314308 | 6.587300e+17 |
4 | 0 | 1.0000 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | 2.639077 | 3B94D9 | -0.030196 | 0 | Watching Neighbours on Sky+ catching up with t... | -0.064416 | 6.587300e+17 |
In [ ]:
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()
# The class imbalance can be handled either through the model's class_weight parameter or by applying sampling techniques (see the sketch below).
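A minimal sketch of the two options mentioned above, using df_preprocessed and a LogisticRegression purely as an illustrative classifier; neither is part of the original pipeline:

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

# Option 1: let the model reweight classes internally
clf = LogisticRegression(class_weight='balanced', max_iter=1000)

# Option 2: naive random oversampling of the minority class (brand = 1)
majority = df_preprocessed[df_preprocessed['gender'] == 0]
minority = df_preprocessed[df_preprocessed['gender'] == 1]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
df_balanced = pd.concat([majority, minority_upsampled])
df_balanced['gender'].value_counts()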
NLP Processing¶
In [ ]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Out[ ]:
True
In [ ]:
df_status = df_preprocessed.copy()
df_status = pd.concat([df_status['gender'], df_status['description']], axis=1)
df_status
Out[ ]:
gender | description | |
---|---|---|
0 | 0 | i sing my own rhythm. |
1 | 0 | I'm the author of novels filled with family dr... |
2 | 0 | louis whining and squealing and all |
3 | 0 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... |
4 | 0 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... |
... | ... | ... |
20045 | 0 | (rp) |
20046 | 0 | Whatever you like, it's not a problem at all. ... |
20047 | 0 | #TeamBarcelona ..You look lost so you should f... |
20048 | 0 | Anti-statist; I homeschool my kids. Aspiring t... |
20049 | 0 | Teamwork makes the dream work. |
18836 rows × 2 columns
In [ ]:
# Lowercase the text and strip non-letter characters: "Run" and "run" would otherwise be treated as different tokens
import re
description = []
for x in df_status['description']:
    desc = re.sub("[^a-zA-Z]", " ", x)
    desc = desc.lower()
    description.append(desc)
df_status['description'] = description
df_status
Out[ ]:
gender | description | |
---|---|---|
0 | 0 | i sing my own rhythm |
1 | 0 | i m the author of novels filled with family dr... |
2 | 0 | louis whining and squealing and all |
3 | 0 | mobile guy ers shazam google kleiner pe... |
4 | 0 | ricky wilson the best frontman kaiser chiefs t... |
... | ... | ... |
20045 | 0 | rp |
20046 | 0 | whatever you like it s not a problem at all ... |
20047 | 0 | teambarcelona you look lost so you should f... |
20048 | 0 | anti statist i homeschool my kids aspiring t... |
20049 | 0 | teamwork makes the dream work |
18836 rows × 2 columns
In [ ]:
# Remove English stopwords (e.g. 'i', 'a', 'the', 'an', 'and', 'me') from each description
def remove_stopwords(text):
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words
df_status['tokenized'] = df_status['description'].apply(remove_stopwords)
df_status
Out[ ]:
gender | description | tokenized | |
---|---|---|---|
0 | 0 | i sing my own rhythm | [sing, rhythm] |
1 | 0 | i m the author of novels filled with family dr... | [author, novels, filled, family, drama, romance] |
2 | 0 | louis whining and squealing and all | [louis, whining, squealing] |
3 | 0 | mobile guy ers shazam google kleiner pe... | [mobile, guy, ers, shazam, google, kleiner, pe... |
4 | 0 | ricky wilson the best frontman kaiser chiefs t... | [ricky, wilson, best, frontman, kaiser, chiefs... |
... | ... | ... | ... |
20045 | 0 | rp | [rp] |
20046 | 0 | whatever you like it s not a problem at all ... | [whatever, like, problem, chargernation, forev... |
20047 | 0 | teambarcelona you look lost so you should f... | [teambarcelona, look, lost, follow, follow, he... |
20048 | 0 | anti statist i homeschool my kids aspiring t... | [anti, statist, homeschool, kids, aspiring, th... |
20049 | 0 | teamwork makes the dream work | [teamwork, makes, dream, work] |
18836 rows × 3 columns
In [ ]:
# Convert the tokenized descriptions into a bag-of-words count matrix.
# CountVectorizer expects plain strings, so join each token list back into a single string first.
from sklearn.feature_extraction.text import CountVectorizer
max_features = 1500
corpus = [' '.join(words) for words in df_status['tokenized']]
vectorizer = CountVectorizer(max_features = max_features, stop_words = "english")
X = vectorizer.fit_transform(corpus).toarray()
# let's see X in dataframe
df_ = pd.DataFrame(X, columns=vectorizer.get_feature_names_out(), index=df_status.index)
df_
Out[ ]:
academy | account | achieve | act | action | active | activist | actor | actress | actually | ... | yes | yo | yoga | york | young | youth | youtube | youtuber | yrs | zayn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20045 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20046 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20047 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20048 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20049 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
18836 rows × 1500 columns
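An optional peek (not in the original) at which vocabulary terms dominate the bag-of-words matrix, obtained simply by summing the count columns above:

In [ ]:
# Ten most frequent terms across all descriptions
df_.sum().sort_values(ascending=False).head(10)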
In [ ]:
y = df_preprocessed['gender'].values # Create an array
y # gender ==> our target in the model
Out[ ]:
array([0, 0, 0, ..., 0, 0, 0])
In [ ]:
# Now drop the processed columns ('description', 'text', and categorical) from the original dataset
df_preprocessed = df_preprocessed.drop(columns=['description', 'text', 'link_color', 'sidebar_color'])
# Feature matrix with target information ('gender', 'gender:confidence') removed
df_preprocessed_X = df_preprocessed.drop(columns=['gender', 'gender:confidence'])
# Combine the text features with the other preprocessed features
X_combined = np.hstack((df_preprocessed_X.values, X))
In [ ]:
df_preprocessed.head()
Out[ ]:
gender | gender:confidence | fav_number | retweet_count | tweet_count | tweet_id | |
---|---|---|---|---|---|---|
0 | 0 | 1.0000 | -0.353977 | -0.030196 | 0.602953 | 6.587300e+17 |
1 | 0 | 1.0000 | -0.348524 | -0.030196 | -0.265805 | 6.587300e+17 |
2 | 0 | 0.6625 | 0.263273 | 0.335804 | -0.281368 | 6.587300e+17 |
3 | 0 | 1.0000 | -0.337776 | -0.030196 | -0.314308 | 6.587300e+17 |
4 | 0 | 1.0000 | 2.639077 | -0.030196 | -0.064416 | 6.587300e+17 |
Regression Tasks¶
The chosen regression model is a gradient-boosted decision tree regressor.
In [ ]:
# Regression target: the annotators' confidence in the gender label
y = df_preprocessed["gender:confidence"]
In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split  # needed for the split below
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=10, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model
boosted_reg.fit(X_train, y_train)
Out[ ]:
GradientBoostingRegressor(n_estimators=10, random_state=42)
In [ ]:
from sklearn.metrics import mean_squared_error
# Make predictions
y_pred = boosted_reg.predict(X_test)
# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.027876325391586427
In [ ]:
y_tot_pred = boosted_reg.predict(X_combined)
mse = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.02896527615100315
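For context (not part of the original analysis), the MSE can be compared with a constant baseline that always predicts the mean confidence:

In [ ]:
# Baseline: predict the mean gender:confidence for every row
baseline_mse = mean_squared_error(y, np.full(len(y), y.mean()))
print(f"Baseline (predict mean) MSE: {baseline_mse}")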
In [ ]:
boosted_reg.feature_importances_
Out[ ]:
array([0.31033774, 0. , 0.09989529, ..., 0. , 0. , 0. ])
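The raw importance array is hard to interpret on its own. A small sketch that attaches feature names, assuming the column order used when X_combined was assembled (the df_preprocessed_X columns followed by the CountVectorizer vocabulary):

In [ ]:
# Map importances back to feature names and show the ten most important
feature_names = list(df_preprocessed_X.columns) + list(vectorizer.get_feature_names_out())
importances = pd.Series(boosted_reg.feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(10)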
Find the rows with the largest difference in gender confidence
In [ ]:
Out[ ]:
gender:confidence | |
---|---|
0 | 1.0000 |
1 | 1.0000 |
2 | 0.6625 |
3 | 1.0000 |
4 | 1.0000 |
... | ... |
20045 | 1.0000 |
20046 | 1.0000 |
20047 | 1.0000 |
20048 | 0.8489 |
20049 | 1.0000 |
18836 rows × 1 columns
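The code for this step is missing from the source cell above; a minimal sketch of one way to do it, assuming the "difference" of interest is the absolute gap between the labelled gender:confidence (y) and the regressor's prediction (y_tot_pred):

In [ ]:
# Rows where the predicted confidence deviates most from the labelled confidence
conf_diff = np.abs(y.values - y_tot_pred)
df_preprocessed.assign(predicted_confidence=y_tot_pred, abs_error=conf_diff).sort_values('abs_error', ascending=False).head(10)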
Example Usage¶
In [ ]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Re-define the classification target: y was reused above for the regression task (gender:confidence),
# while the classifiers below predict the encoded gender (0 = human, 1 = brand)
y = df_preprocessed['gender'].astype(int).values
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
# Convert data into DMatrix format, which is the format that XGBoost expects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Define XGBoost parameters
params = {
'objective': 'multi:softmax', # Specify the objective for multi-class classification
'num_class': len(np.unique(y_train)), # Number of classes
'max_depth': 3, # Maximum tree depth
'eta': 0.1, # Learning rate
'subsample': 0.8, # Fraction of samples used for training each tree
'colsample_bytree': 0.8, # Fraction of features used for training each tree
'eval_metric': 'mlogloss' # Evaluation metric
}
# Train the XGBoost model
num_round = 100 # Number of boosting rounds
bst = xgb.train(params, dtrain, num_round)
# Make predictions on the test set
y_pred = bst.predict(dtest)
In [ ]:
# Calculate accuracy
accuracy = accuracy_score(y_test.tolist(), y_pred.tolist())
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 1.00
In [ ]:
# Generate the classification report
report = classification_report(y_test.tolist(), y_pred.tolist())
# Print the classification report
print("XGBoost Classification Report:\n", report)
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2585
           1       1.00      1.00      1.00      1183

    accuracy                           1.00      3768
   macro avg       1.00      1.00      1.00      3768
weighted avg       1.00      1.00      1.00      3768
In [ ]:
from sklearn.naive_bayes import GaussianNB
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
# change y_train, y_test dtype from object to int64
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)
# Create a Naive Bayes classifier (Gaussian Naive Bayes)
nb_classifier = GaussianNB()
# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)
# Make predictions on the test data
y_pred = nb_classifier.predict(X_test)
In [ ]:
# Generate the classification report
report = classification_report(y_test, y_pred)
# Print the classification report
print("Naive Bayes Classification Report:\n", report)
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.69      1.00      0.81      2585
           1       0.00      0.00      0.00      1183

    accuracy                           0.69      3768
   macro avg       0.34      0.50      0.41      3768
weighted avg       0.47      0.69      0.56      3768
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
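As the warning suggests, the zero_division parameter controls this behaviour explicitly; an optional variant:

In [ ]:
# Report zeros explicitly instead of emitting UndefinedMetricWarning
print(classification_report(y_test, y_pred, zero_division=0))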
In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from mlxtend.frequent_patterns import apriori, association_rules
# Load the dataset
data = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')  # Replace with actual file path
# Step 1: Data Cleaning
# Drop columns with too many missing values or irrelevant ones
data_cleaned = data.drop(['_unit_id', '_last_judgment_at', 'profileimage', 'tweet_id', 'tweet_created'], axis=1)
# Fill missing values (numerical columns with the mean, categorical with the mode or a placeholder)
data_cleaned['fav_number'] = data_cleaned['fav_number'].fillna(data_cleaned['fav_number'].mean())
data_cleaned['gender'] = data_cleaned['gender'].fillna(data_cleaned['gender'].mode()[0])
data_cleaned['tweet_location'] = data_cleaned['tweet_location'].fillna('Unknown')
# Step 2: Handle Categorical Variables
# Convert categorical columns to numeric using get_dummies
categorical_columns = ['_unit_state', 'gender', 'profile_yn', 'tweet_location']  # Add other categorical columns if necessary
data_cleaned = pd.get_dummies(data_cleaned, columns=categorical_columns, drop_first=True)
# Drop the remaining free-text and date columns (e.g. 'created', 'description', 'text'), which cannot be
# scaled numerically, and fill any remaining numeric NaNs (e.g. 'gender:confidence') with the column mean
data_cleaned = data_cleaned.select_dtypes(exclude=['object'])
data_cleaned = data_cleaned.fillna(data_cleaned.mean(numeric_only=True))
# Step 3: Define Target and Features
# Target: 'fav_number' (example), Features: all other columns
X = data_cleaned.drop('fav_number', axis=1)
y = data_cleaned['fav_number']
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 5: Scale the Data (now that all features are numeric)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Step 6: Fit Ridge Regression Model
ridge_reg = Ridge(alpha=1.0)  # Alpha is the regularization strength
ridge_reg.fit(X_train_scaled, y_train)
# Step 7: Predict and Evaluate
y_pred = ridge_reg.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
# Step 8: Apriori Algorithm (user behaviour patterns expressed as boolean attributes)
# Note: after get_dummies the profile dummy column is named 'profile_yn_yes'
transactions = data_cleaned[['retweet_count', 'profile_yn_yes', 'tweet_location_Unknown']] > 0
# Apply the apriori algorithm
frequent_itemsets = apriori(transactions, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(frequent_itemsets)
print(rules)