import pandas as pd
import numpy as np
import seaborn as sns
import zipfile as zp
import os
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
# Read the data
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')
# Display the data
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20050 entries, 0 to 20049 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 _unit_id 20050 non-null int64 1 _golden 20050 non-null bool 2 _unit_state 20050 non-null object 3 _trusted_judgments 20050 non-null int64 4 _last_judgment_at 20000 non-null object 5 gender 19953 non-null object 6 gender:confidence 20024 non-null float64 7 profile_yn 20050 non-null object 8 profile_yn:confidence 20050 non-null float64 9 created 20050 non-null object 10 description 16306 non-null object 11 fav_number 20050 non-null int64 12 gender_gold 50 non-null object 13 link_color 20050 non-null object 14 name 20050 non-null object 15 profile_yn_gold 50 non-null object 16 profileimage 20050 non-null object 17 retweet_count 20050 non-null int64 18 sidebar_color 20050 non-null object 19 text 20050 non-null object 20 tweet_coord 159 non-null object 21 tweet_count 20050 non-null int64 22 tweet_created 20050 non-null object 23 tweet_id 20050 non-null float64 24 tweet_location 12565 non-null object 25 user_timezone 12252 non-null object dtypes: bool(1), float64(3), int64(5), object(17) memory usage: 3.8+ MB
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn | profile_yn:confidence | created | ... | profileimage | retweet_count | sidebar_color | text | tweet_coord | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | |
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | male | 1.0000 | yes | 1.0 | 12/5/13 1:48 | ... | | 0 | FFFFFF | Robbie E Responds To Critics After Win Against... | NaN | 110964 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | male | 1.0000 | yes | 1.0 | 10/1/12 13:51 | ... | | 0 | C0DEED | ÛÏIt felt like they were my friends and I was... | NaN | 7471 | 10/26/15 12:40 | 6.587300e+17 | NaN | Eastern Time (US & Canada) |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | male | 0.6625 | yes | 1.0 | 11/28/14 11:30 | ... | | 1 | C0DEED | i absolutely adore when louis starts the songs... | NaN | 5617 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | male | 1.0000 | yes | 1.0 | 6/11/09 22:39 | ... | | 0 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | NaN | 1693 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | female | 1.0000 | yes | 1.0 | 4/16/14 13:23 | ... | | 0 | 0 | Watching Neighbours on Sky+ catching up with t... | NaN | 31462 | 10/26/15 12:40 | 6.587300e+17 | NaN | NaN |
5 rows × 26 columns
Handling Missing Data¶
# Dropping columns with more than 90% missing values
df_cleaned = df.drop(columns=['gender_gold', 'profile_yn_gold', 'tweet_coord'])
# Filling missing values in 'description', 'user_timezone', and 'tweet_location' with a placeholder 'Unknown'
df_cleaned['description'].fillna('Unknown', inplace=True)
df_cleaned['tweet_location'].fillna('Unknown', inplace=True)
# Dropping rows where 'gender' is missing (as it's a small percentage of rows with missing data)
df_cleaned = df_cleaned.dropna(subset=['gender'])
# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])
# Now that we have handled the missing data, you can proceed with further analysis # Display the structure of the cleaned dataset
df_cleaned.head() # Display the first few rows of the cleaned dataset
<class 'pandas.core.frame.DataFrame'> Index: 19953 entries, 0 to 20049 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 _unit_id 19953 non-null int64 1 _golden 19953 non-null bool 2 _unit_state 19953 non-null object 3 _trusted_judgments 19953 non-null int64 4 _last_judgment_at 19903 non-null object 5 gender 19953 non-null object 6 gender:confidence 19953 non-null float64 7 profile_yn:confidence 19953 non-null float64 8 created 19953 non-null object 9 description 19953 non-null object 10 fav_number 19953 non-null int64 11 link_color 19953 non-null object 12 name 19953 non-null object 13 profileimage 19953 non-null object 14 retweet_count 19953 non-null int64 15 sidebar_color 19953 non-null object 16 text 19953 non-null object 17 tweet_count 19953 non-null int64 18 tweet_created 19953 non-null object 19 tweet_id 19953 non-null float64 20 tweet_location 19953 non-null object 21 user_timezone 12185 non-null object dtypes: bool(1), float64(3), int64(5), object(13) memory usage: 3.4+ MB
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn:confidence | created | description | ... | name | profileimage | retweet_count | sidebar_color | text | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | |
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | male | 1.0000 | 1.0 | 12/5/13 1:48 | i sing my own rhythm. | ... | sheezy0 | | 0 | FFFFFF | Robbie E Responds To Critics After Win Against... | 110964 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | male | 1.0000 | 1.0 | 10/1/12 13:51 | I'm the author of novels filled with family dr... | ... | DavdBurnett | | 0 | C0DEED | ÛÏIt felt like they were my friends and I was... | 7471 | 10/26/15 12:40 | 6.587300e+17 | Unknown | Eastern Time (US & Canada) |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | male | 0.6625 | 1.0 | 11/28/14 11:30 | louis whining and squealing and all | ... | lwtprettylaugh | | 1 | C0DEED | i absolutely adore when louis starts the songs... | 5617 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | male | 1.0000 | 1.0 | 6/11/09 22:39 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | ... | douggarland | | 0 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | 1693 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | female | 1.0000 | 1.0 | 4/16/14 13:23 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | ... | WilfordGemma | | 0 | 0 | Watching Neighbours on Sky+ catching up with t... | 31462 | 10/26/15 12:40 | 6.587300e+17 | Unknown | NaN |
5 rows × 22 columns
Exploratory Data Analysis (EDA)¶
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
# Distribution of tweet count
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['tweet_count'], kde=True, bins=30)
plt.title('Distribution of Tweet Count')
plt.xlabel('Tweet Count')
# Distribution of retweet count
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['retweet_count'], kde=True, bins=30)
plt.title('Distribution of Retweet Count')
plt.xlabel('Retweet Count')
# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
<ipython-input-9-329074fae944>:2: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format. df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year <ipython-input-9-329074fae944>:3: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format. df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
# Exploring 'link_color' and 'sidebar_color' features
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:10].index)
plt.title('Top 10 Most Common Profile Link Colors')
plt.ylabel('Link Color')
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:10].index)
plt.title('Top 10 Most Common Sidebar Colors')
plt.ylabel('Sidebar Color')
df_cleaned = df_cleaned[df_cleaned['gender'] != 'unknown']
# Scaling numerical features
scaler = StandardScaler()
df_cleaned[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']])
# change male=>0, female=>1, brand=>2
df_cleaned.loc[df['gender'] == 'male', 'gender'] = 0
df_cleaned.loc[df['gender'] == 'female', 'gender'] = 0
df_cleaned.loc[df['gender'] == 'brand', 'gender'] = 1
# Check the first few rows of the preprocessed data
<ipython-input-12-ed2064a24a69>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: df_cleaned[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']])
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn:confidence | created | description | ... | retweet_count | sidebar_color | text | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | profile_created_year | tweet_created_year | |
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | 0 | 1.0000 | 1.0 | 12/5/13 1:48 | i sing my own rhythm. | ... | -0.030196 | FFFFFF | Robbie E Responds To Critics After Win Against... | 0.602953 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai | 2013 | 2015 |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | 0 | 1.0000 | 1.0 | 10/1/12 13:51 | I'm the author of novels filled with family dr... | ... | -0.030196 | C0DEED | ÛÏIt felt like they were my friends and I was... | -0.265805 | 10/26/15 12:40 | 6.587300e+17 | Unknown | Eastern Time (US & Canada) | 2012 | 2015 |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | 0 | 0.6625 | 1.0 | 11/28/14 11:30 | louis whining and squealing and all | ... | 0.335804 | C0DEED | i absolutely adore when louis starts the songs... | -0.281368 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade | 2014 | 2015 |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | 0 | 1.0000 | 1.0 | 6/11/09 22:39 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | ... | -0.030196 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | -0.314308 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) | 2009 | 2015 |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | 0 | 1.0000 | 1.0 | 4/16/14 13:23 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | ... | -0.030196 | 0 | Watching Neighbours on Sky+ catching up with t... | -0.064416 | 10/26/15 12:40 | 6.587300e+17 | Unknown | NaN | 2014 | 2015 |
5 rows × 24 columns
import pandas as pd
# First, encode the 'gender' column to numeric values
df_cleaned['gender'] = df_cleaned['gender'].replace({'male': 0, 'female': 1, 'brand': 2})
# Select numerical columns
numerical_columns = ['gender:confidence', 'profile_yn:confidence', 'fav_number', 'retweet_count', 'tweet_count', 'tweet_id']
# Calculate the Pearson correlation with the target variable 'gender'
correlations = df_cleaned[numerical_columns].corrwith(df_cleaned['gender'])
print("Correlations with target (gender):")
Correlations with target (gender): gender:confidence -0.129078 profile_yn:confidence -0.007602 fav_number -0.125455 retweet_count 0.008353 tweet_count 0.119731 tweet_id -0.122541 dtype: float64
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
# List of categorical columns
categorical_columns = ['_unit_state', 'link_color', 'sidebar_color', 'tweet_location', 'user_timezone'] # Add other categorical columns if necessary
# Encode the categorical variables
label_encoder = LabelEncoder()
df_encoded = df_cleaned.copy()
for col in categorical_columns:
df_encoded[col] = label_encoder.fit_transform(df_encoded[col].astype(str))
# Apply chi-squared test
X = df_encoded[categorical_columns]
y = df_encoded['gender']
chi_scores = chi2(X, y)
print("Chi-square scores:", chi_scores)
Chi-square scores: (array([2.91137794e-01, 2.91763576e+05, 4.20948757e+02, 4.20571655e+04, 5.80444260e+03]), array([5.89492326e-01, 0.00000000e+00, 1.51683635e-93, 0.00000000e+00, 0.00000000e+00]))
import seaborn as sns
import matplotlib.pyplot as plt
# Compute the correlation matrix for numerical columns including 'gender'
corr_matrix = df_cleaned[numerical_columns + ['gender']].corr()
# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numerical Features and Target")
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# drop columns that are irrelevant
col = ['gender', 'gender:confidence', 'description', 'fav_number','link_color',
'retweet_count', 'sidebar_color', 'text', 'tweet_count', 'tweet_id', 'tweet_location', 'user_timezone'
df_preprocessed = df_cleaned[col]
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Scaling numerical features
scaler = StandardScaler()
df_preprocessed[['tweet_count', 'retweet_count', 'fav_number', 'gender:confidence']] = scaler.fit_transform(df_preprocessed[['tweet_count', 'retweet_count', 'fav_number', 'gender:confidence']])
# List of categorical columns
categorical_columns = ['link_color', 'sidebar_color', 'tweet_location', 'user_timezone']
# Initialize OneHotEncoder
one_hot_encoder = OneHotEncoder(drop='first', sparse=False) # drop='first' prevents the dummy variable trap (removes first category)
# Apply OneHotEncoder using ColumnTransformer
preprocessor = ColumnTransformer(
('cat', one_hot_encoder, categorical_columns)
remainder='passthrough' # Keeps the rest of the columns as they are (numerical features)
# Fit and transform the data
df_preprocessed = preprocessor.fit_transform(df_preprocessed)
# Convert the transformed array back to a DataFrame and retain column names
encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_columns)
df_preprocessed = pd.DataFrame(df_preprocessed, columns=encoded_feature_names)
# change male=>0, female=>1, brand=>2
df_preprocessed.loc[df['gender'] == 'male', 'gender'] = 0
df_preprocessed.loc[df['gender'] == 'female', 'gender'] = 0
df_preprocessed.loc[df['gender'] == 'brand', 'gender'] = 1
# Check the first few rows of the preprocessed data
/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/ FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value. warnings.warn(
--------------------------------------------------------------------------- NotFittedError Traceback (most recent call last) <ipython-input-16-a1910b6b304b> in <cell line: 38>() 36 37 # Convert the transformed array back to a DataFrame and retain column names ---> 38 encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_columns) 39 df_preprocessed = pd.DataFrame(df_preprocessed, columns=encoded_feature_names) 40 /usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/ in get_feature_names_out(self, input_features) 1208 Transformed feature names. 1209 """ -> 1210 check_is_fitted(self) 1211 input_features = _check_feature_names_in(self, input_features) 1212 cats = [ /usr/local/lib/python3.10/dist-packages/sklearn/utils/ in check_is_fitted(estimator, attributes, msg, all_or_any) 1459 1460 if not _is_fitted(estimator, attributes, all_or_any): -> 1461 raise NotFittedError(msg % {"name": type(estimator).__name__}) 1462 1463 NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# drop columns that are irrelevant
col = ['gender', 'gender:confidence', 'description', 'fav_number','link_color',
'retweet_count', 'sidebar_color', 'text', 'tweet_count','tweet_id'
df_preprocessed = df_cleaned[col]
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Scaling numerical features
scaler = StandardScaler()
df_preprocessed[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_preprocessed[['tweet_count', 'retweet_count', 'fav_number']])
# change male=>0, female=>1, brand=>2
df_preprocessed.loc[df['gender'] == 'male', 'gender'] = 0
df_preprocessed.loc[df['gender'] == 'female', 'gender'] = 0
df_preprocessed.loc[df['gender'] == 'brand', 'gender'] = 1
# Check the first few rows of the preprocessed data
gender | gender:confidence | description | fav_number | link_color | retweet_count | sidebar_color | text | tweet_count | tweet_id | |
0 | 0 | 1.0000 | i sing my own rhythm. | -0.353977 | 08C2C2 | -0.030196 | FFFFFF | Robbie E Responds To Critics After Win Against... | 0.602953 | 6.587300e+17 |
1 | 0 | 1.0000 | I'm the author of novels filled with family dr... | -0.348524 | 0084B4 | -0.030196 | C0DEED | ÛÏIt felt like they were my friends and I was... | -0.265805 | 6.587300e+17 |
2 | 0 | 0.6625 | louis whining and squealing and all | 0.263273 | ABB8C2 | 0.335804 | C0DEED | i absolutely adore when louis starts the songs... | -0.281368 | 6.587300e+17 |
3 | 0 | 1.0000 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | -0.337776 | 0084B4 | -0.030196 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | -0.314308 | 6.587300e+17 |
4 | 0 | 1.0000 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | 2.639077 | 3B94D9 | -0.030196 | 0 | Watching Neighbours on Sky+ catching up with t... | -0.064416 | 6.587300e+17 |
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
# The imbalanceness can be handled either using the model attribute class_weight or applying sampling techniques.
NLP Processing
import nltk
from nltk.corpus import stopwords'stopwords')'punkt')
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/ [nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/
Out[ ]:
df_status = df_preprocessed.copy()
df_status = pd.concat([df_status['gender'], df_status['description']], axis=1)
gender | description | |
0 | 0 | i sing my own rhythm. |
1 | 0 | I'm the author of novels filled with family dr... |
2 | 0 | louis whining and squealing and all |
3 | 0 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... |
4 | 0 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... |
... | ... | ... |
20045 | 0 | (rp) |
20046 | 0 | Whatever you like, it's not a problem at all. ... |
20047 | 0 | #TeamBarcelona ..You look lost so you should f... |
20048 | 0 | Anti-statist; I homeschool my kids. Aspiring t... |
20049 | 0 | Teamwork makes the dream work. |
18836 rows × 2 columns
# make all lowercase since "Run" is not the same as "run" for machine computation
import re
description = []
for x in df_status['description']:
desc = re.sub("[^a-zA-Z]"," ",x)
desc = desc.lower()
df_status['description'] = description
gender | description | |
0 | 0 | i sing my own rhythm |
1 | 0 | i m the author of novels filled with family dr... |
2 | 0 | louis whining and squealing and all |
3 | 0 | mobile guy ers shazam google kleiner pe... |
4 | 0 | ricky wilson the best frontman kaiser chiefs t... |
... | ... | ... |
20045 | 0 | rp |
20046 | 0 | whatever you like it s not a problem at all ... |
20047 | 0 | teambarcelona you look lost so you should f... |
20048 | 0 | anti statist i homeschool my kids aspiring t... |
20049 | 0 | teamwork makes the dream work |
18836 rows × 2 columns
# remove stopwords in sentence ==> i,a,the,an,and,.,me,........
def remove_stopwords(text):
words = nltk.word_tokenize(text)
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
return filtered_words
df_status['tokenized'] = df_status['description'].apply(remove_stopwords)
gender | description | tokenized | |
0 | 0 | i sing my own rhythm | [sing, rhythm] |
1 | 0 | i m the author of novels filled with family dr... | [author, novels, filled, family, drama, romance] |
2 | 0 | louis whining and squealing and all | [louis, whining, squealing] |
3 | 0 | mobile guy ers shazam google kleiner pe... | [mobile, guy, ers, shazam, google, kleiner, pe... |
4 | 0 | ricky wilson the best frontman kaiser chiefs t... | [ricky, wilson, best, frontman, kaiser, chiefs... |
... | ... | ... | ... |
20045 | 0 | rp | [rp] |
20046 | 0 | whatever you like it s not a problem at all ... | [whatever, like, problem, chargernation, forev... |
20047 | 0 | teambarcelona you look lost so you should f... | [teambarcelona, look, lost, follow, follow, he... |
20048 | 0 | anti statist i homeschool my kids aspiring t... | [anti, statist, homeschool, kids, aspiring, th... |
20049 | 0 | teamwork makes the dream work | [teamwork, makes, dream, work] |
18836 rows × 3 columns
# count word in sentence by changing tokenized to vectorizor (for machine compute)
# CountVectorizer input must be string with one long list
from sklearn.feature_extraction.text import CountVectorizer
max_features = 1500
corpus = [' '.join(words) for words in df_status['tokenized']]
vectorizer = CountVectorizer(max_features = max_features, stop_words = "english")
X = vectorizer.fit_transform(corpus).toarray()
# let's see X in dataframe
df_ = pd.DataFrame(X, columns=vectorizer.get_feature_names_out(), index=df_status.index)
academy | account | achieve | act | action | active | activist | actor | actress | actually | ... | yes | yo | yoga | york | young | youth | youtube | youtuber | yrs | zayn | |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20045 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20046 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20047 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20048 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20049 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
18836 rows × 1500 columns
y = df_preprocessed['gender'].values # Create an array
y # gender ==> our target in the model
array([0, 0, 0, ..., 0, 0, 0])
# Now drop the processed columns ('description', 'text', and categorical) from the original dataset
df_preprocessed = df_preprocessed.drop(columns=['description', 'text', 'link_color', 'sidebar_color'])
df_preprocessed_X = df_preprocessed.drop(columns=['gender', 'gender:confidence'])
# Combine the text features with the other preprocessed features
X_combined = np.hstack((df_preprocessed_X.values, X))
gender | gender:confidence | fav_number | retweet_count | tweet_count | tweet_id | |
0 | 0 | 1.0000 | -0.353977 | -0.030196 | 0.602953 | 6.587300e+17 |
1 | 0 | 1.0000 | -0.348524 | -0.030196 | -0.265805 | 6.587300e+17 |
2 | 0 | 0.6625 | 0.263273 | 0.335804 | -0.281368 | 6.587300e+17 |
3 | 0 | 1.0000 | -0.337776 | -0.030196 | -0.314308 | 6.587300e+17 |
4 | 0 | 1.0000 | 2.639077 | -0.030196 | -0.064416 | 6.587300e+17 |
Regression Tasks¶
The choosen regression model will be a gradient boosted regression decision tree.
y = df_preprocessed["gender:confidence"]
from sklearn.ensemble import GradientBoostingRegressor
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=10, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model, y_train)
GradientBoostingRegressor(n_estimators=10, random_state=42)
GradientBoostingRegressor(n_estimators=10, random_state=42)
GradientBoostingRegressor(n_estimators=10, random_state=42)
from sklearn.metrics import mean_squared_error
# Make predictions
y_pred = boosted_reg.predict(X_test)
# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.027876325391586427
y_tot_pred = boosted_reg.predict(X_combined)
mse = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.02896527615100315
array([0.31033774, 0. , 0.09989529, ..., 0. , 0. , 0. ])
Find the rows with the largest difference in gender confidence
gender:confidence | |
0 | 1.0000 |
1 | 1.0000 |
2 | 0.6625 |
3 | 1.0000 |
4 | 1.0000 |
... | ... |
20045 | 1.0000 |
20046 | 1.0000 |
20047 | 1.0000 |
20048 | 0.8489 |
20049 | 1.0000 |
18836 rows × 1 columns
Example Usage¶
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
# Convert data into DMatrix format, which is the format that XGBoost expects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Define XGBoost parameters
params = {
'objective': 'multi:softmax', # Specify the objective for multi-class classification
'num_class': len(np.unique(y_train)), # Number of classes
'max_depth': 3, # Maximum tree depth
'eta': 0.1, # Learning rate
'subsample': 0.8, # Fraction of samples used for training each tree
'colsample_bytree': 0.8, # Fraction of features used for training each tree
'eval_metric': 'mlogloss' # Evaluation metric
# Train the XGBoost model
num_round = 100 # Number of boosting rounds
bst = xgb.train(params, dtrain, num_round)
# Make predictions on the test set
y_pred = bst.predict(dtest)
# Calculate accuracy
accuracy = accuracy_score(y_test.tolist(), y_pred.tolist())
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 1.00
# Generate the classification report
report = classification_report(y_test.tolist(), y_pred.tolist())
# Print the classification report
print("XGBoost Classification Report:\n", report)
XGBoost Classification Report: precision recall f1-score support 0 1.00 1.00 1.00 2585 1 1.00 1.00 1.00 1183 accuracy 1.00 3768 macro avg 1.00 1.00 1.00 3768 weighted avg 1.00 1.00 1.00 3768
from sklearn.naive_bayes import GaussianNB
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
# change y_train, y_test dtype from object to int64
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)
# Create a Naive Bayes classifier (Gaussian Naive Bayes)
nb_classifier = GaussianNB()
# Fit the classifier to the training data, y_train)
# Make predictions on the test data
y_pred = nb_classifier.predict(X_test)
# Generate the classification report
report = classification_report(y_test, y_pred)
# Print the classification report
print("Naive Bayes Classification Report:\n", report)
Naive Bayes Classification Report: precision recall f1-score support 0 0.69 1.00 0.81 2585 1 0.00 0.00 0.00 1183 accuracy 0.69 3768 macro avg 0.34 0.50 0.41 3768 weighted avg 0.47 0.69 0.56 3768
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/ UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/ UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/ UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from mlxtend.frequent_patterns import apriori, association_rules
# Load the dataset
data = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1') # Replace with actual file path
# Step 1: Data Cleaning
# Dropping columns with too many missing values or irrelevant ones
data_cleaned = data.drop(['_unit_id', '_last_judgment_at', 'profileimage', 'tweet_id', 'tweet_created'], axis=1)
# Fill missing values (example: fill numerical columns with mean, categorical with mode)
data_cleaned['fav_number'].fillna(data_cleaned['fav_number'].mean(), inplace=True)
data_cleaned['gender'].fillna(data_cleaned['gender'].mode()[0], inplace=True)
data_cleaned['tweet_location'].fillna('Unknown', inplace=True)
# Step 2: Handle Categorical Variables
# Convert categorical columns to numeric using OneHotEncoding or get_dummies
categorical_columns = ['_unit_state', 'gender', 'profile_yn', 'tweet_location'] # Add other categorical columns if necessary
data_cleaned = pd.get_dummies(data_cleaned, columns=categorical_columns, drop_first=True)
# Step 3: Define Target and Features
# Target: 'fav_number' (example), Features: all other columns
X = data_cleaned.drop('fav_number', axis=1)
y = data_cleaned['fav_number']
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 5: Scale the Data (Now that all features are numeric)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Step 6: Fit Ridge Regression Model
ridge_reg = Ridge(alpha=1.0) # Alpha is the regularization strength, y_train)
# Step 7: Predict and Evaluate
y_pred = ridge_reg.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
# Step 8: Apriori Algorithm (Assuming we are interested in user behavior patterns)
# For apriori, we need to transform data into boolean values (e.g., presence of an attribute)
transactions = data_cleaned[['retweet_count', 'profile_yn_True', 'tweet_location_Unknown']] > 0
# Apply the apriori algorithm
frequent_itemsets = apriori(transactions, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
/usr/local/lib/python3.10/dist-packages/ipykernel/ DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above. and should_run_async(code)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-6-5222a9351d56> in <cell line: 36>() 34 # Step 5: Scale the Data (Now that all features are numeric) 35 scaler = StandardScaler() ---> 36 X_train_scaled = scaler.fit_transform(X_train) 37 X_test_scaled = scaler.transform(X_test) 38 /usr/local/lib/python3.10/dist-packages/sklearn/utils/ in wrapped(self, X, *args, **kwargs) 155 @wraps(f) 156 def wrapped(self, X, *args, **kwargs): --> 157 data_to_wrap = f(self, X, *args, **kwargs) 158 if isinstance(data_to_wrap, tuple): 159 # only wrap the first output for cross decomposition /usr/local/lib/python3.10/dist-packages/sklearn/ in fit_transform(self, X, y, **fit_params) 914 if y is None: 915 # fit method of arity 1 (unsupervised transformation) --> 916 return, **fit_params).transform(X) 917 else: 918 # fit method of arity 2 (supervised transformation) /usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/ in fit(self, X, y, sample_weight) 837 # Reset internal state before fitting 838 self._reset() --> 839 return self.partial_fit(X, y, sample_weight) 840 841 @_fit_context(prefer_skip_nested_validation=True) /usr/local/lib/python3.10/dist-packages/sklearn/ in wrapper(estimator, *args, **kwargs) 1150 ) 1151 ): -> 1152 return fit_method(estimator, *args, **kwargs) 1153 1154 return wrapper /usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/ in partial_fit(self, X, y, sample_weight) 873 """ 874 first_call = not hasattr(self, "n_samples_seen_") --> 875 X = self._validate_data( 876 X, 877 accept_sparse=("csr", "csc"), /usr/local/lib/python3.10/dist-packages/sklearn/ in _validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params) 603 out = X, y 604 elif not no_val_X and no_val_y: --> 605 out = check_array(X, input_name="X", **check_params) 606 elif no_val_X and not no_val_y: 607 out = _check_y(y, **check_params) /usr/local/lib/python3.10/dist-packages/sklearn/utils/ in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name) 834 # Use the original dtype for conversion if dtype is None 835 new_dtype = dtype_orig if dtype is None else dtype --> 836 array = array.astype(new_dtype) 837 # Since we converted here, we do not need to convert again later 838 dtype = None /usr/local/lib/python3.10/dist-packages/pandas/core/ in astype(self, dtype, copy, errors) 6532 else: 6533 # else, only a single dtype is given -> 6534 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 6535 res = self._constructor_from_mgr(new_data, axes=new_data.axes) 6536 return res.__finalize__(self, method="astype") /usr/local/lib/python3.10/dist-packages/pandas/core/internals/ in astype(self, dtype, copy, errors) 412 copy = False 413 --> 414 return self.apply( 415 "astype", 416 dtype=dtype, /usr/local/lib/python3.10/dist-packages/pandas/core/internals/ in apply(self, f, align_keys, **kwargs) 352 applied = b.apply(f, **kwargs) 353 else: --> 354 applied = getattr(b, f)(**kwargs) 355 result_blocks = extend_blocks(applied, result_blocks) 356 /usr/local/lib/python3.10/dist-packages/pandas/core/internals/ in astype(self, dtype, copy, errors, using_cow) 614 values = self.values 615 --> 616 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 617 618 new_values = maybe_coerce_values(new_values) /usr/local/lib/python3.10/dist-packages/pandas/core/dtypes/ in astype_array_safe(values, dtype, copy, errors) 236 237 try: --> 238 new_values = astype_array(values, dtype, copy=copy) 239 except (ValueError, TypeError): 240 # e.g. _astype_nansafe can fail on object-dtype of strings /usr/local/lib/python3.10/dist-packages/pandas/core/dtypes/ in astype_array(values, dtype, copy) 181 182 else: --> 183 values = _astype_nansafe(values, dtype, copy=copy) 184 185 # in pandas we don't store numpy str dtypes, so convert to object /usr/local/lib/python3.10/dist-packages/pandas/core/dtypes/ in _astype_nansafe(arr, dtype, copy, skipna) 132 if copy or arr.dtype == object or dtype == object: 133 # Explicit copy, or required since NumPy can't view from / to object. --> 134 return arr.astype(dtype, copy=True) 135 136 return arr.astype(dtype, copy=copy) ValueError: could not convert string to float: '2/7/14 11:55'
In [ ]: