In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import zipfile as zp
import os
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
In [ ]:
# Read the data
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')
# Display the data
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  profile_yn_gold        50 non-null     object 
 16  profileimage           20050 non-null  object 
 17  retweet_count          20050 non-null  int64  
 18  sidebar_color          20050 non-null  object 
 19  text                   20050 non-null  object 
 20  tweet_coord            159 non-null    object 
 21  tweet_count            20050 non-null  int64  
 22  tweet_created          20050 non-null  object 
 23  tweet_id               20050 non-null  float64
 24  tweet_location         12565 non-null  object 
 25  user_timezone          12252 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(17)
memory usage: 3.8+ MB
Out[ ]:
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn | profile_yn:confidence | created | ... | profileimage | retweet_count | sidebar_color | text | tweet_coord | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | male | 1.0000 | yes | 1.0 | 12/5/13 1:48 | ... | https://pbs.twimg.com/profile_images/414342229... | 0 | FFFFFF | Robbie E Responds To Critics After Win Against... | NaN | 110964 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | male | 1.0000 | yes | 1.0 | 10/1/12 13:51 | ... | https://pbs.twimg.com/profile_images/539604221... | 0 | C0DEED | ÛÏIt felt like they were my friends and I was... | NaN | 7471 | 10/26/15 12:40 | 6.587300e+17 | NaN | Eastern Time (US & Canada) |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | male | 0.6625 | yes | 1.0 | 11/28/14 11:30 | ... | https://pbs.twimg.com/profile_images/657330418... | 1 | C0DEED | i absolutely adore when louis starts the songs... | NaN | 5617 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | male | 1.0000 | yes | 1.0 | 6/11/09 22:39 | ... | https://pbs.twimg.com/profile_images/259703936... | 0 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | NaN | 1693 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | female | 1.0000 | yes | 1.0 | 4/16/14 13:23 | ... | https://pbs.twimg.com/profile_images/564094871... | 0 | 0 | Watching Neighbours on Sky+ catching up with t... | NaN | 31462 | 10/26/15 12:40 | 6.587300e+17 | NaN | NaN |
5 rows × 26 columns
Handling Missing Data¶
In [ ]:
# Dropping columns with more than 90% missing values
df_cleaned = df.drop(columns=['gender_gold', 'profile_yn_gold', 'tweet_coord'])
# Fill missing values in 'description' and 'tweet_location' with the placeholder 'Unknown'
df_cleaned['description'] = df_cleaned['description'].fillna('Unknown')
df_cleaned['tweet_location'] = df_cleaned['tweet_location'].fillna('Unknown')
# Drop rows where 'gender' is missing (only a small fraction of the data)
df_cleaned = df_cleaned.dropna(subset=['gender'])
# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])
# With the missing data handled, we can proceed with further analysis
df_cleaned.info() # Display the structure of the cleaned dataset
df_cleaned.head() # Display the first few rows of the cleaned dataset
<class 'pandas.core.frame.DataFrame'>
Index: 19953 entries, 0 to 20049
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               19953 non-null  int64  
 1   _golden                19953 non-null  bool   
 2   _unit_state            19953 non-null  object 
 3   _trusted_judgments     19953 non-null  int64  
 4   _last_judgment_at      19903 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      19953 non-null  float64
 7   profile_yn:confidence  19953 non-null  float64
 8   created                19953 non-null  object 
 9   description            19953 non-null  object 
 10  fav_number             19953 non-null  int64  
 11  link_color             19953 non-null  object 
 12  name                   19953 non-null  object 
 13  profileimage           19953 non-null  object 
 14  retweet_count          19953 non-null  int64  
 15  sidebar_color          19953 non-null  object 
 16  text                   19953 non-null  object 
 17  tweet_count            19953 non-null  int64  
 18  tweet_created          19953 non-null  object 
 19  tweet_id               19953 non-null  float64
 20  tweet_location         19953 non-null  object 
 21  user_timezone          12185 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(13)
memory usage: 3.4+ MB
Out[ ]:
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn:confidence | created | description | ... | name | profileimage | retweet_count | sidebar_color | text | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | male | 1.0000 | 1.0 | 12/5/13 1:48 | i sing my own rhythm. | ... | sheezy0 | https://pbs.twimg.com/profile_images/414342229... | 0 | FFFFFF | Robbie E Responds To Critics After Win Against... | 110964 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | male | 1.0000 | 1.0 | 10/1/12 13:51 | I'm the author of novels filled with family dr... | ... | DavdBurnett | https://pbs.twimg.com/profile_images/539604221... | 0 | C0DEED | ÛÏIt felt like they were my friends and I was... | 7471 | 10/26/15 12:40 | 6.587300e+17 | Unknown | Eastern Time (US & Canada) |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | male | 0.6625 | 1.0 | 11/28/14 11:30 | louis whining and squealing and all | ... | lwtprettylaugh | https://pbs.twimg.com/profile_images/657330418... | 1 | C0DEED | i absolutely adore when louis starts the songs... | 5617 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | male | 1.0000 | 1.0 | 6/11/09 22:39 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | ... | douggarland | https://pbs.twimg.com/profile_images/259703936... | 0 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | 1693 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | female | 1.0000 | 1.0 | 4/16/14 13:23 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | ... | WilfordGemma | https://pbs.twimg.com/profile_images/564094871... | 0 | 0 | Watching Neighbours on Sky+ catching up with t... | 31462 | 10/26/15 12:40 | 6.587300e+17 | Unknown | NaN |
5 rows × 22 columns
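As an optional sanity check (not in the original notebook), the remaining missing values per column can be listed before moving on:

In [ ]:
# Count remaining nulls per column, largest first
df_cleaned.isna().sum().sort_values(ascending=False).head()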
Exploratory Data Analysis (EDA)¶
In [ ]:
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()
In [ ]:
# Distribution of tweet count
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['tweet_count'], kde=True, bins=30)
plt.title('Distribution of Tweet Count')
plt.xlabel('Tweet Count')
plt.ylabel('Count')
plt.show()
In [ ]:
# Distribution of retweet count
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['retweet_count'], kde=True, bins=30)
plt.title('Distribution of Retweet Count')
plt.xlabel('Retweet Count')
plt.ylabel('Count')
plt.show()
In [ ]:
# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
In [ ]:
# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
<ipython-input-9-329074fae944>:2: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
<ipython-input-9-329074fae944>:3: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year
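The warning above can be silenced by passing an explicit format; a minimal sketch, assuming every timestamp follows the 'month/day/2-digit-year hour:minute' pattern visible in the sample rows (unparseable values become NaT instead of raising):

In [ ]:
# Assumes all 'created' / 'tweet_created' values look like '12/5/13 1:48'
date_fmt = '%m/%d/%y %H:%M'
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created'], format=date_fmt, errors='coerce').dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created'], format=date_fmt, errors='coerce').dt.year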
In [ ]:
# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('Count')
plt.show()
In [ ]:
# Exploring 'link_color' and 'sidebar_color' features
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:10].index)
plt.title('Top 10 Most Common Profile Link Colors')
plt.ylabel('Link Color')
plt.xlabel('Count')
plt.show()
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:10].index)
plt.title('Top 10 Most Common Sidebar Colors')
plt.ylabel('Sidebar Color')
plt.xlabel('Count')
plt.show()
Preprocessing¶
In [ ]:
# Remove rows with gender 'unknown'; .copy() avoids the SettingWithCopyWarning on later assignments
df_cleaned = df_cleaned[df_cleaned['gender'] != 'unknown'].copy()
# Scaling numerical features
scaler = StandardScaler()
df_cleaned[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']])
# Encode the target for human/non-human classification: male and female => 0 (human), brand => 1 (non-human)
df_cleaned.loc[df_cleaned['gender'] == 'male', 'gender'] = 0
df_cleaned.loc[df_cleaned['gender'] == 'female', 'gender'] = 0
df_cleaned.loc[df_cleaned['gender'] == 'brand', 'gender'] = 1
# Check the first few rows of the preprocessed data
df_cleaned.head()
Out[ ]:
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | gender | gender:confidence | profile_yn:confidence | created | description | ... | retweet_count | sidebar_color | text | tweet_count | tweet_created | tweet_id | tweet_location | user_timezone | profile_created_year | tweet_created_year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 815719226 | False | finalized | 3 | 10/26/15 23:24 | 0 | 1.0000 | 1.0 | 12/5/13 1:48 | i sing my own rhythm. | ... | -0.030196 | FFFFFF | Robbie E Responds To Critics After Win Against... | 0.602953 | 10/26/15 12:40 | 6.587300e+17 | main; @Kan1shk3 | Chennai | 2013 | 2015 |
1 | 815719227 | False | finalized | 3 | 10/26/15 23:30 | 0 | 1.0000 | 1.0 | 10/1/12 13:51 | I'm the author of novels filled with family dr... | ... | -0.030196 | C0DEED | ÛÏIt felt like they were my friends and I was... | -0.265805 | 10/26/15 12:40 | 6.587300e+17 | Unknown | Eastern Time (US & Canada) | 2012 | 2015 |
2 | 815719228 | False | finalized | 3 | 10/26/15 23:33 | 0 | 0.6625 | 1.0 | 11/28/14 11:30 | louis whining and squealing and all | ... | 0.335804 | C0DEED | i absolutely adore when louis starts the songs... | -0.281368 | 10/26/15 12:40 | 6.587300e+17 | clcncl | Belgrade | 2014 | 2015 |
3 | 815719229 | False | finalized | 3 | 10/26/15 23:10 | 0 | 1.0000 | 1.0 | 6/11/09 22:39 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | ... | -0.030196 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | -0.314308 | 10/26/15 12:40 | 6.587300e+17 | Palo Alto, CA | Pacific Time (US & Canada) | 2009 | 2015 |
4 | 815719230 | False | finalized | 3 | 10/27/15 1:15 | 0 | 1.0000 | 1.0 | 4/16/14 13:23 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | ... | -0.030196 | 0 | Watching Neighbours on Sky+ catching up with t... | -0.064416 | 10/26/15 12:40 | 6.587300e+17 | Unknown | NaN | 2014 | 2015 |
5 rows × 24 columns
In [ ]:
import pandas as pd
# 'gender' was already encoded in the previous cell (male/female => 0, brand => 1);
# the replace below is a safeguard in case the column still holds string labels
df_cleaned['gender'] = df_cleaned['gender'].replace({'male': 0, 'female': 0, 'brand': 1})
# Select numerical columns
numerical_columns = ['gender:confidence', 'profile_yn:confidence', 'fav_number', 'retweet_count', 'tweet_count', 'tweet_id']
# Calculate the Pearson correlation with the target variable 'gender'
correlations = df_cleaned[numerical_columns].corrwith(df_cleaned['gender'])
print("Correlations with target (gender):")
print(correlations)
Correlations with target (gender):
gender:confidence       -0.129078
profile_yn:confidence   -0.007602
fav_number              -0.125455
retweet_count            0.008353
tweet_count              0.119731
tweet_id                -0.122541
dtype: float64
In [ ]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
# List of categorical columns
categorical_columns = ['_unit_state', 'link_color', 'sidebar_color', 'tweet_location', 'user_timezone'] # Add other categorical columns if necessary
# Encode the categorical variables
label_encoder = LabelEncoder()
df_encoded = df_cleaned.copy()
for col in categorical_columns:
    df_encoded[col] = label_encoder.fit_transform(df_encoded[col].astype(str))
# Apply chi-squared test
X = df_encoded[categorical_columns]
y = df_encoded['gender']
chi_scores = chi2(X, y)
print("Chi-square scores:", chi_scores)
Chi-square scores: (array([2.91137794e-01, 2.91763576e+05, 4.20948757e+02, 4.20571655e+04, 5.80444260e+03]), array([5.89492326e-01, 0.00000000e+00, 1.51683635e-93, 0.00000000e+00, 0.00000000e+00]))
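For readability, the two arrays returned by chi2 (scores and p-values) can be paired with their column names; this is an optional addition, not part of the original notebook:

In [ ]:
# Pair each categorical feature with its chi-square score and p-value
chi2_results = pd.DataFrame({'feature': categorical_columns,
                             'chi2': chi_scores[0],
                             'p_value': chi_scores[1]}).sort_values('chi2', ascending=False)
chi2_results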
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
# Compute the correlation matrix for numerical columns including 'gender'
corr_matrix = df_cleaned[numerical_columns + ['gender']].corr()
# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numerical Features and Target")
plt.show()
In [ ]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Keep only the columns of interest
col = ['gender', 'gender:confidence', 'description', 'fav_number', 'link_color',
       'retweet_count', 'sidebar_color', 'text', 'tweet_count', 'tweet_id', 'tweet_location', 'user_timezone'
       ]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Scaling numerical features
scaler = StandardScaler()
df_preprocessed[['tweet_count', 'retweet_count', 'fav_number', 'gender:confidence']] = scaler.fit_transform(df_preprocessed[['tweet_count', 'retweet_count', 'fav_number', 'gender:confidence']])
# List of categorical columns
categorical_columns = ['link_color', 'sidebar_color', 'tweet_location', 'user_timezone']
# Initialize OneHotEncoder (drop='first' prevents the dummy variable trap; sparse_output=False returns a dense array)
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
# Apply OneHotEncoder using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_columns)
    ],
    remainder='passthrough'  # Keeps the rest of the columns as they are
)
# Fit and transform the data
transformed = preprocessor.fit_transform(df_preprocessed)
# Convert the transformed array back to a DataFrame; feature names must come from the
# *fitted* ColumnTransformer, not from the standalone (unfitted) OneHotEncoder instance
encoded_feature_names = preprocessor.get_feature_names_out()
df_preprocessed = pd.DataFrame(transformed, columns=encoded_feature_names, index=df_preprocessed.index)
# 'gender' (passed through as 'remainder__gender') was already encoded earlier: 0 = human, 1 = brand
# Check the first few rows of the preprocessed data
df_preprocessed.head()
In [ ]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Keep only the columns relevant to this preprocessing step
col = ['gender', 'gender:confidence', 'description', 'fav_number','link_color',
'retweet_count', 'sidebar_color', 'text', 'tweet_count','tweet_id'
]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']
# Scaling numerical features
scaler = StandardScaler()
df_preprocessed[['tweet_count', 'retweet_count', 'fav_number']] = scaler.fit_transform(df_preprocessed[['tweet_count', 'retweet_count', 'fav_number']])
# 'gender' was already encoded above (male/female => 0 for human, brand => 1 for non-human); re-apply defensively
df_preprocessed.loc[df_preprocessed['gender'] == 'male', 'gender'] = 0
df_preprocessed.loc[df_preprocessed['gender'] == 'female', 'gender'] = 0
df_preprocessed.loc[df_preprocessed['gender'] == 'brand', 'gender'] = 1
# Check the first few rows of the preprocessed data
df_preprocessed.head()
Out[ ]:
gender | gender:confidence | description | fav_number | link_color | retweet_count | sidebar_color | text | tweet_count | tweet_id | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0000 | i sing my own rhythm. | -0.353977 | 08C2C2 | -0.030196 | FFFFFF | Robbie E Responds To Critics After Win Against... | 0.602953 | 6.587300e+17 |
1 | 0 | 1.0000 | I'm the author of novels filled with family dr... | -0.348524 | 0084B4 | -0.030196 | C0DEED | ÛÏIt felt like they were my friends and I was... | -0.265805 | 6.587300e+17 |
2 | 0 | 0.6625 | louis whining and squealing and all | 0.263273 | ABB8C2 | 0.335804 | C0DEED | i absolutely adore when louis starts the songs... | -0.281368 | 6.587300e+17 |
3 | 0 | 1.0000 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... | -0.337776 | 0084B4 | -0.030196 | C0DEED | Hi @JordanSpieth - Looking at the url - do you... | -0.314308 | 6.587300e+17 |
4 | 0 | 1.0000 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... | 2.639077 | 3B94D9 | -0.030196 | 0 | Watching Neighbours on Sky+ catching up with t... | -0.064416 | 6.587300e+17 |
In [ ]:
# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()
# The class imbalance can be handled either through the model's class_weight parameter or by applying sampling techniques (see the sketch below).
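A minimal sketch of the two options mentioned above, using df_preprocessed and a LogisticRegression purely as an illustrative classifier; neither is part of the original pipeline:

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

# Option 1: let the model reweight classes internally
clf = LogisticRegression(class_weight='balanced', max_iter=1000)

# Option 2: naive random oversampling of the minority class (brand = 1)
majority = df_preprocessed[df_preprocessed['gender'] == 0]
minority = df_preprocessed[df_preprocessed['gender'] == 1]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
df_balanced = pd.concat([majority, minority_upsampled])
df_balanced['gender'].value_counts()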
NLP Processing¶
In [ ]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Out[ ]:
True
In [ ]:
df_status = df_preprocessed.copy()
df_status = pd.concat([df_status['gender'], df_status['description']], axis=1)
df_status
Out[ ]:
gender | description | |
---|---|---|
0 | 0 | i sing my own rhythm. |
1 | 0 | I'm the author of novels filled with family dr... |
2 | 0 | louis whining and squealing and all |
3 | 0 | Mobile guy. 49ers, Shazam, Google, Kleiner Pe... |
4 | 0 | Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T... |
... | ... | ... |
20045 | 0 | (rp) |
20046 | 0 | Whatever you like, it's not a problem at all. ... |
20047 | 0 | #TeamBarcelona ..You look lost so you should f... |
20048 | 0 | Anti-statist; I homeschool my kids. Aspiring t... |
20049 | 0 | Teamwork makes the dream work. |
18836 rows × 2 columns
In [ ]:
# Lowercase the text and strip non-letter characters: "Run" and "run" would otherwise be treated as different tokens
import re
description = []
for x in df_status['description']:
    desc = re.sub("[^a-zA-Z]", " ", x)
    desc = desc.lower()
    description.append(desc)
df_status['description'] = description
df_status
Out[ ]:
gender | description | |
---|---|---|
0 | 0 | i sing my own rhythm |
1 | 0 | i m the author of novels filled with family dr... |
2 | 0 | louis whining and squealing and all |
3 | 0 | mobile guy ers shazam google kleiner pe... |
4 | 0 | ricky wilson the best frontman kaiser chiefs t... |
... | ... | ... |
20045 | 0 | rp |
20046 | 0 | whatever you like it s not a problem at all ... |
20047 | 0 | teambarcelona you look lost so you should f... |
20048 | 0 | anti statist i homeschool my kids aspiring t... |
20049 | 0 | teamwork makes the dream work |
18836 rows × 2 columns
In [ ]:
# Remove English stopwords (e.g. 'i', 'a', 'the', 'an', 'and', 'me') from each description
def remove_stopwords(text):
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words
df_status['tokenized'] = df_status['description'].apply(remove_stopwords)
df_status
Out[ ]:
gender | description | tokenized | |
---|---|---|---|
0 | 0 | i sing my own rhythm | [sing, rhythm] |
1 | 0 | i m the author of novels filled with family dr... | [author, novels, filled, family, drama, romance] |
2 | 0 | louis whining and squealing and all | [louis, whining, squealing] |
3 | 0 | mobile guy ers shazam google kleiner pe... | [mobile, guy, ers, shazam, google, kleiner, pe... |
4 | 0 | ricky wilson the best frontman kaiser chiefs t... | [ricky, wilson, best, frontman, kaiser, chiefs... |
... | ... | ... | ... |
20045 | 0 | rp | [rp] |
20046 | 0 | whatever you like it s not a problem at all ... | [whatever, like, problem, chargernation, forev... |
20047 | 0 | teambarcelona you look lost so you should f... | [teambarcelona, look, lost, follow, follow, he... |
20048 | 0 | anti statist i homeschool my kids aspiring t... | [anti, statist, homeschool, kids, aspiring, th... |
20049 | 0 | teamwork makes the dream work | [teamwork, makes, dream, work] |
18836 rows × 3 columns
In [ ]:
# Convert the tokenized descriptions into a bag-of-words count matrix.
# CountVectorizer expects plain strings, so join each token list back into a single string first.
from sklearn.feature_extraction.text import CountVectorizer
max_features = 1500
corpus = [' '.join(words) for words in df_status['tokenized']]
vectorizer = CountVectorizer(max_features = max_features, stop_words = "english")
X = vectorizer.fit_transform(corpus).toarray()
# let's see X in dataframe
df_ = pd.DataFrame(X, columns=vectorizer.get_feature_names_out(), index=df_status.index)
df_
Out[ ]:
academy | account | achieve | act | action | active | activist | actor | actress | actually | ... | yes | yo | yoga | york | young | youth | youtube | youtuber | yrs | zayn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20045 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20046 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20047 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20048 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20049 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
18836 rows × 1500 columns
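An optional peek (not in the original) at which vocabulary terms dominate the bag-of-words matrix, obtained simply by summing the count columns above:

In [ ]:
# Ten most frequent terms across all descriptions
df_.sum().sort_values(ascending=False).head(10)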
In [ ]:
y = df_preprocessed['gender'].values # Create an array
y # gender ==> our target in the model
Out[ ]:
array([0, 0, 0, ..., 0, 0, 0])
In [ ]:
# Now drop the processed columns ('description', 'text', and categorical) from the original dataset
df_preprocessed = df_preprocessed.drop(columns=['description', 'text', 'link_color', 'sidebar_color'])
# Feature matrix with target information ('gender', 'gender:confidence') removed
df_preprocessed_X = df_preprocessed.drop(columns=['gender', 'gender:confidence'])
# Combine the text features with the other preprocessed features
X_combined = np.hstack((df_preprocessed_X.values, X))
In [ ]:
df_preprocessed.head()
Out[ ]:
gender | gender:confidence | fav_number | retweet_count | tweet_count | tweet_id | |
---|---|---|---|---|---|---|
0 | 0 | 1.0000 | -0.353977 | -0.030196 | 0.602953 | 6.587300e+17 |
1 | 0 | 1.0000 | -0.348524 | -0.030196 | -0.265805 | 6.587300e+17 |
2 | 0 | 0.6625 | 0.263273 | 0.335804 | -0.281368 | 6.587300e+17 |
3 | 0 | 1.0000 | -0.337776 | -0.030196 | -0.314308 | 6.587300e+17 |
4 | 0 | 1.0000 | 2.639077 | -0.030196 | -0.064416 | 6.587300e+17 |
Regression Tasks¶
The chosen regression model is a gradient-boosted decision tree regressor.
In [ ]:
# Regression target: the annotators' confidence in the gender label
y = df_preprocessed["gender:confidence"]
In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split  # needed for the split below
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=10, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model
boosted_reg.fit(X_train, y_train)
Out[ ]:
GradientBoostingRegressor(n_estimators=10, random_state=42)
In [ ]:
from sklearn.metrics import mean_squared_error
# Make predictions
y_pred = boosted_reg.predict(X_test)
# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.027876325391586427
In [ ]:
y_tot_pred = boosted_reg.predict(X_combined)
mse = mean_squared_error(y, y_tot_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 0.02896527615100315
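For context (not part of the original analysis), the MSE can be compared with a constant baseline that always predicts the mean confidence:

In [ ]:
# Baseline: predict the mean gender:confidence for every row
baseline_mse = mean_squared_error(y, np.full(len(y), y.mean()))
print(f"Baseline (predict mean) MSE: {baseline_mse}")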
In [ ]:
boosted_reg.feature_importances_
Out[ ]:
array([0.31033774, 0. , 0.09989529, ..., 0. , 0. , 0. ])
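The raw importance array is hard to interpret on its own. A small sketch that attaches feature names, assuming the column order used when X_combined was assembled (the df_preprocessed_X columns followed by the CountVectorizer vocabulary):

In [ ]:
# Map importances back to feature names and show the ten most important
feature_names = list(df_preprocessed_X.columns) + list(vectorizer.get_feature_names_out())
importances = pd.Series(boosted_reg.feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(10)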
Find the rows with the largest difference in gender confidence
In [ ]:
Out[ ]:
gender:confidence | |
---|---|
0 | 1.0000 |
1 | 1.0000 |
2 | 0.6625 |
3 | 1.0000 |
4 | 1.0000 |
... | ... |
20045 | 1.0000 |
20046 | 1.0000 |
20047 | 1.0000 |
20048 | 0.8489 |
20049 | 1.0000 |
18836 rows × 1 columns
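The code for this step is missing from the source cell above; a minimal sketch of one way to do it, assuming the "difference" of interest is the absolute gap between the labelled gender:confidence (y) and the regressor's prediction (y_tot_pred):

In [ ]:
# Rows where the predicted confidence deviates most from the labelled confidence
conf_diff = np.abs(y.values - y_tot_pred)
df_preprocessed.assign(predicted_confidence=y_tot_pred, abs_error=conf_diff).sort_values('abs_error', ascending=False).head(10)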
Example Usage¶
In [ ]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Re-define the classification target: y was reused above for the regression task (gender:confidence),
# while the classifiers below predict the encoded gender (0 = human, 1 = brand)
y = df_preprocessed['gender'].astype(int).values
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
# Convert data into DMatrix format, which is the format that XGBoost expects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Define XGBoost parameters
params = {
'objective': 'multi:softmax', # Specify the objective for multi-class classification
'num_class': len(np.unique(y_train)), # Number of classes
'max_depth': 3, # Maximum tree depth
'eta': 0.1, # Learning rate
'subsample': 0.8, # Fraction of samples used for training each tree
'colsample_bytree': 0.8, # Fraction of features used for training each tree
'eval_metric': 'mlogloss' # Evaluation metric
}
# Train the XGBoost model
num_round = 100 # Number of boosting rounds
bst = xgb.train(params, dtrain, num_round)
# Make predictions on the test set
y_pred = bst.predict(dtest)
In [ ]:
# Calculate accuracy
accuracy = accuracy_score(y_test.tolist(), y_pred.tolist())
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 1.00
In [ ]:
# Generate the classification report
report = classification_report(y_test.tolist(), y_pred.tolist())
# Print the classification report
print("XGBoost Classification Report:\n", report)
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2585
           1       1.00      1.00      1.00      1183

    accuracy                           1.00      3768
   macro avg       1.00      1.00      1.00      3768
weighted avg       1.00      1.00      1.00      3768
In [ ]:
from sklearn.naive_bayes import GaussianNB
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
# change y_train, y_test dtype from object to int64
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)
# Create a Naive Bayes classifier (Gaussian Naive Bayes)
nb_classifier = GaussianNB()
# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)
# Make predictions on the test data
y_pred = nb_classifier.predict(X_test)
In [ ]:
# Generate the classification report
report = classification_report(y_test, y_pred)
# Print the classification report
print("Naive Bayes Classification Report:\n", report)
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.69      1.00      0.81      2585
           1       0.00      0.00      0.00      1183

    accuracy                           0.69      3768
   macro avg       0.34      0.50      0.41      3768
weighted avg       0.47      0.69      0.56      3768
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
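As the warning suggests, the zero_division parameter controls this behaviour explicitly; an optional variant:

In [ ]:
# Report zeros explicitly instead of emitting UndefinedMetricWarning
print(classification_report(y_test, y_pred, zero_division=0))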
In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from mlxtend.frequent_patterns import apriori, association_rules
# Load the dataset
data = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')  # Replace with actual file path
# Step 1: Data Cleaning
# Drop columns with too many missing values or irrelevant ones
data_cleaned = data.drop(['_unit_id', '_last_judgment_at', 'profileimage', 'tweet_id', 'tweet_created'], axis=1)
# Fill missing values (numerical columns with the mean, categorical with the mode or a placeholder)
data_cleaned['fav_number'] = data_cleaned['fav_number'].fillna(data_cleaned['fav_number'].mean())
data_cleaned['gender'] = data_cleaned['gender'].fillna(data_cleaned['gender'].mode()[0])
data_cleaned['tweet_location'] = data_cleaned['tweet_location'].fillna('Unknown')
# Step 2: Handle Categorical Variables
# Convert categorical columns to numeric using get_dummies
categorical_columns = ['_unit_state', 'gender', 'profile_yn', 'tweet_location']  # Add other categorical columns if necessary
data_cleaned = pd.get_dummies(data_cleaned, columns=categorical_columns, drop_first=True)
# Drop the remaining free-text and date columns (e.g. 'created', 'description', 'text'), which cannot be
# scaled numerically, and fill any remaining numeric NaNs (e.g. 'gender:confidence') with the column mean
data_cleaned = data_cleaned.select_dtypes(exclude=['object'])
data_cleaned = data_cleaned.fillna(data_cleaned.mean(numeric_only=True))
# Step 3: Define Target and Features
# Target: 'fav_number' (example), Features: all other columns
X = data_cleaned.drop('fav_number', axis=1)
y = data_cleaned['fav_number']
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 5: Scale the Data (now that all features are numeric)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Step 6: Fit Ridge Regression Model
ridge_reg = Ridge(alpha=1.0)  # Alpha is the regularization strength
ridge_reg.fit(X_train_scaled, y_train)
# Step 7: Predict and Evaluate
y_pred = ridge_reg.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
# Step 8: Apriori Algorithm (user behaviour patterns expressed as boolean attributes)
# Note: after get_dummies the profile dummy column is named 'profile_yn_yes'
transactions = data_cleaned[['retweet_count', 'profile_yn_yes', 'tweet_location_Unknown']] > 0
# Apply the apriori algorithm
frequent_itemsets = apriori(transactions, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(frequent_itemsets)
print(rules)