<a href="https://colab.research.google.com/github/carlaost/fundingdominance/blob/main/tweet_analysis/tweet_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
!pip install tensorflow_text &> /dev/null
!pip install transformers &> /dev/null

In [None]:
import pandas as pd
import re
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text
import os
from datetime import datetime
import time

In [4]:
# Load the processed `dealid_faces.csv` table from the project repository.
general = pd.read_csv(
    'https://raw.githubusercontent.com/carlaost/fundingdominance/main/data/processed/dealid_faces.csv'
)

# Empty results frame: the first seven columns are copied over from `general`,
# the `tweets_*` / `tweet_count` columns are filled in later by the analysis.
df = pd.DataFrame(
    columns=[
        'deal_id', 'deal_size', 'deal_date', 'ceo_id', 'username', 'network', 'smile',
        'tweets_v', 'tweets_a', 'tweets_d', 'tweet_count',
    ]
)

In [None]:
# Copy the rows of `general` whose `tweets_period` is 'ok' or 'hyperactive'
# into `df`, renaming the source columns to the short names used downstream.
# Done as one vectorized selection instead of `DataFrame.append` in a loop:
# `append` was removed in pandas 2.0 and was quadratic, and rebuilding `df`
# from `general` makes this cell safe to re-run (no duplicated rows).
column_map = {
    'Deal ID': 'deal_id',
    'Deal Size': 'deal_size',
    'Deal Date': 'deal_date',
    'CEO PBId': 'ceo_id',
    'usernames_x': 'username',
    'network size_x': 'network',
    'smile_type_w_x': 'smile',
}
eligible = general['tweets_period'].isin(['ok', 'hyperactive'])
df = (
    general.loc[eligible, list(column_map)]
    .rename(columns=column_map)
    # Keep the column layout declared for `df`; the not-yet-computed
    # tweets_* / tweet_count columns are created here filled with NaN.
    .reindex(columns=df.columns)
    # Later cells address rows by position 0..len(df)-1.
    .reset_index(drop=True)
)

### Functions

In [None]:
def create_model():
  """Build and compile the BERT-based regression model with three heads.

  A cased BERT encoder from TF Hub (loaded with ``trainable=False``) turns the
  input string into a pooled embedding. That embedding passes through two
  64-unit dense layers ('inter1', 'inter2'; no activation specified), and
  three single-unit dense layers named 'V', 'A' and 'D' branch off 'inter2'.

  Returns:
    A ``tf.keras.Model`` mapping a string tensor to ``[V, A, D]`` outputs,
    compiled with the 'rmsprop' optimizer, MSE loss and MAE metric per head.
  """
  head_names = ('V', 'A', 'D')
  dense = tf.keras.layers.Dense

  # Hub layers are instantiated first, in this order, so that automatically
  # generated layer names stay the same as before.
  preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3')
  bert = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4', trainable=False)

  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
  cls_embedding = bert(preprocessor(text_input))['pooled_output']

  # Shared trunk on top of the pooled embedding.
  hidden = dense(64, name='inter1')(cls_embedding)
  hidden = dense(64, name='inter2')(hidden)

  # One scalar output per head, created in V, A, D order.
  heads = [dense(1, name=label)(hidden) for label in head_names]

  model = tf.keras.Model(inputs=text_input, outputs=heads)
  model.compile(
    optimizer='rmsprop',
    loss={label: 'mse' for label in head_names},
    metrics={label: 'mae' for label in head_names})

  return model

In [None]:
def predict(model, clean):
  """Run `model` on a single text and return its three scalar predictions.

  Args:
    model: Object whose ``predict`` accepts a list of strings and returns a
      sequence with (at least) three outputs.
    clean: The text to score; it is wrapped in a one-element list.

  Returns:
    Tuple ``(v, a, d)``: element ``[0][0]`` of the first, second and third
    model output respectively.
  """
  heads = model.predict([clean])
  # presumably each output is shaped (1, 1) for the one-item batch, so
  # [0][0] extracts the scalar -- TODO confirm against the model
  v, a, d = (heads[k][0][0] for k in range(3))
  return v, a, d

In [None]:
def get_info(i):
  """Look up the username and parsed deal date for row `i` of `df`.

  Reads the notebook-level `df` (it is not passed in as an argument).

  Args:
    i: Row label in `df`.

  Returns:
    Tuple ``(username, deal_date)`` where `deal_date` is a ``datetime``
    parsed from the 'deal_date' column using the '%d-%b-%Y' format.
  """
  username = df['username'][i]
  raw_deal_date = df['deal_date'][i]
  return username, datetime.strptime(str(raw_deal_date), '%d-%b-%Y')

In [None]:
def emotion_analysis(f, deal_date):
  """Score every tweet in a CSV that was posted in the window before a deal.

  A tweet is kept when it is strictly earlier than `deal_date` and less than
  365 days older than it. Kept tweets have @-mentions and 'https...' links
  stripped and are then scored with `predict` using the notebook-level
  `model` (it is not passed in as an argument).

  Args:
    f: Path of a CSV file with a 'Datetime' column (parsed with the
      '%Y-%m-%d %H:%M:%S' format) and a 'Text' column.
    deal_date: ``datetime`` of the deal the tweets are compared against.

  Returns:
    Tuple ``(result, tweet_count)``: `result` is a DataFrame with columns
    'text', 'v', 'a', 'd' (one row per kept tweet, empty if none qualify)
    and `tweet_count` is the number of kept tweets.
  """
  tweets = pd.read_csv(f)

  # Look-back window, hoisted out of the loop; the difference of these two
  # dates evaluates to a timedelta of 365 days.
  window = datetime(2022, 5, 27) - datetime(2021, 5, 27)

  # Collect rows in a plain list and build the frame once at the end:
  # DataFrame.append was removed in pandas 2.0 and copied the whole frame
  # on every call.
  rows = []
  for raw_date, raw_text in zip(tweets['Datetime'], tweets['Text']):
    tweet_date = datetime.strptime(str(raw_date), '%Y-%m-%d %H:%M:%S')
    if not (tweet_date < deal_date and (deal_date - tweet_date) < window):
      continue

    clean = re.sub(r'@\w*', '', str(raw_text))
    clean = re.sub(r'https\S*', '', clean)
    v, a, d = predict(model, clean)
    rows.append({'text': clean, 'v': v, 'a': a, 'd': d})

  # `columns=` keeps the expected layout even when no tweet qualified.
  result = pd.DataFrame(rows, columns=['text', 'v', 'a', 'd'])
  return result, len(rows)

In [None]:
def means(result):
  """Return the column means of 'v', 'a' and 'd' as a ``(v, a, d)`` tuple.

  Args:
    result: DataFrame containing the columns 'v', 'a' and 'd'.
  """
  return tuple(result[column].mean() for column in ('v', 'a', 'd'))

In [None]:
def run(df, i):
  """Analyse the tweet file for row `i` and store the averages in `df`.

  Looks up the username and deal date for row `i` via `get_info`, and if the
  matching tweet file exists: scores its tweets with `emotion_analysis`,
  writes the per-tweet scores to a CSV, and writes the mean scores and the
  tweet count into row `i` of `df`. If the file is missing, a message is
  printed and `df` is left untouched.

  Args:
    df: DataFrame with 'tweets_v', 'tweets_a', 'tweets_d' and 'tweet_count'
      columns; modified in place.
    i: Row label to process.

  Returns:
    None.
  """
  username, deal_date = get_info(i)

  f = 'xxx/{}-tweets.csv'.format(username) # add directory / tweet filepath
  if os.path.isfile(f):
    result, tweet_count = emotion_analysis(f, deal_date)
    result.to_csv('xxx/{}_analyzed.csv'.format(username)) # add directory / tweet filepath

    # Write through `df.at[row, column]` rather than chained indexing
    # (`df[col][i] = ...`), which may assign into a temporary copy and leave
    # `df` unchanged.
    v, a, d = means(result)
    df.at[i, 'tweets_v'] = v
    df.at[i, 'tweets_a'] = a
    df.at[i, 'tweets_d'] = d
    df.at[i, 'tweet_count'] = tweet_count
  else:
    # The `else` must pair with the `if` above; it was previously indented
    # into the `if` body, which is a syntax error.
    print(username, ': no file')

### Emotion analysis

This section iterates through all tweet files and requires the EmoBERT model weights to be loaded from a checkpoint.

Tweet files can be obtained using the [tweet scraper](https://github.com/carlaost/fundingdominance/blob/main/data/twitter_scraper.ipynb); the EmoBERT checkpoint can be downloaded [here](https://drive.google.com/drive/folders/1-LTXEh-xGuAyNCZbBqtbQ6rjoJKZUd56?usp=sharing). Before running, replace every `xxx` placeholder in the code with the corresponding file path.

In [None]:
# Build the network and restore its weights from a checkpoint.
# `model` is read as a notebook-level variable by `emotion_analysis`.
model = create_model()
checkpoint = xxx # placeholder: replace `xxx` with the filepath to the model checkpoint
model.load_weights(checkpoint)

In [None]:
# Rows of `df` are processed in chunks of `batch_size`.
batch_size = 50
# `batches` = number of full chunks, `rest` = rows left over after them.
# The remainder must be taken modulo `batch_size` (not modulo `batches`);
# divmod also avoids a ZeroDivisionError when len(df) < batch_size.
batches, rest = divmod(len(df), batch_size)

In [None]:
# Process the full batches, saving `df` after each one so that progress
# survives an interrupted run.
for batch in range(batches):
  start = batch * batch_size
  end = start + batch_size

  for i in range(start, end):
    run(df, i)

  df.to_csv(xxx) # filepath to save output file
  print('Batch no.', batch, '(+1) /', batches, 'done.')

# Process the rows after the last full batch. The start is derived from
# `batches * batch_size`, i.e. exactly where the loop above stopped, so no
# row is skipped or processed twice.
for i in range(batches * batch_size, len(df)):
  run(df, i)

df.to_csv(xxx) # filepath to save output file