In [1]:
import bertopic
bertopic.__version__

'0.11.0'

In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# factor analysis
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo
import pingouin as pg
from scipy.stats import zscore

# clustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# nlp
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
from nltk.tokenize import sent_tokenize

# custom
import survey_analytics_library as LIB
import importlib
importlib.reload(LIB)

  return warn(


<module 'survey_analytics_library' from 'e:\\Github\\survey_analytics\\survey_analytics_library.py'>

In [2]:
# load the raw BFI responses and the cleaned question sheet from the same workbook
data = pd.read_excel('data/bfi_data.xlsx', sheet_name='bfi')
data_dict = pd.read_excel('data/bfi_data.xlsx', sheet_name='questions_clean')

# keep only the questionnaire items: remove the demographic columns,
# then remove every row that still has a missing answer
df_factor_analysis = data.drop(columns=['gender', 'education', 'age']).dropna()
# cast each answer from float to int, column by column
for col in df_factor_analysis.columns:
    df_factor_analysis[col] = df_factor_analysis[col].apply(int)

# relabel the 25 item columns positionally as Q1..Q25
bfi_questions = [f'Q{i}' for i in range(1, 26)]
df_factor_analysis.columns = bfi_questions

In [6]:
def read_survey_data():
    """Load the sample BFI survey answers and the matching question table.

    Returns:
        tuple: ``(data_survey, data_questions)`` DataFrames read from
        ``data/bfi_sample_answers.csv`` and ``data/bfi_sample_questions.csv``.
    """
    # forward slashes resolve on Windows, Linux and macOS alike; the previous
    # backslash-separated paths only resolved on Windows
    data_survey = pd.read_csv('data/bfi_sample_answers.csv')
    data_questions = pd.read_csv('data/bfi_sample_questions.csv')
    return data_survey, data_questions
data_survey, data_questions = read_survey_data()

# copy data: the factor analysis works on its own copy of the answers
df_factor_analysis = data_survey.copy()

In [7]:
# show the first 25 rows of the question table, then the answers used for factor analysis
display(data_questions.iloc[:25])
display(df_factor_analysis)

Unnamed: 0,question,description
0,Q1,Am indifferent to the feelings of others.
1,Q2,Inquire about others' well-being.
2,Q3,Know how to comfort others.
3,Q4,Love children.
4,Q5,Make people feel at ease.
5,Q6,Am exacting in my work.
6,Q7,Continue until everything is perfect.
7,Q8,Do things according to a plan.
8,Q9,Do things in a half-way manner.
9,Q10,Waste my time.


Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25
0,2,4,3,4,4,2,3,3,4,4,...,3,4,2,2,3,3,6,3,4,3
1,2,4,5,2,5,5,4,4,3,4,...,3,3,3,5,5,4,2,4,3,3
2,5,4,5,4,4,4,5,4,2,5,...,4,5,4,2,3,4,2,5,5,2
3,4,4,6,5,5,4,4,3,5,5,...,2,5,2,4,1,3,3,4,3,5
4,2,3,3,4,5,4,4,5,3,2,...,2,3,4,4,3,3,3,4,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2431,3,4,3,1,3,5,4,5,3,4,...,5,6,5,5,6,6,1,4,5,2
2432,5,5,5,5,3,5,4,2,3,5,...,4,5,3,5,2,6,1,5,6,2
2433,2,3,5,2,5,5,5,5,1,1,...,3,4,3,3,1,5,1,6,4,3
2434,5,2,2,4,4,5,5,5,2,6,...,5,5,6,4,1,5,2,5,5,1


 # Factor Analysis

In [8]:
# fit a default factor model; it is only used here to read off the eigenvalues
fa = FactorAnalyzer()
fa.fit(X=df_factor_analysis)

# get_eigenvalues() returns a pair; only the first element is used
eigenvalues, _ = fa.get_eigenvalues()
# number of eigenvalues that are at least 1
optimal_factors = sum(1 for value in eigenvalues if value >= 1)
# pair each eigenvalue with its 1-based position so plotly can draw the scree plot
scree_df = pd.DataFrame({
    'Eigenvalues': eigenvalues,
    'Number of Clusters': list(range(1, len(eigenvalues) + 1)),
})

In [9]:
# plot scree plot: eigenvalue (y) against its 1-based position (x) from scree_df
fig = px.line(
    scree_df,
    x='Number of Clusters', 
    y='Eigenvalues',
    markers=True,
    template='simple_white',
    width=800,
    height=500,
    )
# horizontal reference line at eigenvalue = 1, the same threshold used to count optimal_factors
fig.add_hline(y=1, line_width=3, line_color='darkgreen')
fig.show()

In [10]:
# Bartlett's test of sphericity: the null hypothesis is that the correlation matrix is an identity matrix
bartlett_sphericity_stat, p_value = calculate_bartlett_sphericity(x=df_factor_analysis)
print('bartlett sphericity stat p value (above 0.05 is considered inadequate): ', p_value)
# KMO test: measures how well each variable can be predicted by the others;
# returns a per-variable value and an overall value
kmo_per_variable, kmo_total = calculate_kmo(x=df_factor_analysis)
print('kmo total (lower than 0.6 is considered inadequate): ', kmo_total)

bartlett sphericity stat p value (above 0.05 is considered inadequate):  0.0
kmo total (lower than 0.6 is considered inadequate):  0.848539722194922


In [11]:
# define factor analyser model with the number of factors found from the eigenvalues
# and a varimax rotation
fa = FactorAnalyzer(n_factors=optimal_factors, rotation='varimax')
# fit data
fa.fit(df_factor_analysis)
# generate factor loadings: one row per question, one column per factor (columns are 0..n_factors-1)
loads_df = pd.DataFrame(fa.loadings_, index=df_factor_analysis.columns)

In [12]:
# Communality is the proportion of the variable’s variance explained by the factors
communalities_df = pd.DataFrame(fa.get_communalities(), index=df_factor_analysis.columns, columns=['communality'])

# display(loads_df.style.highlight_max(axis=1, props='color:yellow; font-weight:bold;'))
# assign each question to the factor column holding its largest loading
# NOTE(review): argmax compares signed loadings, so a question whose strongest loading
# is negative is grouped by its largest positive one instead — confirm this is intended
communalities_df['factor_group'] = loads_df.apply(lambda s: s.argmax(), axis=1)

# compute cronbach alpha measuring internal consistency [-inf, 1]. Above 0.6 are considered sufficient
# factor_alpha_df starts as a list of [factor, alpha] rows and is turned into a DataFrame below
factor_alpha_df = []
for f_i in loads_df.columns:
    # questions assigned to this factor
    factor_cols = communalities_df[communalities_df['factor_group'] == f_i].index.tolist()
    factor_df = df_factor_analysis[factor_cols]
#     display(factor_df.shape)
    # first element of the value returned by pg.cronbach_alpha is stored as the alpha
    factor_alpha = pg.cronbach_alpha(factor_df)[0]
    factor_alpha_df.append([f_i, factor_alpha])

factor_alpha_df = pd.DataFrame(factor_alpha_df, columns=['factor', 'alpha'])
# display(factor_alpha_df)


In [13]:
# refit the rotated factor model and get one row of factor scores per responder
transformed_df = fa.fit_transform(df_factor_analysis)
transformed_df = pd.DataFrame(transformed_df)
# name the score columns factor_0, factor_1, ...
transformed_df.columns = ['factor_'+str(col) for col in list(transformed_df)]
print(transformed_df.shape)

responder_factors = transformed_df.copy()

# a responder's cluster is the position of the factor with the highest score
responder_factors['cluster'] = responder_factors.apply(lambda s: s.argmax(), axis=1)
# display(loads_df.style.highlight_max(axis=1, props='color:white; font-weight:bold; background-color:darkblue;'))
# NOTE(review): same assignment as in the communalities cell; looks redundant here
communalities_df['factor_group'] = loads_df.apply(lambda s: s.argmax(), axis=1)
# list of factor columns
list_of_factor_cols = [col for col in responder_factors.columns if 'factor_' in col]
# show the first 20 responders, highlighting the highest factor score in each row
display(responder_factors.iloc[:20].style.highlight_max(axis=1, subset=list_of_factor_cols, props='color:white; font-weight:bold; background-color:green;'))

(2436, 6)


Unnamed: 0,factor_0,factor_1,factor_2,factor_3,factor_4,factor_5,cluster
0,-0.350506,0.033583,-1.300285,-0.512135,-1.429475,-0.693275,1
1,0.081829,0.570999,-0.612138,-0.201342,-0.243352,-0.016911,1
2,0.564684,0.327277,0.083021,-0.824345,0.210169,-0.236172,0
3,-0.232325,0.0713,-0.963956,-0.26827,-1.187286,0.834096,5
4,-0.337498,0.364706,-0.137084,-0.798106,-0.675213,-0.190357,1
5,0.27071,1.156427,1.365775,-0.29669,0.260791,-0.040217,2
6,-1.237221,-0.057878,-0.047064,0.099452,0.83063,0.028969,4
7,0.449962,-1.351683,-0.753653,-1.596243,-0.249111,-0.978859,0
8,0.989377,0.783774,1.082102,0.278619,0.163224,-0.574748,2
9,-0.192677,0.342784,-0.226034,-0.208836,-0.354228,-0.003116,1


In [14]:
# attach each responder's cluster label to a re-indexed copy of the raw answers
fa_clusters = (
    df_factor_analysis
    .copy()
    .reset_index(drop=True)
    .assign(cluster=responder_factors['cluster'])
)
# mean answer per question within each cluster
fa_clusters.groupby('cluster').mean().reset_index()

Unnamed: 0,cluster,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25
0,0,2.62069,4.643002,4.243408,4.330629,3.973631,4.271805,4.212982,4.097363,3.066937,...,4.880325,5.158215,4.833671,4.51927,4.233266,4.525355,3.087221,4.1643,5.146045,2.6714
1,1,2.524664,4.887892,4.919283,4.82287,4.94843,4.316143,4.026906,4.150224,2.544843,...,2.943946,3.403587,2.737668,2.134529,2.338565,4.928251,2.816143,4.618834,4.242152,2.560538
2,2,2.611905,4.8,4.388095,5.17381,4.466667,5.245238,5.345238,5.121429,1.388095,...,2.480952,3.164286,2.688095,2.597619,2.704762,4.780952,2.57381,4.159524,4.652381,2.635714
3,3,1.365789,5.547368,5.331579,5.236842,5.076316,4.192105,4.007895,4.313158,2.373684,...,2.007895,2.802632,2.697368,3.0,2.994737,4.394737,2.760526,4.092105,5.021053,2.423684
4,4,2.20202,4.527778,4.123737,3.939394,4.164141,4.631313,4.176768,3.954545,2.411616,...,2.393939,3.257576,2.866162,3.333333,2.224747,5.416667,1.535354,5.260101,5.558081,1.409091
5,5,3.17608,4.318937,4.697674,4.684385,4.810631,4.524917,4.504983,4.146179,3.734219,...,2.322259,2.737542,3.196013,3.554817,3.166113,4.890365,3.401993,4.458472,5.003322,3.219269


In [15]:
# z-score every question column, label rows with the responder's cluster,
# then average the z-scores per cluster and round to two decimals
fa_z_scores = (
    df_factor_analysis
    .copy()
    .reset_index(drop=True)
    .apply(zscore)
    .assign(cluster=responder_factors['cluster'])
    .groupby('cluster')
    .mean()
    .reset_index()
    .apply(lambda column: round(column, 2))
)
fa_z_scores

Unnamed: 0,cluster,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25
0,0,0.15,-0.13,-0.27,-0.24,-0.45,-0.21,-0.12,-0.16,0.38,...,1.23,1.07,1.01,0.84,0.78,-0.26,0.26,-0.24,0.19,0.15
1,1,0.08,0.08,0.24,0.09,0.32,-0.17,-0.26,-0.12,-0.0,...,0.0,-0.07,-0.31,-0.68,-0.39,0.1,0.08,0.14,-0.57,0.07
2,2,0.15,0.0,-0.16,0.33,-0.06,0.58,0.74,0.64,-0.84,...,-0.29,-0.23,-0.34,-0.39,-0.16,-0.03,-0.07,-0.24,-0.23,0.13
3,3,-0.74,0.64,0.56,0.37,0.42,-0.27,-0.28,0.01,-0.13,...,-0.59,-0.47,-0.33,-0.13,0.01,-0.37,0.05,-0.3,0.08,-0.03
4,4,-0.15,-0.23,-0.36,-0.5,-0.3,0.09,-0.15,-0.27,-0.1,...,-0.35,-0.17,-0.22,0.08,-0.46,0.54,-0.74,0.67,0.53,-0.8
5,5,0.55,-0.41,0.08,-0.0,0.21,-0.0,0.1,-0.12,0.86,...,-0.39,-0.51,-0.02,0.22,0.12,0.07,0.46,0.01,0.07,0.57


In [16]:
# green colour map used to shade the table
cm = sns.light_palette('green', as_cmap=True)
# every column except the first one ('cluster') gets the gradient
list_of_question_cols = list(fa_z_scores.iloc[:,1:])
# render the per-cluster z-scores with a background gradient and two-decimal formatting
fa_z_scores.style.background_gradient(cmap=cm, subset=list_of_question_cols).format(precision=2)

Unnamed: 0,cluster,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25
0,0,0.15,-0.13,-0.27,-0.24,-0.45,-0.21,-0.12,-0.16,0.38,0.55,0.08,0.49,-0.27,-0.37,-0.15,1.23,1.07,1.01,0.84,0.78,-0.26,0.26,-0.24,0.19,0.15
1,1,0.08,0.08,0.24,0.09,0.32,-0.17,-0.26,-0.12,-0.0,-0.22,-0.82,-0.9,0.52,0.73,0.46,0.0,-0.07,-0.31,-0.68,-0.39,0.1,0.08,0.14,-0.57,0.07
2,2,0.15,0.0,-0.16,0.33,-0.06,0.58,0.74,0.64,-0.84,-0.88,0.29,-0.06,-0.19,0.0,0.26,-0.29,-0.23,-0.34,-0.39,-0.16,-0.03,-0.07,-0.24,-0.23,0.13
3,3,-0.74,0.64,0.56,0.37,0.42,-0.27,-0.28,0.01,-0.13,0.0,0.0,-0.01,-0.18,0.13,-0.36,-0.59,-0.47,-0.33,-0.13,0.01,-0.37,0.05,-0.3,0.08,-0.03
4,4,-0.15,-0.23,-0.36,-0.5,-0.3,0.09,-0.15,-0.27,-0.1,0.19,0.07,0.12,-0.02,-0.55,0.01,-0.35,-0.17,-0.22,0.08,-0.46,0.54,-0.74,0.67,0.53,-0.8
5,5,0.55,-0.41,0.08,-0.0,0.21,-0.0,0.1,-0.12,0.86,0.39,0.57,0.47,0.17,0.09,-0.35,-0.39,-0.51,-0.02,0.22,0.12,0.07,0.46,0.01,0.07,0.57


In [17]:
# count responders per cluster.
# The axis and the counts are named explicitly because the default column labels of
# value_counts().reset_index() differ between pandas versions ('index'/'cluster'
# before 2.0, 'cluster'/'count' from 2.0 on), which made the old rename mapping
# produce no 'Cluster' column on newer pandas.
cluster_counts = (
    fa_clusters['cluster']
    .value_counts()
    .rename_axis('Cluster')
    .reset_index(name='Count')
)

# donut chart of the share of responders in each cluster
fig = px.pie(
    cluster_counts,
    values='Count', 
    names='Cluster', 
    hole=0.35,
    title='Percentage of Responders in Each Cluster',
    template='simple_white',
    width=1000,
    height=600,
    )
fig.show()

 # Demographic Clustering

In [18]:
# load the customer data and keep only the Age and Income columns for the first clustering
cus_data = pd.read_csv('data/cus_data.csv')
cus_data = cus_data[['Age', 'Income']]
cus_data

Unnamed: 0,Age,Income
0,41,19
1,47,100
2,33,57
3,29,19
4,47,253
...,...,...
845,27,26
846,28,34
847,25,18
848,32,28


In [19]:
# standardise the selected columns with StandardScaler before clustering
scaler = StandardScaler()
cus_data_scaled = scaler.fit_transform(cus_data)

In [20]:
# inertia = LIB.create_elbow_plot_kmeans(cus_data_scaled, 15)
# silhouette_scores = LIB.silhouette_score_plot_kmeans(cus_data_scaled, 15)

In [21]:
# number of clusters used for the Age/Income segmentation
optimal_clusters = 5

# k-means with a fixed random_state
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)

# fit on the standardised features and store each row's label on the customer table
clusters = kmeans.fit_predict(cus_data_scaled)
cus_data['cluster'] = clusters


In [22]:
# plot with clusters as categories: Income on x, Age on y;
# the cluster label is cast to str so plotly colours it as a discrete category
fig = px.scatter(
    cus_data, 
    x='Income', 
    y='Age',
    color=cus_data['cluster'].astype(str),
    template='simple_white',
    width=1200,
    height=600
    )
fig.update_traces(mode='markers')
fig.show()


In [23]:
# reload the customer data (this rebinds cus_data and drops the earlier 'cluster' column)
# and keep four columns for the second clustering
cus_data = pd.read_csv('data/cus_data.csv')
cus_data = cus_data[['Age', 'Edu', 'Years Employed', 'Income']]
cus_data

Unnamed: 0,Age,Edu,Years Employed,Income
0,41,2,6,19
1,47,1,26,100
2,33,2,10,57
3,29,2,4,19
4,47,1,31,253
...,...,...,...,...
845,27,1,5,26
846,28,2,7,34
847,25,4,0,18
848,32,1,12,28


In [24]:
# standardise the four selected columns; rebinds scaler and cus_data_scaled from the earlier cell
scaler = StandardScaler()
cus_data_scaled = scaler.fit_transform(cus_data)


In [26]:
# inertia = LIB.create_elbow_plot_kmeans(cus_data_scaled, 15)
# silhouette_scores = LIB.silhouette_score_plot_kmeans(cus_data_scaled, 15)


In [27]:
# number of clusters used for the four-feature segmentation
optimal_clusters = 7

# k-means with a fixed random_state
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)

# fit on the standardised features and store each row's label on the customer table
clusters = kmeans.fit_predict(cus_data_scaled)
cus_data['cluster'] = clusters


In [28]:
# fit a three-component PCA on the standardised customer features
pca = PCA(n_components=3, random_state=42).fit(cus_data_scaled)
# report explained variance: total, per component, and cumulative
print(f'Total Variance: {pca.explained_variance_ratio_.sum()}')
print(f'Variance per Component: {pca.explained_variance_ratio_}')
print(f'Cumulative Variance: {pca.explained_variance_ratio_.cumsum()}')

# project the data onto the fitted components and keep the result as a dataframe;
# from here on the name `pca` refers to that dataframe, not to the fitted model
pca = pd.DataFrame(pca.transform(cus_data_scaled))


Total Variance: 0.9317066077075122
Variance per Component: [0.52689815 0.27380438 0.13100408]
Cumulative Variance: [0.52689815 0.80070253 0.93170661]


In [29]:
# plot with clusters as categories: component columns 0 and 1 of the pca dataframe,
# coloured by the k-means cluster label (cast to str so it is treated as a category)
fig = px.scatter(
    pca, 
    x=0, 
    y=1,
    color=cus_data['cluster'].astype(str),
    template='simple_white',
    width=1200,
    height=600
    )
fig.update_traces(mode='markers')
fig.show()


In [30]:
# plot with clusters as categories: all three component columns of the pca dataframe in 3D,
# coloured by the k-means cluster label (cast to str so it is treated as a category)
fig = px.scatter_3d(
    pca, 
    x=0, 
    y=1,
    z=2,
    color=cus_data['cluster'].astype(str),
    template='simple_white',
    width=1200,
    height=600
    )
fig.update_traces(mode='markers')
fig.show()


 # Text Analysis

 ## Data Cleaning

In [3]:
# load the Tokyo Olympics tweets; the 'text' column is used by the cells below
tokyo = pd.read_csv('data/tokyo_olympics_tweets.csv')

In [4]:
# define list of regex patterns for replacement: (pattern, replacement) pairs,
# every pattern here is replaced with an empty string
# NOTE(review): if LIB.clean_text applies the pairs in order, 'tokyoolympics' can never
# match because 'tokyo' and 'olympics?' were already removed — confirm against LIB
list_of_replacements = [
    (r'tokyo', ''), 
    (r'olympics?', ''),
    (r'tokyoolympics', ''),
    (r'2020', ''),
    (r'2021', ''),
    (r'games?', ''),
    (r'https?', ''),
    ]

# clean every tweet with LIB.clean_text and store the result in 'clean_text'.
# lowercase=False is passed, so the text is presumably NOT lowercased, and ignorecase=True
# presumably makes the pattern matching case-insensitive — TODO confirm in LIB
tokyo['clean_text'] = tokyo['text'].apply(lambda x: LIB.clean_text(x, list_of_replacements, lowercase=False, ignorecase=True))

In [5]:
# cleaned tweets as a plain list, followed by how many there are
docs = list(tokyo['clean_text'])
len(docs)

10000

 ## BERTopic

In [17]:
# UMAP configuration handed to BERTopic; random_state is fixed
umap_model = UMAP(
    n_neighbors=15, 
    n_components=10, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=42
    )

# NOTE(review): embedding_model is an absolute path on the author's machine —
# this cell will not run elsewhere without changing it
topic_model = BERTopic(
    embedding_model=r'E:\Github\huggingface\all-MiniLM-L6-v2',
    umap_model=umap_model,
    vectorizer_model=CountVectorizer(ngram_range=(1,2), stop_words='english'),
    nr_topics='auto',
    min_topic_size=15,
    calculate_probabilities=False, 
    verbose=True
    )

# topics, probs = topic_model.fit_transform(docs)
# the model is fitted on the raw tweet text, not on the cleaned `docs` list,
# and then saved under models/ with the "_unclean" suffix
topics, probs = topic_model.fit_transform(list(tokyo['text']))
topic_model.save(r'models/bertopic_model_tokyo_olympics_tweets_unclean')


2022-07-14 19:54:24.827 INFO    sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: E:\Github\huggingface\all-MiniLM-L6-v2
2022-07-14 19:54:25.088 INFO    sentence_transformers.SentenceTransformer: Use pytorch device: cpu


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2022-07-14 19:56:25,841 - BERTopic - Transformed documents to Embeddings
2022-07-14 19:56:54,517 - BERTopic - Reduced dimensionality
2022-07-14 19:56:55,097 - BERTopic - Clustered reduced embeddings
2022-07-14 19:56:59,646 - BERTopic - Reduced number of topics from 117 to 67


In [19]:
def load_bertopic_model():
    """Load and return the BERTopic model saved under models/ for the uncleaned tweets."""
    return BERTopic.load(r'models/bertopic_model_tokyo_olympics_tweets_unclean')
topic_model = load_bertopic_model()
# topics, probs = topic_model.transform(docs)

In [20]:
# vectorizer_model = CountVectorizer(
#     ngram_range=(1,2), 
#     stop_words='english'
#     )
# topic_model.update_topics(docs, topics, vectorizer_model=vectorizer_model)

In [21]:
# hand-written titles for eight topics, passed as subplot_titles to
# LIB.visualize_barchart_titles further down
# NOTE(review): presumably listed in the same order as the top 8 topics — verify against the model
labelled_topics = [
    'Mirabai Chanu (Indian Weightlifter)',
    'Hockey',
    'Barbra Banda (Zambian Football Player)',
    'Sutirtha Mukherjee (Indian Table Tennis Player)',
    'Vikas Krishan (Indian Boxer)',
    'Road Race',
    'Brendon Smith (Australian Swimmer)',
    'Sam Kerr (Australian Footballer)',
    ]

In [22]:
# table of topics with their counts and generated names
display(topic_model.get_topic_info())

# topic_model.visualize_barchart(
#     n_words=5,
#     top_n_topics=8,
#     width=300, 
#     height=300
#     )

# custom bar chart helper from the project library: 5 words for each of the top 8 topics,
# using the hand-written labels as subplot titles
LIB.visualize_barchart_titles(
    topic_model=topic_model,
    subplot_titles=labelled_topics,
    n_words=5,
    top_n_topics=8,
    height=300
)

Unnamed: 0,Topic,Count,Name
0,-1,3144,-1_https_tokyo2020_india_medal
1,0,1221,0_medal_silver_india_mirabaichanu
2,1,687,1_olympics_swimming_olympics tokyo2020_tokyo2020
3,2,649,2_round_sutirtha_vikas_mukherjee
4,3,398,3_banda_zambia_barbra_barbra banda
...,...,...,...
62,61,17,61_2020 india_start tokyo_asked happier_happie...
63,62,17,62_ausvswe_aussies_aus_ausvswe tokyo2020
64,63,15,63_handball_esp_tokyo2020 handball_27
65,64,15,64_hend_hend zaza_zaza_youngest


In [43]:
# first five entries returned for topic 8 (this expression's value is not displayed,
# only the last line of the cell is)
topic_model.get_topic(8)[:5]
# keep just the first element of each of those entries (the word itself, per the output below)
[i[0] for i in topic_model.get_topic(8)[:5]]

['tennis', 'round', 'murray', 'singles', 'nagal']

In [44]:
# results = pd.DataFrame({'Tweet':tokyo['text']}).copy()
# results['Topic'] = topics
# load previously saved tweet/topic assignments instead of rebuilding them from `topics`
results = pd.read_csv(r'data/topic_results.csv')
# results.to_csv('test.csv', index=False)
# show the tweets assigned to topic 8
results.loc[(results['Topic'] == 8)]

Unnamed: 0,Tweet,Topic
11,Gymnastics ❤️ #Tokyo2020,8
39,Gymnastics on the TV and I’m happy\r\n\r\n#Tok...,8
62,@channel7 Please do not switch from Aussie swi...,8
260,With no official fans here at the #Tokyo2020 ...,8
266,*watches the men's gymnastics for two minutes*...,8
...,...,...
9901,"Gymnastics impossible to watch, can’t hear the...",8
9905,@BBCSport #Tokyo2020 Gymnastics. Why can we no...,8
9917,Nyjah Huston and skateboarding hit the Olympic...,8
9937,Cor what a wonderful morning watching the gymn...,8


In [15]:
# `List` is needed when the signature below is evaluated; importing it here keeps this
# cell runnable top-to-bottom (the only other import of it sits in a later cell)
from typing import List


def _plotly_topic_visualization(df: pd.DataFrame,
                                topic_list: List[str],
                                width: int,
                                height: int):
    """ Create plotly-based visualization of topics with a slider for topic selection

    Arguments:
        df: One row per topic with the columns "x", "y", "Topic", "Words" and "Size"
            (the columns read by the scatter plot and the hover template below).
        topic_list: Topic identifiers; one slider step is created per entry.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        The plotly figure produced by `px.scatter` after styling.
    """

    def get_color(topic_selected):
        # one colour per topic: red for the selected topic, grey for the rest;
        # selecting -1 leaves every marker grey
        if topic_selected == -1:
            marker_color = ["#B0BEC5" for _ in topic_list]
        else:
            marker_color = ["red" if topic == topic_selected else "#B0BEC5" for topic in topic_list]
        return [{'marker.color': [marker_color]}]

    # Prepare figure range: pad the data extent by 15% of |min| / |max| on each side
    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))

    # Plot topics as bubbles sized by the "Size" column
    fig = px.scatter(df, x="x", y="y", size="Size", size_max=40, template="simple_white", labels={"x": "", "y": ""},
                     hover_data={"Topic": True, "Words": True, "Size": True, "x": False, "y": False})
    fig.update_traces(marker=dict(color="#B0BEC5", line=dict(width=2, color='DarkSlateGrey')))

    # Update hover order
    fig.update_traces(hovertemplate="<br>".join(["<b>Topic %{customdata[0]}</b>",
                                                 "Words: %{customdata[1]}",
                                                 "Size: %{customdata[2]}"]))

    # Create a slider for topic selection; each step recolours the markers via get_color
    steps = [dict(label=f"Topic {topic}", method="update", args=get_color(topic)) for topic in topic_list]
    sliders = [dict(active=0, pad={"t": 50}, steps=steps)]

    # Stylize layout
    fig.update_layout(
        title={
            'text': "<b>Intertopic Distance Map",
            'y': .95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
        xaxis={"visible": False},
        yaxis={"visible": False},
        sliders=sliders
    )

    # Update axes ranges
    fig.update_xaxes(range=x_range)
    fig.update_yaxes(range=y_range)

    # Add grid in a 'plus' shape: one vertical and one horizontal line through the
    # midpoint of the padded ranges, labelled D1 and D2
    fig.add_shape(type="line",
                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
                  line=dict(color="#CFD8DC", width=2))
    fig.add_shape(type="line",
                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
                  line=dict(color="#9E9E9E", width=2))
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
    # reverse the order of the figure's traces
    fig.data = fig.data[::-1]

    return fig

In [14]:
from typing import List
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
def visualize_topics_modified(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     width: int = 650,
                     height: int = 650) -> go.Figure:
    """ Visualize topics, their sizes, and their corresponding words

    This visualization is highly inspired by LDAvis, a great visualization
    technique typically reserved for LDA.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        The figure built by `_plotly_topic_visualization`.

    Usage:

    To visualize the topics simply run:

    ```python
    visualize_topics_modified(topic_model)
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = visualize_topics_modified(topic_model)
    fig.write_html("path/to/file.html")
    ```
    <iframe src="../../getting_started/visualization/viz.html"
    style="width:1000px; height: 680px; border: 0px;"></iframe>
    """
    # Select topics based on top_n and topics args; the outlier topic -1 is never plotted
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    # Extract topic words and their frequencies, in sorted topic order
    topic_list = sorted(topics)
    frequencies = [topic_model.topic_sizes[topic] for topic in topic_list]
    words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]

    # Embed c-TF-IDF into 2D.
    # Rows are selected in `topic_list` order so that every embedding lines up with the
    # topic id, words and size placed in the same dataframe row below; indexing by the
    # caller-supplied order mislabelled points whenever `topics` was passed unsorted.
    all_topics = sorted(list(topic_model.get_topics().keys()))
    indices = np.array([all_topics.index(topic) for topic in topic_list])
    embeddings = topic_model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings)
    # NOTE(review): no random_state is set here, so the layout may differ between runs
    embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger').fit_transform(embeddings)

    # Visualize with plotly
    df = pd.DataFrame({"x": embeddings[:, 0], "y": embeddings[:, 1],
                       "Topic": topic_list, "Words": words, "Size": frequencies})
    return _plotly_topic_visualization(df, topic_list, width, height)

In [45]:
# NOTE(review): this calls BERTopic's built-in method, not visualize_topics_modified
# defined in this notebook — confirm which one is intended
topic_model.visualize_topics()

 ## Transformers

 ### Sentiment Analysis

In [56]:
# load the IMDB reviews; the 'review' column is used by the cells below
imdb = pd.read_csv(r'data/imdb.csv')

In [57]:
# take the review at index label 1 as the running example
sample = imdb['review'][1]

# define list of regex patterns for replacement
# (rebinds list_of_replacements, which earlier held the tweet patterns)
list_of_replacements = [
    (r'<br />', ' '), 
    ]
# despite its name, tokenised_sample is the cleaned review as one string at this point;
# it is only split into sentences in a later cell
tokenised_sample = LIB.clean_text(sample, list_of_replacements, lowercase=False)

tokenised_sample

'A wonderful little production.   The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece.   The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life.   The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [58]:
# sentiment pipeline loaded from a local model directory
# NOTE(review): the model path is an absolute path on the author's machine (and on a
# different drive than the BERTopic embedding model) — not portable as written
# NOTE(review): newer transformers releases reportedly deprecate return_all_scores in
# favour of top_k — confirm against the installed version
classifier_sentiment = pipeline(
    task='sentiment-analysis', 
    model=r'C:\Projects\huggingface\distilbert-base-uncased-finetuned-sst-2-english', 
    return_all_scores=True
    )

In [112]:
# score the cleaned example review; the output lists a score for each label
classifier_sentiment(tokenised_sample)


[[{'label': 'NEGATIVE', 'score': 0.0007469278643839061},
  {'label': 'POSITIVE', 'score': 0.9992530941963196}]]

 ### Zero-Shot Classification

In [66]:
# zero-shot classification pipeline loaded from a local model directory
# NOTE(review): the model path is an absolute path on the author's machine — not portable
# NOTE(review): unclear whether return_all_scores has any effect for this task — confirm
classifier_zero_shot = pipeline(
    task='zero-shot-classification', 
    model=r'C:\Projects\huggingface\distilbart-mnli-12-1', 
    return_all_scores=True
    )


In [67]:
# classify the example review against three candidate labels
# (this passes the raw `sample`, which still contains the <br /> tags)
classifier_zero_shot(sample, ['violent', 'romance', 'comedy'])

{'sequence': 'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are te

In [47]:
# define list of regex patterns for replacement (same single pattern as in the sentiment section)
list_of_replacements = [
    (r'<br />', ' '), 
    ]
# clean the example review again so the next cell starts from a single string
tokenised_sample = LIB.clean_text(sample, list_of_replacements, lowercase=False)

In [115]:
# split the cleaned review into sentences; tokenised_sample changes from a string to a list
# of strings, so re-running this cell without re-running the previous one would pass a list
# to sent_tokenize
tokenised_sample = sent_tokenize(tokenised_sample)
tokenised_sample

['A wonderful little production.',
 'The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece.',
 'The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too!',
 "You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece.",
 "A masterful production about one of the great master's of comedy and his life.",
 "The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears.",
 "It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are terribly well done."]

In [None]:
# define list of regex patterns for replacement, used for the whole IMDB frame in the next cell
list_of_replacements = [
    (r'<br />', ' '), 
    ]


In [61]:
# work on a copy of the reviews and add a cleaned version of every review
imdb_df = imdb.copy()
imdb_df['review_clean'] = imdb_df['review'].apply(lambda x: LIB.clean_text(x, list_of_replacements, lowercase=False, ignorecase=False))
imdb_df

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming t...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is..."
...,...,...,...
4995,An interesting slasher film with multiple susp...,negative,An interesting slasher film with multiple susp...
4996,i watched this series when it first came out i...,positive,i watched this series when it first came out i...
4997,Once again Jet Li brings his charismatic prese...,positive,Once again Jet Li brings his charismatic prese...
4998,"I rented this movie, after hearing Chris Gore ...",negative,"I rented this movie, after hearing Chris Gore ..."


In [62]:
# sentence-tokenise every cleaned review and flatten the result into a single list,
# keeping only sentences whose string length is three or more
sample_data_sentences = [
    sentence
    for review in imdb_df['review_clean']
    for sentence in sent_tokenize(review)
    if len(sentence) >= 3
]

In [64]:
# labels the zero-shot classifier scores each sentence against
candidate_labels = [
    'action',
    'romance',
    'comedy',
    'horror',
]

In [70]:
# get prediction on the first 50 sentences
zero_shot_classification = classifier_zero_shot(list(sample_data_sentences[:50]), candidate_labels)
# alternative inputs kept for reference (note: `zero_shot_pipline` is not defined in this notebook)
# zero_shot_classification = zero_shot_pipline(list(sample_data_sentences[:100]), candidate_labels)
# get prediction on list of inputs
# zero_shot_classification = zero_shot_pipline(manual_samples, candidate_labels)

# convert transformer model zero shot classification prediction into dataframe
zero_shot_results_sample = LIB.convert_zero_shot_classification_output_to_dataframe(zero_shot_classification)

# replace values with nan where the input sequence is a single space;
# the loop covers every column after the first, not only the numeric score columns
for col in zero_shot_results_sample.iloc[:,1:]:
    zero_shot_results_sample[col] = np.where(zero_shot_results_sample['sequence'] == ' ', np.nan, zero_shot_results_sample[col])

# add prefix
zero_shot_results_sample = zero_shot_results_sample.add_prefix('zero_shot_')

# display
zero_shot_results_sample

Unnamed: 0,zero_shot_sequence,zero_shot_labels_scores,zero_shot_action,zero_shot_comedy,zero_shot_horror,zero_shot_romance,zero_shot_label,zero_shot_score
0,One of the other reviewers has mentioned that ...,"{'action': 0.6853489279747009, 'comedy': 0.222...",0.685349,0.222931,0.067338,0.024382,action,0.685349
1,"They are right, as this is exactly what happen...","{'action': 0.730780839920044, 'horror': 0.2363...",0.730781,0.023847,0.236363,0.009009,action,0.730781
2,The first thing that struck me about Oz was it...,"{'action': 0.5329533815383911, 'horror': 0.463...",0.532953,0.002117,0.463369,0.001561,action,0.532953
3,"Trust me, this is not a show for the faint hea...","{'horror': 0.4558163583278656, 'action': 0.439...",0.43984,0.07689,0.455816,0.027454,horror,0.455816
4,This show pulls no punches with regards to dru...,"{'action': 0.8655638098716736, 'horror': 0.079...",0.865564,0.050077,0.079226,0.005133,action,0.865564
5,"Its is hardcore, in the classic use of the word.","{'action': 0.5196256041526794, 'horror': 0.460...",0.519626,0.012331,0.460286,0.007758,action,0.519626
6,It is called OZ as that is the nickname given ...,"{'action': 0.8266003131866455, 'horror': 0.081...",0.8266,0.078628,0.081999,0.012773,action,0.8266
7,"It focuses mainly on Emerald City, an experime...","{'action': 0.7694476246833801, 'comedy': 0.139...",0.769448,0.1399,0.083226,0.007427,action,0.769448
8,"Em City is home to many..Aryans, Muslims, gang...","{'action': 0.8719258308410645, 'comedy': 0.072...",0.871926,0.072374,0.047274,0.008426,action,0.871926
9,I would say the main appeal of the show is due...,"{'action': 0.7466148138046265, 'comedy': 0.144...",0.746615,0.14496,0.071596,0.03683,action,0.746615
