There are three points we will cover this time:
- EDA and Text Processing
- Word Cloud Visualization
- Model Building and Evaluation
Steps in EDA and Text Processing
- Checking for missing values
- Examining the class distribution (balance/imbalance)
- Removing special characters
- Headline length distribution
- Headline length distribution: removing outliers
- Filtering: finding sentences that contain numbers
- Analyzing samples containing numbers that represent time, date, or cardinal entities
- 10 random samples: date entity
- 10 random samples: time entity
- 10 random samples: cardinal entity
The word clouds for sarcastic and real headlines differ. The larger a word appears in the word cloud, the more often that word occurs in the headlines (its frequency is high).
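The word clouds themselves are generated later in this post; as a minimal, self-contained sketch of the idea (using hypothetical toy headlines rather than the dataset), the word frequencies that drive the font sizes can be counted directly:
from collections import Counter

# Hypothetical cleaned headlines, standing in for data['clean_headline']
headlines = [
    "area man reports local news",
    "local man wins area award",
    "news report on local election",
]

# WordCloud scales each word's font size from counts like these
counts = Counter(word for line in headlines for word in line.split())
print(counts.most_common(3))  # e.g. [('local', 3), ('area', 2), ('man', 2)]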
Text Classification: Using a Transformer Encoder Block
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.express as px
from plotly.offline import init_notebook_mode
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
import spacy

tqdm.pandas()
spacy_eng = spacy.load("en_core_web_sm")
nltk.download('stopwords')
lemm = WordNetLemmatizer()
init_notebook_mode(connected=True)
sns.set_style("darkgrid")
plt.rcParams['figure.figsize'] = (20,8)
plt.rcParams['font.size'] = 18
data1 = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
data2 = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)

data1 = data1[['headline','is_sarcastic']]
data2 = data2[['headline','is_sarcastic']]

data = pd.concat([data1, data2])
data.reset_index(drop=True, inplace=True)
data
Output:
EDA and Text Preprocessing
- Checking for Missing Values
data.isnull().sum()
'''
OUTPUT
headline 0
is_sarcastic 0
dtype: int64
'''
- Finding the Class Balance/Imbalance
px.bar(data.groupby('is_sarcastic').count().reset_index(), x='headline', title='Count of Sarcastic and Real Headlines')
data['is_sarcastic'].value_counts()
- Special Character Removal
stop_words = stopwords.words('english')
stop_words.remove('not')

def text_cleaning(x):
    headline = re.sub(r'\s+\n+', ' ', x)
    headline = re.sub('[^a-zA-Z0-9]', ' ', x)
    headline = headline.lower()
    headline = headline.split()
    headline = [lemm.lemmatize(word, "v") for word in headline if not word in stop_words]
    headline = ' '.join(headline)
    return headline
def get_entities(x):
    entity = []
    text = spacy_eng(x)
    for word in text.ents:
        entity.append(word.label_)
    return ",".join(entity)

data['entity'] = data['headline'].progress_apply(get_entities)

nltk.download('wordnet')
data['clean_headline'] = data['headline'].apply(text_cleaning)
data['sentence_length'] = data['clean_headline'].apply(lambda x: len(x.split()))
data
Output:
- Headline Length Distribution
px.histogram(data, x="sentence_length", height=700, color='is_sarcastic', title="Headline Length Distribution", marginal="box")
data.drop(data[data['sentence_length'] == 107].index, inplace = True)
Output:
data.reset_index(inplace=True, drop=True)
- Headline Length Distribution: Outliers Removed
px.histogram(data, x="sentence_length", height=700, color='is_sarcastic', title="Headline Length Distribution", marginal="box")
- Filtering: Find Sentences that Contain Numbers
data['contains_number'] = data['clean_headline'].apply(lambda x: bool(re.search(r'\d+', x)))
data
Output:
Analysis of Samples Containing Numbers as Time, Date, or Cardinal Entities
- 10 Random Samples: Date Entity
data[(data['contains_number']) & (data['sentence_length']<=5) & (data['entity']=='DATE')].sample(10)
Output:
- 10 Random Samples: Time Entity
data[(data['contains_number']) & (data['sentence_length']<=5) & (data['entity']=='TIME')].sample(10)
Output:
- 10 Random Samples: Cardinal Entity
data[(data['contains_number']) & (data['sentence_length']<=5) & (data['entity']=='CARDINAL')].sample(10)
Output:
Word Visualization: Word Clouds
sarcastic = data[data['is_sarcastic']==1]['clean_headline'].tolist()
real = data[data['is_sarcastic']==0]['clean_headline'].tolist()
- Top 50 Words: Sarcastic Headlines
wordcloud = WordCloud(max_words=50, width=600, background_color='white').generate(" ".join(sarcastic))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Output:
- Top 50 Words: Real Headlines
wordcloud = WordCloud(max_words=50, width=600, background_color='white').generate(" ".join(real))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Output:
Model Building
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Layer, Dense, Dropout, MultiHeadAttention, LayerNormalization, Input, GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
sentences = data['clean_headline']
label = data['is_sarcastic']
- Train-Validation-Test Splitting (80:10:10)
X_train, X_val, y_train, y_val = train_test_split(sentences, label, test_size=0.2, stratify=label, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, stratify=y_val, random_state=42)
max_len = 20
oov_token = '00_V'
padding_type = 'post'
trunc_type = 'post'

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1
print("Vocab Measurement: ",vocab_size)
''' OUTPUT: Vocab Measurement: 20886''''
train_sequences = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(train_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(val_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
test_sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(test_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
Transformers: Attention Is All You Need
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, heads, neurons):
        super(TransformerEncoder, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [layers.Dense(neurons, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(0.5)
        self.dropout2 = layers.Dropout(0.5)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
embed_dim = 50
heads = 2
neurons = 32
maxlen = 20
vocab_size = 20886

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerEncoder(embed_dim, heads, neurons)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = Dropout(0.35)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(0.0003), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
Output:
model_name = "model.h5"
checkpoint = ModelCheckpoint(model_name,
                             monitor="val_loss",
                             mode="min",
                             save_best_only=True,
                             verbose=1)

earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=1, verbose=1)

learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                            patience=3,
                                            verbose=1,
                                            factor=0.2,
                                            min_lr=0.00000001)
history = model.fit(X_train, y_train,
validation_data=(X_val,y_val),
epochs=25,
batch_size=32,
callbacks=[earlystopping])
Model Evaluation
plt.figure(figsize=(20,8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
Output:
plt.figure(figsize=(20,8))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
Output:
- Classification Metrics: ROC Curve
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
y_pred = model.predict(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
plt.plot(fpr,tpr,label="auc="+str(auc),lw=2)
plt.plot([0, 1], [0, 1], color="orange", lw=2, linestyle="--")
plt.legend(loc=4)
plt.show()
Output:
- Classification Metrics: Score
# binarize the predicted probabilities using a 0.85 threshold
y_pred[y_pred>=0.85] = 1
y_pred[y_pred<0.85] = 0

print(classification_report(y_test, y_pred))
Output:
- Classification Metrics: Confusion Matrix
plt.figure(figsize=(10,8))
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='.4g',cmap='viridis')
Output:
Learn more via myskill.id/course/transformer-hands-on