There are three points we will cover this time, namely:
- EDA and Text Processing
- Word Cloud Visualization
- Model Building and Evaluation
Steps in EDA and Text Processing
- Checking for missing values
- Finding the class distribution (balance/imbalance)
- Removing special characters
- Distribution of headline lengths
- Distribution of headline lengths: removing outliers
- Filtering: finding sentences that contain numbers:
- Analyzing samples whose numbers are time, date, or cardinal entities.
- 10 random samples: date entity
- 10 random samples: time entity
- 10 random samples: cardinal entity
The word clouds for sarcastic and genuine headlines differ. The larger a word appears in the word cloud, the more often that word occurs in the headlines (the higher its frequency).
Text Classification: Using a Transformer Encoder Block
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.express as px
from plotly.offline import init_notebook_mode
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
import spacy

tqdm.pandas()
spacy_eng = spacy.load("en_core_web_sm")
nltk.download('stopwords')
lemm = WordNetLemmatizer()
init_notebook_mode(connected=True)
sns.set_style("darkgrid")
plt.rcParams['figure.figsize'] = (20,8)
plt.rcParams['font.size'] = 18
data1 = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
data2 = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)

data1 = data1[['headline','is_sarcastic']]
data2 = data2[['headline','is_sarcastic']]

data = pd.concat([data1, data2])
data.reset_index(drop=True, inplace=True)
data
Output:
EDA and Text Preprocessing
- Checking for Missing Values
data.isnull().sum()

'''
OUTPUT
headline 0
is_sarcastic 0
dtype: int64
'''
- Finding the Class Balance/Imbalance
px.bar(data.groupby('is_sarcastic').count().reset_index(), x='headline', title='Count of Sarcastic and Genuine Headlines')
data['is_sarcastic'].value_counts()
- Special Characters Removal
stop_words = stopwords.words('english')
stop_words.remove('not')

def text_cleaning(x):
    # collapse whitespace/newlines, keep only alphanumerics, lowercase, lemmatize, drop stopwords
    headline = re.sub(r'\s+\n+', ' ', x)
    headline = re.sub('[^a-zA-Z0-9]', ' ', headline)
    headline = headline.lower()
    headline = headline.split()
    headline = [lemm.lemmatize(word, "v") for word in headline if not word in stop_words]
    headline = ' '.join(headline)
    return headline
def get_entities(x):
    # collect the named-entity labels spaCy finds in a headline
    entity = []
    text = spacy_eng(x)
    for word in text.ents:
        entity.append(word.label_)
    return ",".join(entity)

data['entity'] = data['headline'].progress_apply(get_entities)
nltk.download('wordnet')
data['clean_headline'] = data['headline'].apply(text_cleaning)
data['sentence_length'] = data['clean_headline'].apply(lambda x: len(x.split()))
data
Output:
- Headlines Length Distribution
px.histogram(data, x="sentence_length", height=700, color='is_sarcastic', title="Headlines Length Distribution", marginal="box")
data.drop(data[data['sentence_length'] == 107].index, inplace=True)
Output:
data.reset_index(inplace=True, drop=True)
- Headlines Length Distribution: Outliers Removed
px.histogram(data, x="sentence_length", height=700, color='is_sarcastic', title="Headlines Length Distribution", marginal="box")
- Filtering: Find Sentences that Contain Numbers
data['contains_number'] = data['clean_headline'].apply(lambda x: bool(re.search(r'\d+', x)))
data
Output:
Analysis of Samples Containing Numbers of Time, Date, or Cardinal Entity
- 10 Random Samples: Date Entity
data[(data['contains_number']) & (data['sentence_length']<=5) & (data['entity']=='DATE')].sample(10)
Output:
- 10 Random Samples: Time Entity
data[(data['contains_number']) & (data['sentence_length']<=5) & (data['entity']=='TIME')].sample(10)
Output:
- 10 Random Samples: Cardinal Entity
data[(data['contains_number']) & (data['sentence_length']<=5) & (data['entity']=='CARDINAL')].sample(10)
Output:
Word Visualization: Word Clouds
sarcastic = data[data['is_sarcastic']==1]['clean_headline'].tolist()
genuine = data[data['is_sarcastic']==0]['clean_headline'].tolist()
- Top 50 Words: Sarcastic Headlines
wordcloud = WordCloud(max_words=50, width=600, background_color='white').generate(" ".join(sarcastic))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Output:
- Top 50 Words: Genuine Headlines
wordcloud = WordCloud(max_words=50, width=600, background_color='white').generate(" ".join(genuine))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Output:
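As noted in the introduction, the larger a word is drawn in the word cloud, the more frequent it is. As a minimal sanity check (a sketch using the sarcastic and genuine lists built above), the top tokens can also be counted directly:
from collections import Counter

# Count token frequencies in the cleaned headlines; the largest words in each
# word cloud should correspond to the most common tokens printed here.
sarcastic_counts = Counter(" ".join(sarcastic).split())
genuine_counts = Counter(" ".join(genuine).split())

print(sarcastic_counts.most_common(10))
print(genuine_counts.most_common(10))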
Model Building
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Layer, Dense, Dropout, MultiHeadAttention, LayerNormalization, Input, GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
sentences = data['clean_headline']
label = data['is_sarcastic']
- Train-Validation-Test Splitting (80:10:10)
X_train, X_val, y_train, y_val = train_test_split(sentences, label, test_size=0.2, stratify=label, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, stratify=y_val, random_state=42)
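As a quick sanity check (a sketch using the splits above), the size and sarcastic ratio of each subset can be printed to confirm the 80:10:10 stratified split:
# Verify the 80:10:10 stratified split: each subset should keep roughly
# the same proportion of sarcastic headlines.
for name, X, y in [("train", X_train, y_train), ("val", X_val, y_val), ("test", X_test, y_test)]:
    print(name, len(X), round(y.mean(), 3))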
max_len = 20
oov_token = '00_V'
padding_type = 'post'
trunc_type = 'post'

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1
print("Vocab Size: ", vocab_size)
''' OUTPUT: Vocab Size: 20886 '''
train_sequences = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(train_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(val_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
test_sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(test_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
Transformers: Attention Is All You Need
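The encoder block below follows the paper's recipe: multi-head self-attention followed by a position-wise feed-forward network, each wrapped with dropout, a residual connection, and layer normalization. The attention itself is the scaled dot-product form:
Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V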
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, heads, neurons):
        super(TransformerEncoder, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [layers.Dense(neurons, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(0.5)
        self.dropout2 = layers.Dropout(0.5)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
embed_dim = 50
heads = 2
neurons = 32
maxlen = 20
vocab_size = 20886

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerEncoder(embed_dim, heads, neurons)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = Dropout(0.35)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(0.0003), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
Output:
model_name = "model.h5"

checkpoint = ModelCheckpoint(model_name,
                             monitor="val_loss",
                             mode="min",
                             save_best_only=True,
                             verbose=1)

earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=1, verbose=1)

learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                            patience=3,
                                            verbose=1,
                                            factor=0.2,
                                            min_lr=0.00000001)

history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=25,
                    batch_size=32,
                    callbacks=[earlystopping])
Model Evaluation
plt.figure(figsize=(20,8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
Output:
plt.figure(figsize=(20,8))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
Output:
- Classification Metrics: ROC Curve
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
y_pred = model.predict(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
plt.plot(fpr, tpr, label="auc="+str(auc), lw=2)
plt.plot([0, 1], [0, 1], color="orange", lw=2, linestyle="--")
plt.legend(loc=4)
plt.show()
Output:
- Classification Metrics: Score
y_pred[y_pred >= 0.85] = 1
y_pred[y_pred < 0.85] = 0

print(classification_report(y_test, y_pred))
Output:
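The 0.85 cut-off above is a manual choice. As an alternative sketch (assuming model, X_test, and y_test from above), a threshold can also be derived from the ROC curve, for example by maximizing Youden's J statistic (TPR - FPR):
# Pick the probability threshold that maximizes Youden's J = TPR - FPR.
y_prob = model.predict(X_test).ravel()
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
best_threshold = thresholds[np.argmax(tpr - fpr)]
print("Threshold maximizing Youden's J:", best_threshold)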
- Classification Metrics: Confusion Matrix
plt.figure(figsize=(10,8))
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='.4g',cmap='viridis')
Output:
Learn More via myskill.id/course/transformer-hands-on