In [None]:
import re
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.text import Tokenizer
import os
import tarfile
import urllib.request
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the raw IMDb archive and where it is stored/unpacked locally
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_path = './aclImdb_v1.tar.gz'
extract_path = './aclImdb'
# Download the archive only when it is not already on disk
if not os.path.exists(download_path):
    urllib.request.urlretrieve(url, download_path)
# Unpack the archive only when the target directory does not exist yet
if not os.path.exists(extract_path):
    with tarfile.open(download_path, 'r:gz') as tar:
        # Must be indented under the `with` so the archive is still open
        tar.extractall(extract_path)
# Hyperparameters: vocabulary size, review length in tokens, embedding width
max_words, max_len, embedding_dim = 10000, 200, 128
# Fetch the integer-encoded reviews, restricted to a max_words vocabulary
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)
# Give both splits the same fixed sequence length
x_train, x_test = [
    pad_sequences(split, maxlen=max_len) for split in (x_train, x_test)
]
# Assemble the network: embedding -> spatial dropout -> LSTM -> sigmoid unit
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
# Binary cross-entropy pairs with the single sigmoid output
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Fit on the training split; the test split is passed as validation data
batch_size, epochs = 64, 5
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)
# Score the model on the test split
# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype('int32')
# Summarise the predictions: confusion matrix plus overall accuracy
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
# Render the confusion matrix as an annotated heatmap
class_names = ['Negative', 'Positive']
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
# Notebook shell escape (IPython syntax, not plain Python): installs the
# wordcloud package with pip.
!pip install wordcloud
# Location of the raw IMDb archive and where it is stored/unpacked locally
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_path = './aclImdb_v1.tar.gz'
extract_path = './aclImdb'
# Download the archive only when it is not already on disk
if not os.path.exists(download_path):
    urllib.request.urlretrieve(url, download_path)
# Unpack the archive only when the target directory does not exist yet
if not os.path.exists(extract_path):
    with tarfile.open(download_path, 'r:gz') as tar:
        # Must be indented under the `with` so the archive is still open
        tar.extractall(extract_path)
# Hyperparameters: vocabulary size, review length in tokens, embedding width
max_words, max_len, embedding_dim = 10000, 200, 128
# Fetch the integer-encoded reviews, restricted to a max_words vocabulary
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)
# Invert the word -> index table so encoded reviews can be turned into text
word_index = imdb.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}
# Each encoded id is looked up at (id - 3); ids with no entry become '?'
x_train_text = [
    ' '.join(reverse_word_index.get(token - 3, '?') for token in review)
    for review in x_train
]
x_test_text = [
    ' '.join(reverse_word_index.get(token - 3, '?') for token in review)
    for review in x_test
]
# Text Preprocessing


# Built once at import time instead of on every preprocess_text() call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
# NOTE(review): 'not' is in this list, so negations such as "not good" lose
# their negation before reaching the model -- confirm this is intended.
_STOP_WORDS = frozenset({
    'the', 'and', 'of', 'to', 'is', 'in', 'it', 'that', 'was', 'for', 'on',
    'with', 'as', 'at', 'by', 'but', 'not',
})


def preprocess_text(text):
    """Normalise a review string for tokenization.

    Steps, in order: lowercase the text, strip HTML tags, drop every
    character that is neither an ASCII letter nor whitespace, and remove a
    small fixed list of stop words.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The cleaned words joined by single spaces ('' if nothing remains).
    """
    text = text.lower()
    # Substitute a space (not '') so that words separated only by a tag,
    # e.g. "great<br />movie", do not fuse into a single token.
    text = _HTML_TAG_RE.sub(' ', text)
    # Punctuation and digits are deleted outright ("it's" -> "its").
    text = _NON_LETTER_RE.sub('', text)
    # split() with no argument also collapses any repeated whitespace.
    return ' '.join(word for word in text.split() if word not in _STOP_WORDS)


# Clean every decoded review with preprocess_text
x_train_text = list(map(preprocess_text, x_train_text))
x_test_text = list(map(preprocess_text, x_test_text))
# Fit the tokenizer on the training text only, then encode both splits
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train_text)
x_train_seq, x_test_seq = [
    tokenizer.texts_to_sequences(texts)
    for texts in (x_train_text, x_test_text)
]
# Give both splits the same fixed sequence length
x_train, x_test = [
    pad_sequences(seqs, maxlen=max_len) for seqs in (x_train_seq, x_test_seq)
]
# Assemble the network: embedding -> spatial dropout -> LSTM -> sigmoid unit
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
# Binary cross-entropy pairs with the single sigmoid output
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Fit on the training split; the test split is passed as validation data
batch_size, epochs = 64, 5
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)
# Concatenate all training reviews into one string for the word cloud
all_text = ' '.join(x_train_text)
# Configure the cloud first, then generate it from the combined text
wordcloud = WordCloud(
    width=800,
    height=400,
    random_state=21,
    max_font_size=110,
)
wordcloud = wordcloud.generate(all_text)
# Show the cloud as an image with the axes hidden
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Score the model on the test split
# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype('int32')
# Summarise the predictions: confusion matrix plus overall accuracy
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
# Render the confusion matrix as an annotated heatmap
class_names = ['Negative', 'Positive']
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
