from datasets import load_dataset
import re
import random
# Fetch the IMDb dataset through the `datasets` library
dataset = load_dataset("imdb")
# Keep only the text column of the training split
train_reviews = dataset["train"]["text"]
# Function to find sentences containing a selected word
def find_sentences_with_word(reviews, word: str, num_sentences: int = 50) -> list:
    """Collect up to ``num_sentences`` sentences that contain ``word``.

    Reviews are visited in a random order; within one review, sentences are
    examined in the order they appear. Matching is case-insensitive and
    whole-word, so searching for "make" does not match "remake".

    Args:
        reviews: Iterable of review strings. It is copied before shuffling,
            so the caller's object is never reordered.
        word: Word to look for. It is matched literally; regex
            metacharacters in it have no special meaning.
        num_sentences: Maximum number of sentences to return.

    Returns:
        A list of matching sentences with surrounding whitespace stripped and
        their original letter case preserved. The list is shorter than
        ``num_sentences`` when fewer matches exist.
    """
    # Escape the word so characters such as "|" or "(" are matched literally
    # instead of altering (or breaking) the pattern.
    word_pattern = re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)
    # NOTE(review): the original cleanup pattern was garbled in this file
    # (a string literal broken across two lines). It is restored here as an
    # HTML line-break matcher -- confirm this is what was intended.
    line_break_pattern = re.compile(r'<br\s*/?>', re.IGNORECASE)
    sentence_end_pattern = re.compile(r'[.!?]')

    # Shuffle a private copy: random.shuffle works in place, which would
    # reorder the caller's data and fails on inputs without item assignment.
    shuffled_reviews = list(reviews)
    random.shuffle(shuffled_reviews)

    sentences_with_word = []
    for review in shuffled_reviews:
        clean_review = line_break_pattern.sub(' ', review)
        for sentence in sentence_end_pattern.split(clean_review):
            # Checked before appending so a non-positive limit yields [].
            if len(sentences_with_word) >= num_sentences:
                return sentences_with_word
            if word_pattern.search(sentence):
                sentences_with_word.append(sentence.strip())
    return sentences_with_word
# Word whose occurrences are sampled from the reviews
selected_word = 'make'
# Collect up to 50 sentences (the function's default limit) containing the
# selected word; reviews are visited in random order, so the selection
# differs from run to run
sentences_with_selected_word = find_sentences_with_word(train_reviews, selected_word)
# Write the sentences to a text file, numbered from 1 and separated by blank
# lines. UTF-8 is requested explicitly so that non-ASCII characters in a
# sentence do not depend on the platform's default encoding.
file_name = f"reviews_{selected_word}.txt"
with open(file_name, 'w', encoding='utf-8') as file:
    for i, sentence in enumerate(sentences_with_selected_word, start=1):
        file.write(f"Sentence {i}:\n{sentence}\n\n")
print(f"Sentences containing the word '{selected_word}' have been written to {file_name}")