"""Collect random IMDb review sentences that contain a chosen word.

Run as a script, this loads the IMDb training split, picks up to 50
sentences containing the selected word, and writes them to
``reviews_<word>.txt``.
"""
import random
import re

# Compiled once: both patterns are reused for every review.
_BR_TAG = re.compile(r'<br\s*/?>', re.IGNORECASE)
_SENTENCE_END = re.compile(r'[.!?]')


def find_sentences_with_word(reviews, word, num_sentences=50):
    """Return up to ``num_sentences`` sentences that contain ``word``.

    Reviews are visited in random order (driven by the global ``random``
    state, so ``random.seed`` makes the result reproducible). Each review
    has its ``<br />`` tags replaced by spaces and is then split naively
    on ``.``, ``!`` and ``?``; the terminators are not kept.

    Args:
        reviews: Sequence of review strings. It is not modified.
        word: Word to look for. Matched literally, case-insensitively,
            and only as a whole word (``\\b`` boundaries).
        num_sentences: Maximum number of sentences to return.

    Returns:
        A list of matching sentences, stripped of surrounding whitespace.
        It is shorter than ``num_sentences`` when the reviews run out.
    """
    if num_sentences <= 0:
        return []

    # Escape the word so regex metacharacters in it are treated literally.
    word_pattern = re.compile(rf'\b{re.escape(word.lower())}\b')

    # Shuffle a private copy: the caller's sequence must stay untouched,
    # and the input may be immutable (tuple, dataset column, ...).
    shuffled_reviews = list(reviews)
    random.shuffle(shuffled_reviews)

    sentences_with_word = []
    for review in shuffled_reviews:
        # Remove <br /> tags so they do not end up inside sentences.
        clean_review = _BR_TAG.sub(' ', review)
        for sentence in _SENTENCE_END.split(clean_review):
            if word_pattern.search(sentence.lower()):
                sentences_with_word.append(sentence.strip())
                if len(sentences_with_word) >= num_sentences:
                    return sentences_with_word
    return sentences_with_word


def main(selected_word='make'):
    """Write IMDb training sentences containing ``selected_word`` to a file."""
    # Imported here so that importing this module for its helper does not
    # require the third-party ``datasets`` package or trigger a download.
    from datasets import load_dataset

    # Load the IMDb dataset and take the training set reviews.
    dataset = load_dataset('imdb')
    train_reviews = dataset['train']['text']

    # Pick up to 50 randomly chosen sentences containing the selected word.
    sentences_with_selected_word = find_sentences_with_word(
        train_reviews, selected_word
    )

    # Reviews contain non-ASCII text, so do not rely on the locale encoding.
    file_name = f"reviews_{selected_word}.txt"
    with open(file_name, 'w', encoding='utf-8') as file:
        for i, sentence in enumerate(sentences_with_selected_word, start=1):
            file.write(f"Sentence {i}:\n{sentence}\n\n")

    print(f"Sentences containing the word '{selected_word}' have been written to {file_name}")


if __name__ == "__main__":
    main()