# Program 8
#
# Use text mining techniques to analyse a collection of news articles. Identify the most
# frequent terms and perform topic modelling using Latent Dirichlet Allocation (LDA) to
# find hidden topics within the articles.
#
# NOTE: in Jupyter/Colab, install dependencies first with:
#   !pip install gensim pyLDAvis
# (`!pip` is IPython shell magic and is a syntax error in plain Python,
#  so it is kept here as a comment.)

# Step 1: Import Libraries
import os
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from collections import Counter
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.utils import simple_preprocess
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import nltk

# Step 2: Download Stopwords
nltk.download('stopwords')

# Step 3: Load Dataset (subset for faster testing)
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
texts = newsgroups.data[:2000]  # limit docs for faster run

# Step 4: Preprocessing Function (fast)
# Build the stop-word set ONCE. The original called stopwords.words('english')
# (which rebuilds a list from the corpus files) for EVERY token, making
# preprocessing O(tokens * stopwords); a prebuilt set gives O(1) membership tests.
STOP_WORDS = set(stopwords.words('english'))


def preprocess(text):
    """Tokenize and lowercase *text* with gensim, dropping English stop words.

    Returns a list of token strings.
    """
    return [word for word in simple_preprocess(text) if word not in STOP_WORDS]


# Step 5: Apply Preprocessing
processed_texts = [preprocess(text) for text in texts]

# Step 6: Frequent Terms Plot
all_tokens = [token for sublist in processed_texts for token in sublist]
freq_dist = Counter(all_tokens)
top_terms = freq_dist.most_common(20)
terms, freqs = zip(*top_terms)

plt.figure(figsize=(10, 6))
plt.barh(terms[::-1], freqs[::-1])  # reversed so the most frequent term is on top
plt.title("Top 20 Most Frequent Terms (20 Newsgroups)")
plt.xlabel("Frequency")
plt.show()

# Step 7: Prepare Data for LDA
dictionary = corpora.Dictionary(processed_texts)
dictionary.filter_extremes(no_below=5, no_above=0.5)  # shrink vocab for speed
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Step 8: Train LDA Model (multicore, no logging)
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=5,                             # reduced passes for faster testing
    workers=min(4, os.cpu_count() or 1),  # os.cpu_count() may return None -> guard against TypeError
    random_state=42
)

# Step 9: Print Topics
print("\nIdentified Topics:")
# Step 9 (cont.): print each discovered topic as "Topic N: <weighted terms>".
# print_topics yields (topic_id, formatted_terms_string) pairs; ids start at 0,
# so we offset by one for human-friendly numbering.
for topic_id, topic_terms in lda_model.print_topics(num_words=5):
    print(f"Topic {topic_id + 1}: {topic_terms}")

# Step 10: Visualize Topics (in Jupyter/Colab)
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)