Keyword Extraction Feature #1504

Merged
merged 4 commits on Oct 20, 2024

Changes from all commits

97 changes: 32 additions & 65 deletions NLP/dummysentence.py
@@ -1,66 +1,33 @@
-import os
-from sentence_transformers import SentenceTransformer, util
-
-MODEL_NAME = 'all-MiniLM-L6-v2'
-MODEL_FOLDER = 'model'
-
-def load_file(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
-        return [line.strip() for line in file if line.strip()]
-
-def load_or_download_model():
-    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
-    if os.path.exists(model_path):
-        print(f"Loading model from {model_path}")
-        return SentenceTransformer(model_path)
-    else:
-        print(f"Downloading model {MODEL_NAME}")
-        model = SentenceTransformer(MODEL_NAME)
-        os.makedirs(MODEL_FOLDER, exist_ok=True)
-        model.save(model_path)
-        print(f"Model saved to {model_path}")
-        return model
-
-def find_similar_sentences(query, file_path, top_n=5):
-    # Load the pre-trained model
-    model = load_or_download_model()
-
-    # Load and encode the sentences from the file
-    sentences = load_file(file_path)
-    sentence_embeddings = model.encode(sentences)
-
-    # Encode the query
-    query_embedding = model.encode([query])
-
-    # Calculate cosine similarities
-    cosine_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
-
-    # Get top N results
-    top_results = sorted(zip(sentences, cosine_scores), key=lambda x: x[1], reverse=True)[:top_n]
-
-    return top_results
-
-def main():
-    print("Welcome to the Sentence Similarity Search Tool!")
-
-    # Get user input for query
-    query = input("Enter your query: ")
+import ollama as client
+
+# Function to get response from Ollama API with system prompt
+def get_ollama_response(sentence_number):
+    system_prompt = "You are a bot and speak in one line. Keep your responses short and to the point."
+    stream = client.chat(
+        model="llama3.2",
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"Generate a unique sentence randomly for sentence number {sentence_number}."}
+        ],
+        stream=True
+    )

-    # Get user input for file path
-    file_name = input("Enter the name of your text file (without .txt extension): ")
-    file_path = f"{file_name}.txt"
-
-    try:
-        results = find_similar_sentences(query, file_path)
-
-        print(f"\nTop 5 similar sentences for query: '{query}'\n")
-        for sentence, score in results:
-            print(f"Similarity: {score:.4f}")
-            print(f"Sentence: {sentence}\n")
-    except FileNotFoundError:
-        print(f"Error: The file '{file_path}' was not found. Please check the file name and try again.")
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
-
-if __name__ == "__main__":
-    main()
+    response = ''
+    for chunk in stream:
+        response += chunk['message']['content']
+    return response.strip()  # Strip any leading/trailing spaces
+
+# Open the file in write mode
+with open("generated_sentences.txt", "w") as file:
+    # Loop to generate 100 sentences one by one
+    for i in range(100):
+        # Get the sentence using the function
+        sentence = get_ollama_response(i + 1)
+
+        # Write the sentence to the file on a new line
+        file.write(sentence + "\n")
+
+        # Print the sentence to the console
+        print(f"Sentence {i+1}: {sentence}")
+
+print("File 'generated_sentences.txt' created with 100 sentences, each on a new line.")
122 changes: 122 additions & 0 deletions NLP/textsummary.py
@@ -0,0 +1,122 @@
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

MODEL_NAME = 'all-MiniLM-L6-v2'
MODEL_FOLDER = 'model'
NLTK_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'nltk_data')

def load_or_download_model():
    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
    if os.path.exists(model_path):
        print(f"Loading model from {model_path}")
        return SentenceTransformer(model_path)
    else:
        print(f"Downloading model {MODEL_NAME}")
        model = SentenceTransformer(MODEL_NAME)
        os.makedirs(MODEL_FOLDER, exist_ok=True)
        model.save(model_path)
        print(f"Model saved to {model_path}")
        return model

def download_nltk_resources():
    nltk.data.path.append(NLTK_DATA_FOLDER)
    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)

    resources = [('punkt', 'tokenizers'), ('stopwords', 'corpora')]
    for resource, folder in resources:
        try:
            nltk.data.find(f'{folder}/{resource}')
            print(f"{resource} is already available.")
        except LookupError:
            print(f"Downloading {resource}...")
            nltk.download(resource, download_dir=NLTK_DATA_FOLDER, quiet=True)

def extract_keywords(text, model, top_n=10):
    # Tokenize the text
    words = word_tokenize(text.lower())

    # Remove stopwords and non-alphanumeric tokens
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]

    # Count word frequencies
    word_freq = Counter(filtered_words)

    # Get unique words
    unique_words = list(set(filtered_words))

    # Get word embeddings
    word_embeddings = model.encode(unique_words)

    # Calculate importance scores
    importance_scores = np.mean(word_embeddings, axis=1)

    # Combine frequency and importance
    combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)]

    # Sort by combined score and get top N
    top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n]

    return [word for word, _ in top_keywords]

def summarize_text(text, model, num_sentences=3):
    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Encode sentences
    sentence_embeddings = model.encode(sentences)

    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(sentence_embeddings)

    # Calculate sentence scores
    sentence_scores = np.sum(similarity_matrix, axis=1)

    # Get top sentences
    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

    return ' '.join(top_sentences)

def main():
    # Ensure NLTK resources are downloaded
    download_nltk_resources()

    # Load or download the model
    model = load_or_download_model()

    # Read input file
    input_file = 'input.txt'
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
        return

    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            text = file.read()
    except Exception as e:
        print(f"Error reading {input_file}: {str(e)}")
        return

    # Extract keywords
    keywords = extract_keywords(text, model)

    # Generate summary
    summary = summarize_text(text, model)

    # Print results
    print("Keywords:")
    for i, word in enumerate(keywords, 1):
        print(f"{i}. {word}")

    print("\nSummary:")
    print(summary)

if __name__ == "__main__":
    main()
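
To try the new module without wiring up input.txt, the functions can also be called directly. The snippet below is a usage sketch, not part of the PR; it assumes it is run from the NLP/ directory so that textsummary is importable, and that the sentence-transformers model and NLTK data can be fetched on first use.

# Usage sketch, not part of this PR: exercise the new helpers directly.
# Assumes this runs from the NLP/ directory so textsummary.py is importable.
from textsummary import download_nltk_resources, load_or_download_model, extract_keywords, summarize_text

download_nltk_resources()
model = load_or_download_model()

text = (
    "Sentence transformers map each sentence to a dense vector. "
    "Cosine similarity between those vectors measures how central a sentence is to the text. "
    "The most central sentences are then joined into an extractive summary."
)

print("Keywords:", extract_keywords(text, model, top_n=5))
print("Summary:", summarize_text(text, model, num_sentences=2))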