"""
NLP Query Enhancer
Advanced query processing using spaCy and NLTK for better search results
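
Typical usage (sketch):
    enhancer = NLPQueryEnhancer()
    enhanced = await enhancer.enhance_query("AI meal planner", "busy parents")
    news_queries = enhancer.get_optimized_queries_for_platform(enhanced, "news")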
"""
import spacy
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import Any, Dict, List
import re
import logging
import asyncio
logger = logging.getLogger(__name__)
class NLPQueryEnhancer:
"""Enhanced query processing using advanced NLP techniques"""
def __init__(self):
"""Initialize NLP models and resources"""
self.nlp = None
self.sentence_model = None
self.lemmatizer = None
self.stop_words = None
self._initialized = False
        # Industry-specific term mappings, used by _add_industry_terms()
        # to expand detected concepts with domain vocabulary
self.industry_ontology = {
'technology': {
'ai': ['artificial intelligence', 'machine learning', 'deep learning', 'neural networks'],
'app': ['application', 'software', 'mobile app', 'web app', 'platform'],
'api': ['application programming interface', 'web service', 'endpoint'],
'saas': ['software as a service', 'cloud software', 'subscription software'],
'iot': ['internet of things', 'connected devices', 'smart devices'],
'blockchain': ['distributed ledger', 'cryptocurrency', 'smart contracts']
},
'business': {
'startup': ['new business', 'entrepreneur', 'venture', 'company'],
'revenue': ['income', 'earnings', 'sales', 'profit'],
'customer': ['client', 'user', 'consumer', 'buyer'],
'market': ['industry', 'sector', 'segment', 'niche'],
'competition': ['competitor', 'rival', 'alternative', 'substitute']
},
'health': {
'fitness': ['exercise', 'workout', 'training', 'physical activity'],
'nutrition': ['diet', 'food', 'eating', 'meal planning'],
'wellness': ['health', 'wellbeing', 'lifestyle', 'self-care'],
'medical': ['healthcare', 'clinical', 'treatment', 'diagnosis']
},
'finance': {
'fintech': ['financial technology', 'digital banking', 'payment systems'],
'investment': ['portfolio', 'trading', 'stocks', 'assets'],
'banking': ['financial services', 'credit', 'loans', 'deposits'],
'insurance': ['coverage', 'policy', 'claims', 'risk management']
}
}
async def initialize(self):
"""Initialize NLP models asynchronously"""
if self._initialized:
return
try:
logger.info("Initializing NLP models...")
# Download required NLTK data
await self._download_nltk_data()
# Initialize spaCy model
try:
self.nlp = spacy.load("en_core_web_sm")
except OSError:
logger.warning("spaCy model 'en_core_web_sm' not found. Using basic tokenization.")
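                # Install it with: python -m spacy download en_core_web_sm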
self.nlp = None
# Initialize sentence transformer
try:
self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
logger.warning(f"Could not load sentence transformer: {e}")
self.sentence_model = None
# Initialize NLTK components
self.lemmatizer = WordNetLemmatizer()
self.stop_words = set(stopwords.words('english'))
self._initialized = True
logger.info("NLP models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing NLP models: {e}")
            # Set basic fallbacks so the enhancer can still run in degraded mode
            try:
                self.lemmatizer = WordNetLemmatizer()
            except Exception:
                self.lemmatizer = None
            self.stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    async def _download_nltk_data(self):
        """Download required NLTK data if it is not already present"""
        # nltk.data.find() needs each resource's category prefix, which
        # differs per resource (only 'punkt' lives under tokenizers/)
        required_data = {
            'punkt': 'tokenizers/punkt',
            'stopwords': 'corpora/stopwords',
            'wordnet': 'corpora/wordnet',
            'omw-1.4': 'corpora/omw-1.4',
            'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
        }
        for data, path in required_data.items():
            try:
                nltk.data.find(path)
            except LookupError:
                try:
                    nltk.download(data, quiet=True)
                except Exception as e:
                    logger.warning(f"Could not download NLTK data '{data}': {e}")
async def enhance_query(self, feature_description: str, target_market: str = "") -> Dict[str, Any]:
"""
Enhance a feature description into multiple optimized search queries
Args:
feature_description: Original feature description
target_market: Target market description
Returns:
Enhanced query data with multiple search strategies
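        Example (illustrative; exact queries depend on the loaded models):
            enhanced = await enhancer.enhance_query(
                "AI-powered voice ordering system", "restaurants")
            enhanced['enhanced_queries']['core_concepts']
            # e.g. ['voice ordering', 'system', 'restaurant']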
"""
await self.initialize()
try:
# Extract key concepts using NLP
concepts = await self._extract_key_concepts(feature_description)
# Generate semantic variations
semantic_queries = await self._generate_semantic_queries(feature_description)
# Expand with synonyms and related terms
expanded_queries = await self._expand_with_synonyms(concepts)
# Add industry-specific terms
industry_queries = await self._add_industry_terms(concepts, target_market)
# Generate market-specific queries
market_queries = await self._generate_market_queries(feature_description, target_market)
# Combine and rank queries
all_queries = {
'core_concepts': concepts['core_terms'],
'semantic_variations': semantic_queries,
'synonym_expansions': expanded_queries,
'industry_specific': industry_queries,
'market_focused': market_queries,
'combined_queries': self._combine_queries(concepts, target_market)
}
return {
'original_description': feature_description,
'target_market': target_market,
'extracted_concepts': concepts,
'enhanced_queries': all_queries,
'query_metadata': {
'total_queries': sum(len(v) if isinstance(v, list) else 1 for v in all_queries.values()),
'confidence_score': self._calculate_confidence(concepts),
'processing_method': 'advanced_nlp' if self.nlp else 'basic_processing'
}
}
except Exception as e:
logger.error(f"Error enhancing query: {e}")
# Fallback to basic processing
return await self._basic_query_enhancement(feature_description, target_market)
async def _extract_key_concepts(self, text: str) -> Dict[str, Any]:
"""Extract key concepts using spaCy NLP"""
concepts = {
'core_terms': [],
'entities': [],
'technologies': [],
'business_terms': [],
'action_verbs': [],
'descriptors': []
}
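        # Illustrative: "AI-powered voice ordering for restaurants" would
        # typically yield core_terms like ['voice ordering', 'restaurant']
        # (exact output depends on the spaCy model in use)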
if self.nlp:
# Use spaCy for advanced processing
doc = self.nlp(text)
            # Extract named entities ('TECHNOLOGY' is not a label in
            # en_core_web_sm, so ORG and PRODUCT cover the useful cases)
            for ent in doc.ents:
                if ent.label_ in ['ORG', 'PRODUCT']:
                    concepts['entities'].append(ent.text.lower())
# Extract key terms by POS tags
for token in doc:
if token.is_stop or token.is_punct or len(token.text) < 3:
continue
lemma = token.lemma_.lower()
if token.pos_ == 'NOUN' and token.dep_ in ['nsubj', 'dobj', 'pobj']:
concepts['core_terms'].append(lemma)
                elif token.pos_ == 'VERB' and token.dep_ not in ('aux', 'auxpass'):  # Skip auxiliary verbs
                    concepts['action_verbs'].append(lemma)
elif token.pos_ == 'ADJ':
concepts['descriptors'].append(lemma)
# Extract noun phrases
for chunk in doc.noun_chunks:
if len(chunk.text.split()) <= 3: # Keep phrases short
concepts['core_terms'].append(chunk.text.lower())
else:
# Fallback to basic processing
words = word_tokenize(text.lower())
if self.lemmatizer:
words = [self.lemmatizer.lemmatize(word) for word in words if word.isalpha()]
# Filter out stop words
words = [word for word in words if word not in self.stop_words and len(word) > 2]
concepts['core_terms'] = words[:10] # Limit to top 10
# Categorize terms
concepts = self._categorize_terms(concepts)
# Remove duplicates and sort by importance
for key in concepts:
if isinstance(concepts[key], list):
concepts[key] = list(dict.fromkeys(concepts[key]))[:5] # Top 5 per category
return concepts
def _categorize_terms(self, concepts: Dict[str, Any]) -> Dict[str, Any]:
"""Categorize terms into technology, business, etc."""
tech_keywords = {'ai', 'app', 'software', 'platform', 'api', 'system', 'tool', 'technology', 'digital', 'online', 'mobile', 'web'}
business_keywords = {'market', 'customer', 'business', 'service', 'product', 'solution', 'company', 'industry', 'revenue', 'profit'}
all_terms = concepts['core_terms'] + concepts['entities']
for term in all_terms:
term_lower = term.lower()
if any(tech in term_lower for tech in tech_keywords):
concepts['technologies'].append(term)
elif any(biz in term_lower for biz in business_keywords):
concepts['business_terms'].append(term)
return concepts
async def _generate_semantic_queries(self, text: str) -> List[str]:
"""Generate semantically similar queries using sentence transformers"""
if not self.sentence_model:
return []
        try:
            # Create variations by replacing key terms with WordNet synonyms
            variations = []
            words = text.split()
            for i, word in enumerate(words):
                if len(word) > 4 and word.lower() not in self.stop_words:
                    synonyms = self._get_wordnet_synonyms(word)
                    for synonym in synonyms[:2]:  # Limit to 2 synonyms per word
                        new_text = words.copy()
                        new_text[i] = synonym
                        variations.append(' '.join(new_text))
            if not variations:
                return []
            # Rank the variations by cosine similarity to the original text so
            # the closest paraphrases are returned first
            base_embedding = self.sentence_model.encode([text])[0]
            variation_embeddings = self.sentence_model.encode(variations)
            similarities = variation_embeddings @ base_embedding / (
                np.linalg.norm(variation_embeddings, axis=1)
                * np.linalg.norm(base_embedding)
            )
            ranked = [v for _, v in sorted(
                zip(similarities.tolist(), variations), reverse=True)]
            return ranked[:5]  # Return the top 5 variations
except Exception as e:
logger.warning(f"Error generating semantic queries: {e}")
return []
async def _expand_with_synonyms(self, concepts: Dict[str, Any]) -> List[str]:
"""Expand core terms with synonyms using WordNet"""
expanded = []
for term in concepts['core_terms'][:5]: # Limit to top 5 terms
synonyms = self._get_wordnet_synonyms(term)
if synonyms:
# Create queries with synonyms
expanded.extend(synonyms[:3]) # Top 3 synonyms per term
return list(set(expanded)) # Remove duplicates
def _get_wordnet_synonyms(self, word: str) -> List[str]:
"""Get synonyms from WordNet"""
synonyms = set()
try:
for syn in wordnet.synsets(word):
for lemma in syn.lemmas():
synonym = lemma.name().replace('_', ' ')
if synonym.lower() != word.lower() and len(synonym) > 2:
synonyms.add(synonym)
except Exception as e:
logger.debug(f"Error getting synonyms for '{word}': {e}")
return list(synonyms)[:5] # Limit to 5 synonyms
async def _add_industry_terms(self, concepts: Dict[str, Any], target_market: str) -> List[str]:
"""Add industry-specific terminology"""
industry_queries = []
# Detect industry from concepts and target market
detected_industries = self._detect_industries(concepts, target_market)
for industry in detected_industries:
if industry in self.industry_ontology:
ontology = self.industry_ontology[industry]
# Match concepts to industry terms
for concept in concepts['core_terms'][:3]:
for term, expansions in ontology.items():
if concept.lower() in term or term in concept.lower():
industry_queries.extend(expansions[:2]) # Add 2 expansions
return list(set(industry_queries))[:8] # Limit to 8 industry terms
def _detect_industries(self, concepts: Dict[str, Any], target_market: str) -> List[str]:
"""Detect relevant industries from concepts and target market"""
industries = []
all_text = ' '.join(concepts['core_terms'] + [target_market]).lower()
# Simple keyword matching for industry detection
industry_keywords = {
'technology': ['app', 'software', 'ai', 'tech', 'digital', 'platform', 'api'],
'health': ['fitness', 'health', 'medical', 'wellness', 'nutrition', 'exercise'],
'business': ['business', 'startup', 'company', 'market', 'customer'],
'finance': ['finance', 'money', 'payment', 'banking', 'investment']
}
for industry, keywords in industry_keywords.items():
if any(keyword in all_text for keyword in keywords):
industries.append(industry)
return industries[:2] # Limit to 2 industries
async def _generate_market_queries(self, feature_description: str, target_market: str) -> List[str]:
"""Generate market-focused search queries"""
if not target_market:
return []
market_queries = []
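        # Illustrative: feature "voice ordering system" with market
        # "small restaurants" yields combos like "voice restaurants" and
        # patterns like "small restaurants software"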
# Extract key terms from feature description
feature_words = [word for word in feature_description.split()
if len(word) > 3 and word.lower() not in self.stop_words][:3]
# Extract market segments
market_words = [word for word in target_market.split()
if len(word) > 3 and word.lower() not in self.stop_words][:3]
# Combine feature + market
for feature_word in feature_words:
for market_word in market_words:
market_queries.append(f"{feature_word} {market_word}")
market_queries.append(f"{market_word} {feature_word}")
# Add market-specific patterns
market_patterns = [
f"{target_market} solutions",
f"{target_market} tools",
f"{target_market} software",
f"{target_market} apps",
f"{target_market} technology"
]
market_queries.extend(market_patterns)
return list(set(market_queries))[:6] # Limit to 6 market queries
def _combine_queries(self, concepts: Dict[str, Any], target_market: str) -> List[str]:
"""Combine different concepts into comprehensive, contextually relevant queries"""
combined = []
core_terms = concepts['core_terms'][:3]
technologies = concepts['technologies'][:2]
business_terms = concepts['business_terms'][:2]
# Create contextual phrase combinations (preserve meaning)
if len(core_terms) >= 2:
# Keep related terms together
main_concept = ' '.join(core_terms[:2])
combined.append(main_concept)
# Add third term only if it's contextually related
if len(core_terms) >= 3:
extended_concept = ' '.join(core_terms[:3])
# Only add if it makes semantic sense
if len(extended_concept.split()) <= 4: # Avoid overly long phrases
combined.append(extended_concept)
# Combine technology + business terms meaningfully
for tech in technologies:
for biz in business_terms:
# Create meaningful combinations
tech_biz_combo = f"{tech} {biz}"
# Ensure the combination makes sense (not just random word pairs)
if self._is_meaningful_combination(tech, biz):
combined.append(tech_biz_combo)
# Add contextual market combinations
if target_market:
market_words = [word for word in target_market.split() if len(word) > 3][:2]
for market_word in market_words:
for term in core_terms[:2]:
# Create market-specific queries that maintain context
market_combo = f"{term} {market_word}"
if self._is_meaningful_combination(term, market_word):
combined.append(market_combo)
# Remove duplicates and filter for relevance
unique_combined = list(dict.fromkeys(combined)) # Preserve order while removing duplicates
# Filter out combinations that are too generic or meaningless
filtered_combined = [combo for combo in unique_combined if self._is_contextually_relevant(combo)]
return filtered_combined[:5] # Limit to 5 most relevant combined queries
def _is_meaningful_combination(self, term1: str, term2: str) -> bool:
"""Check if two terms create a meaningful combination"""
# Avoid combining very similar terms
if term1.lower() == term2.lower():
return False
# Avoid combining terms that are substrings of each other
if term1.lower() in term2.lower() or term2.lower() in term1.lower():
return False
# Check for semantic compatibility
tech_terms = {'ai', 'ml', 'algorithm', 'neural', 'deep', 'machine', 'learning', 'data', 'model', 'system', 'network', 'automation', 'software', 'digital', 'technology', 'computer', 'intelligence'}
business_terms = {'business', 'market', 'customer', 'service', 'product', 'solution', 'company', 'industry', 'enterprise', 'commercial', 'professional', 'management', 'strategy'}
term1_is_tech = any(tech in term1.lower() for tech in tech_terms)
term2_is_tech = any(tech in term2.lower() for tech in tech_terms)
term1_is_business = any(biz in term1.lower() for biz in business_terms)
term2_is_business = any(biz in term2.lower() for biz in business_terms)
# Good combinations: tech + business, or related terms
if (term1_is_tech and term2_is_business) or (term1_is_business and term2_is_tech):
return True
# Both are tech terms or both are business terms - can be good if not too similar
if (term1_is_tech and term2_is_tech) or (term1_is_business and term2_is_business):
return len(set(term1.lower().split()) & set(term2.lower().split())) == 0 # No overlapping words
return True # Default to allowing the combination
def _is_contextually_relevant(self, query: str) -> bool:
"""Check if a query maintains contextual relevance"""
words = query.lower().split()
# Filter out queries that are too short or too long
if len(words) < 1 or len(words) > 5:
return False
        # Filter out queries with only stop words (use the full NLTK list when available)
        stop_words = self.stop_words or {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        meaningful_words = [word for word in words if word not in stop_words and len(word) > 2]
if len(meaningful_words) == 0:
return False
# Ensure at least one word is substantial (length > 3)
if not any(len(word) > 3 for word in meaningful_words):
return False
return True
def _calculate_confidence(self, concepts: Dict[str, Any]) -> float:
"""Calculate confidence score for the extraction"""
total_concepts = sum(len(v) if isinstance(v, list) else 0 for v in concepts.values())
# Base confidence on number of extracted concepts
if total_concepts >= 15:
return 0.9
elif total_concepts >= 10:
return 0.8
elif total_concepts >= 5:
return 0.7
else:
return 0.6
async def _basic_query_enhancement(self, feature_description: str, target_market: str) -> Dict[str, Any]:
"""Fallback basic query enhancement when NLP models fail"""
# Simple word extraction
words = re.findall(r'\b\w{3,}\b', feature_description.lower())
words = [word for word in words if word not in self.stop_words][:5]
return {
'original_description': feature_description,
'target_market': target_market,
'extracted_concepts': {'core_terms': words},
'enhanced_queries': {
'core_concepts': words,
'semantic_variations': [],
'synonym_expansions': [],
'industry_specific': [],
'market_focused': [target_market] if target_market else [],
'combined_queries': [' '.join(words[:2])] if len(words) >= 2 else words
},
'query_metadata': {
'total_queries': len(words) + (1 if target_market else 0),
'confidence_score': 0.5,
'processing_method': 'basic_fallback'
}
}
def get_optimized_queries_for_platform(self, enhanced_data: Dict[str, Any], platform: str) -> List[str]:
"""
Get platform-optimized queries from enhanced data
Args:
enhanced_data: Result from enhance_query()
platform: Target platform ('news', 'reddit', etc.)
Returns:
List of optimized queries for the specific platform
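        Example (illustrative):
            enhancer.get_optimized_queries_for_platform(enhanced, 'reddit')
            # e.g. ['voice ordering restaurants', 'speech ordering', ...]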
"""
queries = enhanced_data.get('enhanced_queries', {})
        platform_strategies = {
            'news': ['core_concepts', 'industry_specific', 'combined_queries'],
            'reddit': ['semantic_variations', 'market_focused', 'core_concepts'],
            # note: enhanced_queries has no 'business_terms' key, so use
            # combined_queries for LinkedIn instead
            'linkedin': ['industry_specific', 'combined_queries', 'market_focused']
        }
strategy = platform_strategies.get(platform, ['core_concepts', 'combined_queries'])
optimized_queries = []
for strategy_key in strategy:
if strategy_key in queries:
query_list = queries[strategy_key]
if isinstance(query_list, list):
optimized_queries.extend(query_list)
else:
optimized_queries.append(str(query_list))
# Remove duplicates while preserving order
seen = set()
unique_queries = []
for query in optimized_queries:
if query not in seen:
seen.add(query)
unique_queries.append(query)
return unique_queries[:8] # Limit to 8 queries per platform
# Example usage and testing
async def test_nlp_enhancer():
"""Test the NLP query enhancer"""
enhancer = NLPQueryEnhancer()
# Test query enhancement
feature = "AI-powered voice ordering system for restaurants"
market = "small to medium restaurants, food service industry"
print("Testing NLP Query Enhancement...")
enhanced = await enhancer.enhance_query(feature, market)
print(f"Original: {enhanced['original_description']}")
print(f"Market: {enhanced['target_market']}")
print(f"Core concepts: {enhanced['extracted_concepts']['core_terms']}")
print(f"Confidence: {enhanced['query_metadata']['confidence_score']}")
# Test platform-specific queries
for platform in ['news', 'reddit']:
queries = enhancer.get_optimized_queries_for_platform(enhanced, platform)
print(f"{platform.title()} queries: {queries[:3]}")
return enhanced
if __name__ == "__main__":
asyncio.run(test_nlp_enhancer())