"""
NLP Query Enhancer
Advanced query processing using spaCy and NLTK for better search results
"""
import spacy
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Set, Tuple, Any
import re
import logging
from collections import defaultdict, Counter
import asyncio
import aiohttp
logger = logging.getLogger(__name__)
class NLPQueryEnhancer:
    """Enhanced query processing using advanced NLP techniques"""

    def __init__(self):
        """Set up lazy model slots and the static industry ontology.

        Heavy NLP resources (spaCy, sentence-transformers, NLTK data) are
        NOT loaded here; call ``initialize()`` before first use.
        """
        # Model handles are populated lazily by initialize().
        self.nlp = None
        self.sentence_model = None
        self.lemmatizer = None
        self.stop_words = None
        self._initialized = False

        # Industry-specific term mappings: domain -> seed term -> expansions.
        technology_terms = {
            'ai': ['artificial intelligence', 'machine learning', 'deep learning', 'neural networks'],
            'app': ['application', 'software', 'mobile app', 'web app', 'platform'],
            'api': ['application programming interface', 'web service', 'endpoint'],
            'saas': ['software as a service', 'cloud software', 'subscription software'],
            'iot': ['internet of things', 'connected devices', 'smart devices'],
            'blockchain': ['distributed ledger', 'cryptocurrency', 'smart contracts'],
        }
        business_terms = {
            'startup': ['new business', 'entrepreneur', 'venture', 'company'],
            'revenue': ['income', 'earnings', 'sales', 'profit'],
            'customer': ['client', 'user', 'consumer', 'buyer'],
            'market': ['industry', 'sector', 'segment', 'niche'],
            'competition': ['competitor', 'rival', 'alternative', 'substitute'],
        }
        health_terms = {
            'fitness': ['exercise', 'workout', 'training', 'physical activity'],
            'nutrition': ['diet', 'food', 'eating', 'meal planning'],
            'wellness': ['health', 'wellbeing', 'lifestyle', 'self-care'],
            'medical': ['healthcare', 'clinical', 'treatment', 'diagnosis'],
        }
        finance_terms = {
            'fintech': ['financial technology', 'digital banking', 'payment systems'],
            'investment': ['portfolio', 'trading', 'stocks', 'assets'],
            'banking': ['financial services', 'credit', 'loans', 'deposits'],
            'insurance': ['coverage', 'policy', 'claims', 'risk management'],
        }
        self.industry_ontology = {
            'technology': technology_terms,
            'business': business_terms,
            'health': health_terms,
            'finance': finance_terms,
        }
async def initialize(self):
"""Initialize NLP models asynchronously"""
if self._initialized:
return
try:
logger.info("Initializing NLP models...")
# Download required NLTK data
await self._download_nltk_data()
# Initialize spaCy model
try:
self.nlp = spacy.load("en_core_web_sm")
except OSError:
logger.warning("spaCy model 'en_core_web_sm' not found. Using basic tokenization.")
self.nlp = None
# Initialize sentence transformer
try:
self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
logger.warning(f"Could not load sentence transformer: {e}")
self.sentence_model = None
# Initialize NLTK components
self.lemmatizer = WordNetLemmatizer()
self.stop_words = set(stopwords.words('english'))
self._initialized = True
logger.info("NLP models initialized successfully")
except Exception as e:
logger.error(f"Error initializing NLP models: {e}")
# Set basic fallbacks
self.lemmatizer = WordNetLemmatizer() if 'WordNetLemmatizer' in locals() else None
self.stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
async def _download_nltk_data(self):
    """Ensure required NLTK resources are present, downloading any missing.

    BUG FIX: the old code probed every resource under ``tokenizers/…``,
    but only 'punkt' lives there — stopwords/wordnet/omw-1.4 are corpora
    and the POS tagger sits under ``taggers/`` — so ``nltk.data.find()``
    always raised and every resource was re-downloaded on every call.
    Probe each resource at its real data path instead.
    """
    required_data = {
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
        'wordnet': 'corpora/wordnet',
        'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
        'omw-1.4': 'corpora/omw-1.4',
    }
    for name, path in required_data.items():
        try:
            nltk.data.find(path)
        except LookupError:
            try:
                nltk.download(name, quiet=True)
            except Exception as e:
                logger.warning(f"Could not download NLTK data '{name}': {e}")
async def enhance_query(self, feature_description: str, target_market: str = "") -> Dict[str, Any]:
"""
Enhance a feature description into multiple optimized search queries
Args:
feature_description: Original feature description
target_market: Target market description
Returns:
Enhanced query data with multiple search strategies
"""
await self.initialize()
try:
# Extract key concepts using NLP
concepts = await self._extract_key_concepts(feature_description)
# Generate semantic variations
semantic_queries = await self._generate_semantic_queries(feature_description)
# Expand with synonyms and related terms
expanded_queries = await self._expand_with_synonyms(concepts)
# Add industry-specific terms
industry_queries = await self._add_industry_terms(concepts, target_market)
# Generate market-specific queries
market_queries = await self._generate_market_queries(feature_description, target_market)
# Combine and rank queries
all_queries = {
'core_concepts': concepts['core_terms'],
'semantic_variations': semantic_queries,
'synonym_expansions': expanded_queries,
'industry_specific': industry_queries,
'market_focused': market_queries,
'combined_queries': self._combine_queries(concepts, target_market)
}
return {
'original_description': feature_description,
'target_market': target_market,
'extracted_concepts': concepts,
'enhanced_queries': all_queries,
'query_metadata': {
'total_queries': sum(len(v) if isinstance(v, list) else 1 for v in all_queries.values()),
'confidence_score': self._calculate_confidence(concepts),
'processing_method': 'advanced_nlp' if self.nlp else 'basic_processing'
}
}
except Exception as e:
logger.error(f"Error enhancing query: {e}")
# Fallback to basic processing
return await self._basic_query_enhancement(feature_description, target_market)
async def _extract_key_concepts(self, text: str) -> Dict[str, Any]:
    """Extract key concepts using spaCy NLP.

    Args:
        text: Free-form feature description to mine for concepts.

    Returns:
        Dict of concept categories (core_terms, entities, technologies,
        business_terms, action_verbs, descriptors), each a deduplicated
        list capped at 5 entries.
    """
    concepts = {
        'core_terms': [],
        'entities': [],
        'technologies': [],
        'business_terms': [],
        'action_verbs': [],
        'descriptors': []
    }
    if self.nlp:
        # Use spaCy for advanced processing
        doc = self.nlp(text)
        # Extract named entities (orgs/products only; other labels ignored)
        for ent in doc.ents:
            if ent.label_ in ['ORG', 'PRODUCT', 'TECHNOLOGY']:
                concepts['entities'].append(ent.text.lower())
        # Extract key terms by POS tags
        for token in doc:
            # Skip stop words, punctuation and very short tokens.
            if token.is_stop or token.is_punct or len(token.text) < 3:
                continue
            lemma = token.lemma_.lower()
            # Nouns only count when they carry a core grammatical role.
            if token.pos_ == 'NOUN' and token.dep_ in ['nsubj', 'dobj', 'pobj']:
                concepts['core_terms'].append(lemma)
            elif token.pos_ == 'VERB' and token.tag_ not in ['MD', 'VBZ']:  # Not auxiliary verbs
                concepts['action_verbs'].append(lemma)
            elif token.pos_ == 'ADJ':
                concepts['descriptors'].append(lemma)
        # Extract noun phrases
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) <= 3:  # Keep phrases short
                concepts['core_terms'].append(chunk.text.lower())
    else:
        # Fallback to basic processing (NLTK tokenization only).
        words = word_tokenize(text.lower())
        # NOTE(review): when self.lemmatizer is None, non-alphabetic tokens
        # are not filtered out here — confirm this asymmetry is intended.
        if self.lemmatizer:
            words = [self.lemmatizer.lemmatize(word) for word in words if word.isalpha()]
        # Filter out stop words
        words = [word for word in words if word not in self.stop_words and len(word) > 2]
        concepts['core_terms'] = words[:10]  # Limit to top 10
    # Categorize terms into technologies / business_terms buckets.
    concepts = self._categorize_terms(concepts)
    # Remove duplicates and sort by importance (dict.fromkeys keeps order)
    for key in concepts:
        if isinstance(concepts[key], list):
            concepts[key] = list(dict.fromkeys(concepts[key]))[:5]  # Top 5 per category
    return concepts
def _categorize_terms(self, concepts: Dict[str, Any]) -> Dict[str, Any]:
"""Categorize terms into technology, business, etc."""
tech_keywords = {'ai', 'app', 'software', 'platform', 'api', 'system', 'tool', 'technology', 'digital', 'online', 'mobile', 'web'}
business_keywords = {'market', 'customer', 'business', 'service', 'product', 'solution', 'company', 'industry', 'revenue', 'profit'}
all_terms = concepts['core_terms'] + concepts['entities']
for term in all_terms:
term_lower = term.lower()
if any(tech in term_lower for tech in tech_keywords):
concepts['technologies'].append(term)
elif any(biz in term_lower for biz in business_keywords):
concepts['business_terms'].append(term)
return concepts
async def _generate_semantic_queries(self, text: str) -> List[str]:
"""Generate semantically similar queries using sentence transformers"""
if not self.sentence_model:
return []
try:
# Generate variations by paraphrasing key concepts
base_embedding = self.sentence_model.encode([text])
# Create variations by replacing key terms with synonyms
variations = []
words = text.split()
for i, word in enumerate(words):
if len(word) > 4 and word.lower() not in self.stop_words:
# Try to find synonyms
synonyms = self._get_wordnet_synonyms(word)
for synonym in synonyms[:2]: # Limit to 2 synonyms per word
new_text = words.copy()
new_text[i] = synonym
variations.append(' '.join(new_text))
return variations[:5] # Return top 5 variations
except Exception as e:
logger.warning(f"Error generating semantic queries: {e}")
return []
async def _expand_with_synonyms(self, concepts: Dict[str, Any]) -> List[str]:
"""Expand core terms with synonyms using WordNet"""
expanded = []
for term in concepts['core_terms'][:5]: # Limit to top 5 terms
synonyms = self._get_wordnet_synonyms(term)
if synonyms:
# Create queries with synonyms
expanded.extend(synonyms[:3]) # Top 3 synonyms per term
return list(set(expanded)) # Remove duplicates
def _get_wordnet_synonyms(self, word: str) -> List[str]:
"""Get synonyms from WordNet"""
synonyms = set()
try:
for syn in wordnet.synsets(word):
for lemma in syn.lemmas():
synonym = lemma.name().replace('_', ' ')
if synonym.lower() != word.lower() and len(synonym) > 2:
synonyms.add(synonym)
except Exception as e:
logger.debug(f"Error getting synonyms for '{word}': {e}")
return list(synonyms)[:5] # Limit to 5 synonyms
async def _add_industry_terms(self, concepts: Dict[str, Any], target_market: str) -> List[str]:
"""Add industry-specific terminology"""
industry_queries = []
# Detect industry from concepts and target market
detected_industries = self._detect_industries(concepts, target_market)
for industry in detected_industries:
if industry in self.industry_ontology:
ontology = self.industry_ontology[industry]
# Match concepts to industry terms
for concept in concepts['core_terms'][:3]:
for term, expansions in ontology.items():
if concept.lower() in term or term in concept.lower():
industry_queries.extend(expansions[:2]) # Add 2 expansions
return list(set(industry_queries))[:8] # Limit to 8 industry terms
def _detect_industries(self, concepts: Dict[str, Any], target_market: str) -> List[str]:
"""Detect relevant industries from concepts and target market"""
industries = []
all_text = ' '.join(concepts['core_terms'] + [target_market]).lower()
# Simple keyword matching for industry detection
industry_keywords = {
'technology': ['app', 'software', 'ai', 'tech', 'digital', 'platform', 'api'],
'health': ['fitness', 'health', 'medical', 'wellness', 'nutrition', 'exercise'],
'business': ['business', 'startup', 'company', 'market', 'customer'],
'finance': ['finance', 'money', 'payment', 'banking', 'investment']
}
for industry, keywords in industry_keywords.items():
if any(keyword in all_text for keyword in keywords):
industries.append(industry)
return industries[:2] # Limit to 2 industries
async def _generate_market_queries(self, feature_description: str, target_market: str) -> List[str]:
"""Generate market-focused search queries"""
if not target_market:
return []
market_queries = []
# Extract key terms from feature description
feature_words = [word for word in feature_description.split()
if len(word) > 3 and word.lower() not in self.stop_words][:3]
# Extract market segments
market_words = [word for word in target_market.split()
if len(word) > 3 and word.lower() not in self.stop_words][:3]
# Combine feature + market
for feature_word in feature_words:
for market_word in market_words:
market_queries.append(f"{feature_word} {market_word}")
market_queries.append(f"{market_word} {feature_word}")
# Add market-specific patterns
market_patterns = [
f"{target_market} solutions",
f"{target_market} tools",
f"{target_market} software",
f"{target_market} apps",
f"{target_market} technology"
]
market_queries.extend(market_patterns)
return list(set(market_queries))[:6] # Limit to 6 market queries
def _combine_queries(self, concepts: Dict[str, Any], target_market: str) -> List[str]:
"""Combine different concepts into comprehensive, contextually relevant queries"""
combined = []
core_terms = concepts['core_terms'][:3]
technologies = concepts['technologies'][:2]
business_terms = concepts['business_terms'][:2]
# Create contextual phrase combinations (preserve meaning)
if len(core_terms) >= 2:
# Keep related terms together
main_concept = ' '.join(core_terms[:2])
combined.append(main_concept)
# Add third term only if it's contextually related
if len(core_terms) >= 3:
extended_concept = ' '.join(core_terms[:3])
# Only add if it makes semantic sense
if len(extended_concept.split()) <= 4: # Avoid overly long phrases
combined.append(extended_concept)
# Combine technology + business terms meaningfully
for tech in technologies:
for biz in business_terms:
# Create meaningful combinations
tech_biz_combo = f"{tech} {biz}"
# Ensure the combination makes sense (not just random word pairs)
if self._is_meaningful_combination(tech, biz):
combined.append(tech_biz_combo)
# Add contextual market combinations
if target_market:
market_words = [word for word in target_market.split() if len(word) > 3][:2]
for market_word in market_words:
for term in core_terms[:2]:
# Create market-specific queries that maintain context
market_combo = f"{term} {market_word}"
if self._is_meaningful_combination(term, market_word):
combined.append(market_combo)
# Remove duplicates and filter for relevance
unique_combined = list(dict.fromkeys(combined)) # Preserve order while removing duplicates
# Filter out combinations that are too generic or meaningless
filtered_combined = [combo for combo in unique_combined if self._is_contextually_relevant(combo)]
return filtered_combined[:5] # Limit to 5 most relevant combined queries
def _is_meaningful_combination(self, term1: str, term2: str) -> bool:
"""Check if two terms create a meaningful combination"""
# Avoid combining very similar terms
if term1.lower() == term2.lower():
return False
# Avoid combining terms that are substrings of each other
if term1.lower() in term2.lower() or term2.lower() in term1.lower():
return False
# Check for semantic compatibility
tech_terms = {'ai', 'ml', 'algorithm', 'neural', 'deep', 'machine', 'learning', 'data', 'model', 'system', 'network', 'automation', 'software', 'digital', 'technology', 'computer', 'intelligence'}
business_terms = {'business', 'market', 'customer', 'service', 'product', 'solution', 'company', 'industry', 'enterprise', 'commercial', 'professional', 'management', 'strategy'}
term1_is_tech = any(tech in term1.lower() for tech in tech_terms)
term2_is_tech = any(tech in term2.lower() for tech in tech_terms)
term1_is_business = any(biz in term1.lower() for biz in business_terms)
term2_is_business = any(biz in term2.lower() for biz in business_terms)
# Good combinations: tech + business, or related terms
if (term1_is_tech and term2_is_business) or (term1_is_business and term2_is_tech):
return True
# Both are tech terms or both are business terms - can be good if not too similar
if (term1_is_tech and term2_is_tech) or (term1_is_business and term2_is_business):
return len(set(term1.lower().split()) & set(term2.lower().split())) == 0 # No overlapping words
return True # Default to allowing the combination
def _is_contextually_relevant(self, query: str) -> bool:
"""Check if a query maintains contextual relevance"""
words = query.lower().split()
# Filter out queries that are too short or too long
if len(words) < 1 or len(words) > 5:
return False
# Filter out queries with only stop words
stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
meaningful_words = [word for word in words if word not in stop_words and len(word) > 2]
if len(meaningful_words) == 0:
return False
# Ensure at least one word is substantial (length > 3)
if not any(len(word) > 3 for word in meaningful_words):
return False
return True
def _calculate_confidence(self, concepts: Dict[str, Any]) -> float:
"""Calculate confidence score for the extraction"""
total_concepts = sum(len(v) if isinstance(v, list) else 0 for v in concepts.values())
# Base confidence on number of extracted concepts
if total_concepts >= 15:
return 0.9
elif total_concepts >= 10:
return 0.8
elif total_concepts >= 5:
return 0.7
else:
return 0.6
async def _basic_query_enhancement(self, feature_description: str, target_market: str) -> Dict[str, Any]:
"""Fallback basic query enhancement when NLP models fail"""
# Simple word extraction
words = re.findall(r'\b\w{3,}\b', feature_description.lower())
words = [word for word in words if word not in self.stop_words][:5]
return {
'original_description': feature_description,
'target_market': target_market,
'extracted_concepts': {'core_terms': words},
'enhanced_queries': {
'core_concepts': words,
'semantic_variations': [],
'synonym_expansions': [],
'industry_specific': [],
'market_focused': [target_market] if target_market else [],
'combined_queries': [' '.join(words[:2])] if len(words) >= 2 else words
},
'query_metadata': {
'total_queries': len(words) + (1 if target_market else 0),
'confidence_score': 0.5,
'processing_method': 'basic_fallback'
}
}
def get_optimized_queries_for_platform(self, enhanced_data: Dict[str, Any], platform: str) -> List[str]:
    """
    Get platform-optimized queries from enhanced data

    Args:
        enhanced_data: Result from enhance_query()
        platform: Target platform ('news', 'reddit', etc.)

    Returns:
        List of optimized queries for the specific platform
    """
    groups = enhanced_data.get('enhanced_queries', {})
    # Each platform prefers a different ordering of query groups.
    platform_strategies = {
        'news': ['core_concepts', 'industry_specific', 'combined_queries'],
        'reddit': ['semantic_variations', 'market_focused', 'core_concepts'],
        'linkedin': ['industry_specific', 'business_terms', 'market_focused']
    }
    chosen = platform_strategies.get(platform, ['core_concepts', 'combined_queries'])
    ordered = []
    for group_key in chosen:
        if group_key not in groups:
            continue
        value = groups[group_key]
        if isinstance(value, list):
            ordered.extend(value)
        else:
            ordered.append(str(value))
    # Dedupe while preserving first-seen order.
    deduped = list(dict.fromkeys(ordered))
    return deduped[:8]  # Limit to 8 queries per platform
# Example usage and testing
async def test_nlp_enhancer():
    """Smoke-test the enhancer end to end and print a short summary."""
    enhancer = NLPQueryEnhancer()
    feature = "AI-powered voice ordering system for restaurants"
    market = "small to medium restaurants, food service industry"
    print("Testing NLP Query Enhancement...")
    enhanced = await enhancer.enhance_query(feature, market)
    print(f"Original: {enhanced['original_description']}")
    print(f"Market: {enhanced['target_market']}")
    print(f"Core concepts: {enhanced['extracted_concepts']['core_terms']}")
    print(f"Confidence: {enhanced['query_metadata']['confidence_score']}")
    # Show the first few platform-tailored queries for a couple of platforms.
    for platform in ['news', 'reddit']:
        platform_queries = enhancer.get_optimized_queries_for_platform(enhanced, platform)
        print(f"{platform.title()} queries: {platform_queries[:3]}")
    return enhanced
# Manual smoke-test entry point: run this module directly to exercise the
# enhancer against a sample feature/market pair.
if __name__ == "__main__":
    asyncio.run(test_nlp_enhancer())