updating logging
mediaunmasked/analyzers/bias_analyzer.py
CHANGED
@@ -140,27 +140,42 @@ class BiasAnalyzer:
     def _analyze_with_llm(self, text: str) -> Dict[str, Any]:
         """Analyze bias using LLM zero-shot classification with batch processing."""
         try:
+            logger.info("\n" + "="*50)
+            logger.info("BIAS ANALYSIS STARTED")
+            logger.info("="*50)
+
             # Define bias categories
             bias_categories = [
                 "left-wing bias",
                 "right-wing bias",
                 "neutral/balanced perspective"
             ]
+            logger.info("Using categories for analysis:")
+            for cat in bias_categories:
+                logger.info(f"  - {cat}")

             # Clean and prepare text
+            logger.info("\nCleaning and preparing text...")
             cleaned_text = text.replace('$!/$', '').replace('##', '').replace('#', '')
             cleaned_text = '\n'.join(line for line in cleaned_text.split('\n')
                                      if not line.startswith('[') and not line.startswith('More on'))
+            logger.info(f"Text prepared - Length: {len(cleaned_text)} characters")

             # Split into larger chunks (4000 chars) for fewer API calls
             chunks = [cleaned_text[i:i+4000] for i in range(0, len(cleaned_text), 4000)]
+            logger.info(f"Split text into {len(chunks)} chunks for processing")

             # Process chunks in batches
             chunk_scores = []
             flagged_phrases = []

-            for chunk in chunks:
+            for i, chunk in enumerate(chunks, 1):
+                logger.info(f"\n{'-'*30}")
+                logger.info(f"Processing chunk {i}/{len(chunks)}")
+                logger.info(f"Chunk length: {len(chunk)} characters")
+
                 # Analyze chunk as a whole first
+                logger.info("Analyzing chunk for overall bias...")
                 chunk_result = self.classifier(
                     chunk,
                     bias_categories,
@@ -172,16 +187,24 @@ class BiasAnalyzer:
                     for label, score in zip(chunk_result['labels'], chunk_result['scores'])
                 })

+                logger.info("Chunk bias scores:")
+                for label, score in chunk_scores[-1].items():
+                    logger.info(f"  - {label}: {score:.3f}")
+
                 # Only analyze individual sentences if chunk shows strong bias
                 max_chunk_score = max(chunk_result['scores'])
                 if max_chunk_score > 0.6:
+                    logger.info(f"Strong bias detected (score: {max_chunk_score:.3f}), analyzing individual sentences...")
                     sentences = sent_tokenize(chunk)
+                    logger.info(f"Found {len(sentences)} sentences to analyze")
+
                     # Filter sentences for analysis (longer, potentially more meaningful ones)
                     relevant_sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
+                    logger.info(f"Filtered to {len(relevant_sentences)} relevant sentences")

                     # Process sentences in batches of 8
-                    for
-                    batch = relevant_sentences[
+                    for j in range(0, len(relevant_sentences), 8):
+                        batch = relevant_sentences[j:j+8]
                         try:
                             batch_results = self.classifier(
                                 batch,
@@ -196,6 +219,8 @@
                             for sentence, result in zip(batch, batch_results):
                                 max_score = max(result['scores'])
                                 if max_score > 0.8 and result['labels'][0] != "neutral/balanced perspective":
+                                    logger.info(f"Found biased sentence (score: {max_score:.3f}, type: {result['labels'][0]}):")
+                                    logger.info(f"  \"{sentence}\"")
                                     flagged_phrases.append({
                                         "text": sentence,
                                         "type": result['labels'][0],
@@ -208,6 +233,7 @@
                             continue

             # Aggregate scores across chunks
+            logger.info("\nAggregating scores across all chunks...")
             aggregated_scores = {
                 category: np.mean([
                     scores[category]
@@ -216,6 +242,10 @@
                 for category in bias_categories
             }

+            logger.info("\nFinal aggregated scores:")
+            for category, score in aggregated_scores.items():
+                logger.info(f"  - {category}: {score:.3f}")
+
             # Calculate bias metrics
             left_score = aggregated_scores["left-wing bias"]
             right_score = aggregated_scores["right-wing bias"]
@@ -223,6 +253,7 @@

             # Calculate bias score (-1 to 1)
             bias_score = (right_score - left_score) / max(right_score + left_score, 0.0001)
+            logger.info(f"\nRaw bias score: {bias_score:.3f}")

             # Determine bias label
             if bias_score < -0.6:
@@ -240,8 +271,11 @@
             else:
                 bias = "Neutral"

+            logger.info(f"Determined bias label: {bias}")
+
             # Calculate bias percentage (0-100)
             bias_percentage = min(100, abs(bias_score * 100))
+            logger.info(f"Bias percentage: {bias_percentage:.1f}%")

             # Sort and limit flagged phrases
             sorted_phrases = sorted(flagged_phrases, key=lambda x: x['score'], reverse=True)
@@ -255,6 +289,10 @@
                 if len(unique_phrases) >= 5:
                     break

+            logger.info(f"\nFlagged {len(unique_phrases)} unique biased phrases")
+
+            logger.info("\nBias analysis completed successfully")
+
             return {
                 "bias": bias,
                 "bias_score": round(bias_score, 2),
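For reference, the bias-score arithmetic above can be checked by hand; the aggregated scores in this minimal sketch are hypothetical, not values from this diff:

    # Hypothetical aggregated zero-shot scores for one article.
    left_score, right_score = 0.20, 0.50
    bias_score = (right_score - left_score) / max(right_score + left_score, 0.0001)
    bias_percentage = min(100, abs(bias_score * 100))
    print(round(bias_score, 2), round(bias_percentage, 1))  # 0.43 42.9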
mediaunmasked/analyzers/evidence_analyzer.py
CHANGED
@@ -71,19 +71,27 @@ class EvidenceAnalyzer:
     def _analyze_with_llm(self, text: str) -> Dict[str, Any]:
         """Analyze evidence using LLM."""
         try:
+            logger.info("\n" + "="*50)
+            logger.info("EVIDENCE ANALYSIS STARTED")
+            logger.info("="*50)
+
             # Clean the text of formatting markers
+            logger.info("Cleaning and preparing text...")
             cleaned_text = text.replace('$!/$', '').replace('##', '').replace('#', '')
             cleaned_text = '\n'.join(line for line in cleaned_text.split('\n')
                                      if not line.startswith('[') and not line.startswith('More on'))
+            logger.info(f"Text prepared - Length: {len(cleaned_text)} characters")

             # Download NLTK data if needed
             try:
                 nltk.data.find('tokenizers/punkt')
             except LookupError:
+                logger.info("Downloading required NLTK data...")
                 nltk.download('punkt')

             # Split text into chunks
             chunks = [cleaned_text[i:i+2000] for i in range(0, len(cleaned_text), 2000)]
+            logger.info(f"Split text into {len(chunks)} chunks for processing")

             # Categories for evidence classification
             evidence_categories = [
@@ -95,15 +103,28 @@
                 "opinion statement"
             ]

+            logger.info("\nUsing evidence categories:")
+            for cat in evidence_categories:
+                logger.info(f"  - {cat}")
+
             chunk_scores = []
             flagged_phrases = []

-            for chunk in chunks:
+            for i, chunk in enumerate(chunks, 1):
+                logger.info(f"\n{'-'*30}")
+                logger.info(f"Processing chunk {i}/{len(chunks)}")
+                logger.info(f"Chunk length: {len(chunk)} characters")
+
                 # Analyze each sentence in the chunk
                 sentences = sent_tokenize(chunk)
+                logger.info(f"Found {len(sentences)} sentences to analyze")
+
+                sentence_count = 0
+                strong_evidence_count = 0

                 for sentence in sentences:
                     if len(sentence.strip()) > 10:
+                        sentence_count += 1
                         # Classify the type of evidence
                         result = self.classifier(
                             sentence.strip(),
@@ -141,17 +162,28 @@
                             marker in sentence.lower()
                             for marker in ['more on this story', 'click here', 'read more']
                         ):
+                            strong_evidence_count += 1
+                            logger.info(f"Found strong evidence (score: {strong_evidence:.3f}):")
+                            logger.info(f"  \"{sentence.strip()}\"")
                             flagged_phrases.append({
                                 'text': sentence.strip(),
                                 'type': 'strong_evidence',
                                 'score': strong_evidence
                             })
+
+                logger.info(f"Processed {sentence_count} sentences in chunk {i}")
+                logger.info(f"Found {strong_evidence_count} sentences with strong evidence")

             # Calculate overall evidence score
+            logger.info("\nCalculating final evidence scores...")
             if chunk_scores:
                 avg_strong = np.mean([s['strong_evidence'] for s in chunk_scores])
                 avg_weak = np.mean([s['weak_evidence'] for s in chunk_scores])

+                logger.info("Average evidence scores:")
+                logger.info(f"  - Strong evidence: {avg_strong:.3f}")
+                logger.info(f"  - Weak evidence: {avg_weak:.3f}")
+
                 # Evidence score formula:
                 # - Reward strong evidence (70% weight)
                 # - Penalize weak/unsubstantiated claims (30% weight)
@@ -162,6 +194,9 @@
                 ) * 100)
             else:
                 evidence_score = 0
+                logger.warning("No scores available, defaulting to 0")
+
+            logger.info(f"Final evidence score: {evidence_score:.1f}")

             # Sort and select top evidence phrases
             sorted_phrases = sorted(
@@ -169,6 +204,7 @@
                 key=lambda x: x['score'],
                 reverse=True
             )
+
             # Filter out formatting text and duplicates
             unique_phrases = []
             seen = set()
@@ -183,6 +219,10 @@
                 if len(unique_phrases) >= 5:
                     break

+            logger.info(f"\nFlagged {len(unique_phrases)} unique evidence-based phrases")
+
+            logger.info("\nEvidence analysis completed successfully")
+
             return {
                 "evidence_based_score": round(evidence_score, 1),
                 "flagged_phrases": unique_phrases
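For reference, the avg_strong / avg_weak values logged above are plain means over the per-chunk score dictionaries; a minimal sketch with hypothetical chunk scores (the exact evidence-score formula is elided in this hunk and is not reproduced here):

    import numpy as np

    # Hypothetical per-chunk scores shaped like the chunk_scores entries above.
    chunk_scores = [
        {'strong_evidence': 0.62, 'weak_evidence': 0.21},
        {'strong_evidence': 0.48, 'weak_evidence': 0.35},
    ]
    avg_strong = np.mean([s['strong_evidence'] for s in chunk_scores])  # 0.55
    avg_weak = np.mean([s['weak_evidence'] for s in chunk_scores])      # 0.28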
mediaunmasked/analyzers/headline_analyzer.py
CHANGED
@@ -82,6 +82,12 @@ class HeadlineAnalyzer:
     def _analyze_section(self, headline: str, section: str) -> Dict[str, Any]:
         """Analyze a single section for headline accuracy and sensationalism."""
         try:
+            logger.info("\n" + "-"*30)
+            logger.info("ANALYZING SECTION")
+            logger.info("-"*30)
+            logger.info(f"Headline: {headline}")
+            logger.info(f"Section length: {len(section)} characters")
+
             # Download NLTK data if needed
             try:
                 nltk.data.find('tokenizers/punkt')
@@ -89,10 +95,12 @@
                 nltk.download('punkt')

             sentences = sent_tokenize(section)
+            logger.info(f"Found {len(sentences)} sentences in section")
+
             if not sentences:
                 logger.warning("No sentences found in section")
                 return {
-                    "accuracy_score": 50.0,
+                    "accuracy_score": 50.0,
                     "flagged_phrases": [],
                     "detailed_scores": {
                         "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
@@ -109,7 +117,7 @@
                 "accurate headline"
             ]

+            logger.info("Checking headline for sensationalism...")
             sensationalism_result = self.zero_shot(
                 headline,
                 sensationalism_categories,
@@ -120,14 +128,16 @@
                 label: score
                 for label, score in zip(sensationalism_result['labels'], sensationalism_result['scores'])
             }
+            logger.info(f"Sensationalism scores: {sensationalism_scores}")

             # Filter relevant sentences (longer than 20 chars)
             relevant_sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
+            logger.info(f"Found {len(relevant_sentences)} relevant sentences after filtering")

             if not relevant_sentences:
                 logger.warning("No relevant sentences found in section")
                 return {
-                    "accuracy_score": 50.0,
+                    "accuracy_score": 50.0,
                     "flagged_phrases": [],
                     "detailed_scores": {
                         "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
@@ -140,6 +150,7 @@
             flagged_phrases = []
             batch_size = 8

+            logger.info("Processing sentences for contradictions...")
             for i in range(0, len(relevant_sentences), batch_size):
                 batch = relevant_sentences[i:i+batch_size]
                 batch_inputs = [f"{headline} [SEP] {sentence}" for sentence in batch]
@@ -154,14 +165,25 @@
                     scores = {item['label']: item['score'] for item in result}
                     nli_scores.append(scores)

-                    # Flag contradictory content
-                    if scores.get('CONTRADICTION', 0) > 0.
+                    # Flag contradictory content with lower threshold
+                    if scores.get('CONTRADICTION', 0) > 0.3:  # Lowered threshold
+                        logger.info(f"Found contradictory sentence (score: {scores['CONTRADICTION']:.2f}): {sentence}")
                         flagged_phrases.append({
                             'text': sentence,
                             'type': 'Contradiction',
                             'score': scores['CONTRADICTION'],
                             'highlight': f"[CONTRADICTION] (Score: {round(scores['CONTRADICTION'] * 100, 1)}%) \"{sentence}\""
                         })
+
+                    # Flag highly sensationalized content
+                    if sensationalism_scores.get('sensationalized', 0) > 0.6 or sensationalism_scores.get('clickbait', 0) > 0.6:
+                        logger.info(f"Found sensationalized content: {sentence}")
+                        flagged_phrases.append({
+                            'text': sentence,
+                            'type': 'Sensationalized',
+                            'score': max(sensationalism_scores.get('sensationalized', 0), sensationalism_scores.get('clickbait', 0)),
+                            'highlight': f"[SENSATIONALIZED] \"{sentence}\""
+                        })

                 except Exception as batch_error:
                     logger.warning(f"Batch processing error: {str(batch_error)}")
@@ -180,6 +202,7 @@
                     ]))
                     for label in ['ENTAILMENT', 'CONTRADICTION', 'NEUTRAL']
                 }
+                logger.info(f"Average NLI scores: {avg_scores}")
             except Exception as agg_error:
                 logger.error(f"Error aggregating NLI scores: {str(agg_error)}")
                 avg_scores = {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0}
@@ -199,6 +222,7 @@
                 ) * 0.15
             }

+            logger.info(f"Accuracy components: {accuracy_components}")
             accuracy_score = sum(accuracy_components.values()) * 100

             # Validate final score
@@ -207,6 +231,7 @@
                 accuracy_score = 50.0
             else:
                 accuracy_score = float(accuracy_score)
+                logger.info(f"Final accuracy score: {accuracy_score:.1f}")

         except Exception as score_error:
             logger.error(f"Error calculating accuracy score: {str(score_error)}")
@@ -228,6 +253,8 @@
                 if len(unique_phrases) >= 5:
                     break

+            logger.info(f"Final number of flagged phrases: {len(unique_phrases)}")
+
             return {
                 "accuracy_score": accuracy_score,
                 "flagged_phrases": unique_phrases,
@@ -240,7 +267,7 @@
         except Exception as e:
             logger.error(f"Section analysis failed: {str(e)}")
             return {
-                "accuracy_score": 50.0,
+                "accuracy_score": 50.0,
                 "flagged_phrases": [],
                 "detailed_scores": {
                     "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
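For reference, the lowered CONTRADICTION threshold changes which sentences get flagged; a minimal sketch with hypothetical NLI scores (the previous threshold is truncated in the diff, so the 0.5 comparison below is only illustrative):

    # Hypothetical per-label scores for one "headline [SEP] sentence" pair.
    scores = {'ENTAILMENT': 0.20, 'CONTRADICTION': 0.45, 'NEUTRAL': 0.35}

    # With the new 0.3 threshold this sentence is flagged; a stricter
    # threshold such as 0.5 would have let it pass unflagged.
    if scores.get('CONTRADICTION', 0) > 0.3:
        print(f"flagged ({round(scores['CONTRADICTION'] * 100, 1)}%)")  # flagged (45.0%)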
mediaunmasked/analyzers/scoring.py
CHANGED
@@ -115,24 +115,47 @@ class MediaScorer:
     def calculate_media_score(self, headline: str, content: str) -> Dict[str, Any]:
         """Calculate final media credibility score."""
         try:
-            logger.info(
+            logger.info("\n" + "="*50)
+            logger.info("MEDIA SCORE CALCULATION STARTED")
+            logger.info("="*50)
+            logger.info(f"Analysis Mode: {self.analysis_mode}")

+            # Headline Analysis
+            logger.info("\n" + "-"*30)
+            logger.info("HEADLINE ANALYSIS")
+            logger.info("-"*30)
             headline_analysis = self.headline_analyzer.analyze(headline, content)
+            logger.info(f"Headline Score: {headline_analysis.get('headline_vs_content_score', 0)}")
+            logger.info(f"Flagged Phrases: {headline_analysis.get('flagged_phrases', [])}")
+
+            # Sentiment Analysis
+            logger.info("\n" + "-"*30)
+            logger.info("SENTIMENT ANALYSIS")
+            logger.info("-"*30)
             sentiment_analysis = self.sentiment_analyzer.analyze(content)
+            logger.info(f"Sentiment: {sentiment_analysis.get('sentiment', 'Unknown')}")
+            logger.info(f"Manipulation Score: {sentiment_analysis.get('manipulation_score', 0)}")
+            logger.info(f"Flagged Phrases: {sentiment_analysis.get('flagged_phrases', [])}")

-            #
-            logger.info("\n
-            logger.info(
-            logger.info(
-                Label: {bias_analysis
-                Score: {bias_analysis
-                Percentage: {bias_analysis
+            # Bias Analysis
+            logger.info("\n" + "-"*30)
+            logger.info("BIAS ANALYSIS")
+            logger.info("-"*30)
+            bias_analysis = self.bias_analyzer.analyze(content)
+            logger.info(f"""Bias Results:
+                Label: {bias_analysis.get('bias', 'Unknown')}
+                Score: {bias_analysis.get('bias_score', 0)}
+                Percentage: {bias_analysis.get('bias_percentage', 0)}%
+                Flagged Phrases: {bias_analysis.get('flagged_phrases', [])}
             """)
+
+            # Evidence Analysis
+            logger.info("\n" + "-"*30)
+            logger.info("EVIDENCE ANALYSIS")
+            logger.info("-"*30)
+            evidence_analysis = self.evidence_analyzer.analyze(content)
+            logger.info(f"Evidence Score: {evidence_analysis.get('evidence_based_score', 0)}")
+            logger.info(f"Flagged Phrases: {evidence_analysis.get('flagged_phrases', [])}")

             # Calculate component scores with NaN handling
             # For headline: 20% contradiction = 20% score (don't invert)
mediaunmasked/analyzers/sentiment_analyzer.py
CHANGED
@@ -85,18 +85,21 @@ class SentimentAnalyzer:
     def _analyze_with_llm(self, text: str) -> Dict[str, Any]:
         """Perform sentiment analysis using LLM models."""
         try:
-            logger.info("
+            logger.info("\n" + "="*50)
+            logger.info("SENTIMENT ANALYSIS STARTED")
+            logger.info("="*50)

             # Clean the text of formatting markers
+            logger.info("Cleaning and preparing text...")
             cleaned_text = text.replace('$!/$', '').replace('##', '').replace('#', '')
             cleaned_text = '\n'.join(line for line in cleaned_text.split('\n')
                                      if not line.startswith('[') and not line.startswith('More on'))

-            logger.info("Text
+            logger.info(f"Text prepared - Length: {len(cleaned_text)} characters")

             # Split text into chunks of 512 tokens (approximate)
             chunks = [cleaned_text[i:i+2000] for i in range(0, len(cleaned_text), 2000)]
-            logger.info(f"
+            logger.info(f"Split text into {len(chunks)} chunks for processing")

             # Initialize aggregation variables
             sentiment_scores = []
@@ -114,40 +117,42 @@

             # Process each chunk
             for i, chunk in enumerate(chunks, 1):
+                logger.info(f"\n{'-'*30}")
                 logger.info(f"Processing chunk {i}/{len(chunks)}")
+                logger.info(f"Chunk length: {len(chunk)} characters")

                 try:
-                    # Get emotion scores
-                    logger.
+                    # Get emotion scores
+                    logger.info("Analyzing emotions...")
                     emotions = self.sentiment_pipeline(chunk)
                     logger.debug(f"Raw emotion response: {emotions}")

                     # Handle different response formats
                     if isinstance(emotions, list):
-                        # Multiple results format
                         for emotion in emotions:
                             if isinstance(emotion, dict) and 'label' in emotion and 'score' in emotion:
                                 sentiment_scores.append(emotion)
+                                logger.info(f"Detected emotion: {emotion['label']} (score: {emotion['score']:.3f})")
                     elif isinstance(emotions, dict) and 'label' in emotions and 'score' in emotions:
-                        # Single result format
                         sentiment_scores.append(emotions)
+                        logger.info(f"Detected emotion: {emotions['label']} (score: {emotions['score']:.3f})")

                     # Get toxicity scores if available
                     if self.toxicity_available:
-                        logger.
+                        logger.info("Analyzing toxicity...")
                         try:
                             toxicity = self.toxicity_pipeline(chunk)
                             if isinstance(toxicity, list):
                                 toxicity_scores.extend(toxicity)
                             else:
                                 toxicity_scores.append(toxicity)
-                            logger.
+                            logger.info(f"Toxicity analysis complete for chunk {i}")
+                            logger.debug(f"Toxicity scores: {toxicity_scores[-1]}")
                         except Exception as tox_error:
                             logger.warning(f"Toxicity analysis failed for chunk {i}: {str(tox_error)}")

                     # Get manipulation scores
-                    logger.
+                    logger.info("Analyzing manipulation patterns...")
                     manipulation = self.zero_shot(
                         chunk,
                         manipulation_categories,
@@ -155,13 +160,17 @@
                     )

                     if isinstance(manipulation, dict) and 'labels' in manipulation and 'scores' in manipulation:
+                        chunk_scores = {
                             label: score
                             for label, score in zip(manipulation['labels'], manipulation['scores'])
-                        }
+                        }
+                        manipulation_scores.append(chunk_scores)
+                        logger.info("Manipulation scores for chunk:")
+                        for label, score in chunk_scores.items():
+                            logger.info(f"  - {label}: {score:.3f}")

                         # Analyze sentences for manipulation
+                        logger.info("Analyzing individual sentences for manipulation...")
                         sentences = chunk.split('.')
                         for sentence in sentences:
                             if len(sentence.strip()) > 10:
@@ -172,6 +181,7 @@
                                 )
                                 if (sent_result['labels'][0] in ["emotional manipulation", "fear mongering", "propaganda"]
                                     and sent_result['scores'][0] > 0.7):
+                                    logger.info(f"Found manipulative content (score: {sent_result['scores'][0]:.3f}): {sentence.strip()}")
                                     flagged_phrases.append({
                                         'text': sentence.strip(),
                                         'type': sent_result['labels'][0],
@@ -182,7 +192,7 @@
                     logger.error(f"Error processing chunk {i}: {str(chunk_error)}")
                     continue

-            logger.info("
+            logger.info("\nAggregating final scores...")

             # Aggregate scores with error handling
             def aggregate_scores(scores_list, score_type: str):
@@ -222,8 +232,15 @@

             emotion_scores = aggregate_scores(sentiment_scores, "emotion")
             toxicity_scores = aggregate_scores(toxicity_scores, "toxicity") if self.toxicity_available else {}
-            logger.
+
+            logger.info("\nFinal emotion scores:")
+            for emotion, score in emotion_scores.items():
+                logger.info(f"  - {emotion}: {score:.3f}")
+
+            if toxicity_scores:
+                logger.info("\nFinal toxicity scores:")
+                for category, score in toxicity_scores.items():
+                    logger.info(f"  - {category}: {score:.3f}")

             # Aggregate manipulation scores
             manipulation_agg = {
@@ -232,9 +249,12 @@
                     for scores in manipulation_scores
                 ]))
                 for category in manipulation_categories
-                if manipulation_scores
+                if manipulation_scores
             }
+
+            logger.info("\nFinal manipulation scores:")
+            for category, score in manipulation_agg.items():
+                logger.info(f"  - {category}: {score:.3f}")

             # Calculate manipulation score based on multiple factors
             manipulation_indicators = {
@@ -263,7 +283,7 @@
                 # Fallback to traditional analysis if no scores available
                 manipulation_score = len(self._detect_manipulative_phrases(text)) * 10

-            logger.info(f"
+            logger.info(f"\nFinal manipulation score: {manipulation_score:.1f}")

             # Determine overall sentiment
             positive_emotions = ['admiration', 'joy', 'amusement', 'approval']
@@ -274,7 +294,10 @@
             neg_score = sum(emotion_scores.get(emotion, 0) for emotion in negative_emotions)
             neu_score = sum(emotion_scores.get(emotion, 0) for emotion in neutral_emotions)

-            logger.
+            logger.info(f"\nSentiment component scores:")
+            logger.info(f"  - Positive: {pos_score:.3f}")
+            logger.info(f"  - Negative: {neg_score:.3f}")
+            logger.info(f"  - Neutral: {neu_score:.3f}")

             # Determine sentiment based on highest score
             max_score = max(pos_score, neg_score, neu_score)
@@ -285,7 +308,7 @@
             else:
                 sentiment = "Neutral"

-            logger.info(f"
+            logger.info(f"\nFinal sentiment determination: {sentiment}")

             # Sort and limit flagged phrases by manipulation score
             sorted_phrases = sorted(flagged_phrases, key=lambda x: x['score'], reverse=True)
@@ -299,7 +322,9 @@
                 if len(unique_phrases) >= 5:
                     break

-            logger.info("
+            logger.info(f"\nFlagged {len(unique_phrases)} unique manipulative phrases")
+
+            logger.info("\nSentiment analysis completed successfully")

             return {
                 "sentiment": sentiment,