Spaces:
Sleeping
Sleeping
import gzip
import html
import json
import os
import re
from urllib.parse import quote, unquote

import numpy as np
import pandas as pd
import streamlit as st
# Page-wide stylesheet (plus a small helper script) injected through
# st.markdown(..., unsafe_allow_html=True).
#
# Tooltip handling: the resting state of the badge tooltip lives on `::after`
# (opacity 0) and the hover state on `:hover::after` (opacity 1), so the
# opacity transition has two distinct states to animate between. The badges
# are `position: relative` so the absolutely positioned tooltip anchors to
# the badge itself.
#
# NOTE(review): the <script> block is kept as-is; whether Streamlit executes
# scripts injected via st.markdown should be confirmed before relying on
# toggleClusterContent().
CUSTOM_CSS = """
<style>
    /* Set default background color */
    body {
        background-color: white !important;
    }
    .stApp {
        background-color: white !important;
    }
    h1 {
        color: #2E4053;
        font-family: 'Helvetica Neue', sans-serif;
        font-size: 2.8rem !important;
        border-bottom: 3px solid #3498DB;
        padding-bottom: 0.3em;
    }
    h2, h3, h4 {
        color: #2C3E50 !important;
        font-family: 'Arial Rounded MT Bold', sans-serif;
    }
    .metric-card {
        background: linear-gradient(145deg, #F8F9FA 0%, #FFFFFF 100%);
        border-radius: 12px;
        padding: 1.2rem;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
        border: 1px solid #E0E7FF;
        transition: transform 0.2s;
    }
    .metric-card:hover {
        transform: translateY(-2px);
    }
    .citation-badge::after,
    .influential-badge::after {
        content: attr(title);
        position: absolute;
        bottom: calc(100% + 5px);
        left: 50%;
        transform: translateX(-50%);
        background-color: rgba(0, 0, 0, 0.8);
        color: #fff;
        padding: 5px 10px;
        border-radius: 4px;
        white-space: nowrap;
        z-index: 100;
        opacity: 0;
        pointer-events: none;
        transition: opacity 0.3s ease;
    }
    .citation-badge:hover::after,
    .influential-badge:hover::after {
        opacity: 1;
    }
    .stButton>button {
        background: #3498DB !important;
        color: white !important;
        border-radius: 8px !important;
        padding: 8px 20px !important;
        border: none !important;
        transition: all 0.3s !important;
    }
    .stButton>button:hover {
        background: #2980B9 !important;
        transform: scale(1.05);
        box-shadow: 0 4px 8px rgba(52, 152, 219, 0.3);
    }
    .paper-card, .cluster-card {
        background: white;
        border-radius: 10px;
        padding: 1.5rem;
        margin: 1rem 0;
        box-shadow: 0 2px 8px rgba(0, 0, 0, 0.06);
        border: 1px solid #EAEDF3;
        overflow: hidden;
    }
    /* 调整标题的字号 - 增大cluster title */
    .paper-title, .cluster-title {
        color: #2C3E50;
        font-size: 1.3rem !important; /* 增大原来的字号 */
        font-weight: 700; /* 加粗 */
        margin-bottom: 0.5rem;
        cursor: pointer;
    }
    .paper-abstract, .cluster-abstract {
        color: #6C757D;
        line-height: 1.6;
        font-size: 0.95rem;
        margin: 1rem 0;
        padding: 0.8rem;
        background: #F9FAFB;
        border-radius: 8px;
        border-left: 4px solid #3498DB;
    }
    /* 减少expander之间的间距 */
    .streamlit-expanderHeader {
        font-weight: 600 !important;
        color: #2C3E50 !important;
        margin-top: 0.5rem !important;
        margin-bottom: 0.5rem !important;
    }
    /* 调整expander的内部和外部间距 */
    .streamlit-expander {
        margin-top: 0.5rem !important;
        margin-bottom: 0.5rem !important;
    }
    /* 更紧凑的expander内容区 */
    .streamlit-expanderContent {
        background: #FAFAFA;
        border-radius: 0 0 8px 8px;
        border: 1px solid #EAEDF3;
        border-top: none;
        padding: 8px 12px !important; /* 减少内部padding */
    }
    /* Additional styles */
    .paper-section, .cluster-section {
        margin-top: 20px;
        padding: 15px;
        border-radius: 8px;
        background: #FAFAFA;
        border-left: 4px solid #3498DB;
    }
    .paper-section-title, .cluster-section-title {
        color: #2C3E50;
        font-weight: 600;
        margin-bottom: 10px;
        border-bottom: 2px solid #EEE;
        padding-bottom: 5px;
    }
    .section-problem {
        border-left-color: #3498DB;
    }
    .section-solution {
        border-left-color: #2ECC71;
    }
    .section-results {
        border-left-color: #9B59B6;
    }
    .label {
        font-weight: 600;
        color: #34495E;
        margin-bottom: 5px;
    }
    .value-box {
        background: #F8F9FA;
        padding: 10px;
        border-radius: 5px;
        margin-bottom: 10px;
        font-size: 0.95rem;
        color: #333;
        line-height: 1.5;
    }
    /* Citation badge styles */
    .citation-badge, .influential-badge {
        position: relative;
        display: inline-flex;
        align-items: center;
        padding: 4px 8px;
        border-radius: 6px;
        font-size: 0.85rem;
        font-weight: 600;
        gap: 4px;
        white-space: nowrap;
    }
    .citation-badge {
        background: #EBF5FB;
        color: #2980B9;
    }
    .influential-badge {
        background: #FCF3CF;
        color: #F39C12;
    }
    .citation-icon, .influential-icon {
        font-size: 1rem;
    }
    /* 修改后的引用统计格式 */
    .citation-stats, .influential-stats {
        display: flex;
        align-items: center;
        padding: 4px 12px;
        border-radius: 6px;
        font-size: 0.85rem;
        margin-bottom: 6px;
        white-space: nowrap;
    }
    .citation-stats {
        background: #EBF5FB;
        color: #2980B9;
    }
    .influential-stats {
        background: #FCF3CF;
        color: #F39C12;
    }
    .stats-divider {
        margin: 0 6px;
        color: rgba(0,0,0,0.2);
    }
    /* Field of study badge */
    .field-badge {
        display: inline-block;
        background: #F1F8E9;
        color: #558B2F;
        padding: 3px 10px;
        border-radius: 16px;
        font-size: 0.75rem;
        font-weight: 500;
        border: 1px solid #C5E1A5;
    }
    /* JSON value display */
    .json-value {
        background: #F8F9FA;
        padding: 10px;
        border-radius: 6px;
        margin-bottom: 10px;
        white-space: pre-wrap;
        font-family: monospace;
        font-size: 0.9rem;
        line-height: 1.5;
        color: #2C3E50;
        overflow-x: auto;
    }
    /* Collapsible content */
    .cluster-content {
        display: none;
    }
    .cluster-content.show {
        display: block;
    }
    /* 重新设计集群标题区布局 */
    .cluster-header {
        display: flex;
        flex-wrap: wrap;
        justify-content: space-between;
        align-items: center;
        padding-bottom: 10px;
        border-bottom: 1px solid #eee;
        margin-bottom: 0px;
    }
    /* 左侧标题和集群信息 */
    .cluster-header-left {
        display: flex;
        align-items: center;
        flex: 1;
        min-width: 200px;
    }
    /* 中间区域用于摘要展开器 */
    .cluster-header-middle {
        display: flex;
        flex: 0 0 auto;
        margin: 0 15px;
    }
    /* 右侧统计数据 */
    .cluster-badge-container {
        display: flex;
        flex-wrap: wrap;
        gap: 6px;
        justify-content: flex-end;
    }
    /* 子集群查看按钮 */
    .view-button {
        margin-left: 15px;
    }
    /* 调整h3标题的上下margin */
    h3 {
        margin-top: 1rem !important;
        margin-bottom: 0.5rem !important;
    }
    /* 调整内容区块的上下margin */
    .stBlock {
        margin-top: 0.5rem !important;
        margin-bottom: 0.5rem !important;
    }
    /* 内联expander按钮样式 */
    .inline-expander-button {
        background: #E3F2FD;
        border: 1px solid #BBDEFB;
        border-radius: 4px;
        padding: 4px 8px;
        font-size: 0.85rem;
        color: #1976D2;
        cursor: pointer;
        display: inline-flex;
        align-items: center;
        transition: all 0.2s;
    }
    .inline-expander-button:hover {
        background: #BBDEFB;
    }
    /* 导航路径中的按钮样式 */
    .path-nav-button {
        display: inline-block;
        margin: 0 5px;
        padding: 5px 10px;
        background: #E3F2FD;
        border-radius: 5px;
        color: #1976D2;
        cursor: pointer;
        font-weight: 500;
        font-size: 0.9rem;
        border: none;
        transition: all 0.2s;
    }
    .path-nav-button:hover {
        background: #BBDEFB;
    }
    /* 路径导航容器样式 */
    .path-nav {
        color: #6C757D;
        font-size: 0.95rem;
        padding: 0.8rem 1rem;
        background: #F8F9FA;
        border-radius: 8px;
        margin: 0.8rem 0;
    }
    /* Paper count badge style */
    .paper-count-badge {
        display: inline-flex;
        align-items: center;
        margin-left: 12px;
        background: #E8F4FD;
        color: #2980B9;
        padding: 3px 8px;
        border-radius: 12px;
        font-size: 0.85rem;
        font-weight: 500;
    }
</style>
<script>
    function toggleClusterContent(id) {
        const content = document.getElementById('cluster-content-' + id);
        if (content) {
            content.classList.toggle('show');
        }
    }
</script>
"""
def get_hierarchy_files():
    """Return the hierarchy file names found in the ``hierarchies`` directory.

    Plain ``*.json`` files and gzip-compressed ``*.json.gz`` files are both
    reported. A compressed file is listed under its ``.json`` name, because
    ``load_hierarchy_data`` falls back to ``<name>.gz`` when the plain file
    does not exist. Names are de-duplicated and sorted so the selection list
    has a stable order.

    Returns:
        A sorted list of ``*.json`` file names (without directory), or an
        empty list when the directory does not exist.
    """
    hierarchy_dir = 'hierarchies'
    if not os.path.isdir(hierarchy_dir):
        return []

    names = set()
    for entry in os.listdir(hierarchy_dir):
        if entry.endswith('.json'):
            names.add(entry)
        elif entry.endswith('.json.gz'):
            # Report the logical name; the loader appends '.gz' itself.
            names.add(entry[:-len('.gz')])
    return sorted(names)
def parse_filename(filename):
    """Parse a hierarchy file name into its metadata fields.

    The name is split on ``_`` after removing ``.json``. The fields are read
    positionally: ``parts[1]`` date, ``parts[2]`` embedder, ``parts[3]``
    summarizer, ``parts[4]`` cluster method, ``parts[6]`` contribution type,
    then the building method, the ``-``-separated cluster levels and the
    random seed. The building methods ``top_down`` and ``bottom_up`` contain
    the separator themselves and therefore span two parts.

    Args:
        filename: Hierarchy file name, with or without ``.json``.

    Returns:
        A dict with the keys ``date``, ``embedder``, ``summarizer``,
        ``clustermethod``, ``contribution_type``, ``building_method``,
        ``clusterlevel``, ``clusterlevel_array``, ``level_count`` and
        ``random_seed``. Missing trailing fields are ``'Unknown'``; a name
        with fewer than seven parts yields ``'Unknown'`` for every field.
    """
    parts = filename.replace('.json', '').split('_')

    # parts[6] is read below, so at least seven parts are required.
    if len(parts) < 7:
        return {
            'date': 'Unknown',
            'embedder': 'Unknown',
            'summarizer': 'Unknown',
            'clustermethod': 'Unknown',
            'contribution_type': 'Unknown',
            'building_method': 'Unknown',
            'clusterlevel': 'Unknown',
            'clusterlevel_array': [],
            'level_count': 0,
            'random_seed': 'Unknown'
        }

    date_str, embedder, summarizer, clustermethod = parts[1:5]
    # parts[5] is typically an "emb" placeholder and is ignored.
    contribution_type = parts[6]

    building_method = None
    clusterlevel_str = None
    seed = None
    if len(parts) > 7:
        compound = '_'.join(parts[7:9])
        if compound in ('top_down', 'bottom_up'):
            # Compound method names occupy two parts.
            building_method = compound
            level_idx = 9
        else:
            building_method = parts[7]
            level_idx = 8
        if len(parts) > level_idx:
            clusterlevel_str = parts[level_idx]
        if len(parts) > level_idx + 1:
            seed = parts[level_idx + 1]

    # YYYYMMDD -> YYYY/MM/DD; anything else is shown verbatim.
    if len(date_str) == 8:
        formatted_date = f"{date_str[:4]}/{date_str[4:6]}/{date_str[6:]}"
    else:
        formatted_date = date_str

    clusterlevel_array = clusterlevel_str.split('-') if clusterlevel_str else []

    return {
        'date': formatted_date,
        'embedder': embedder,
        'summarizer': summarizer,
        'clustermethod': clustermethod,
        'contribution_type': contribution_type,
        'building_method': building_method or 'Unknown',
        'clusterlevel': clusterlevel_str or 'Unknown',
        'clusterlevel_array': clusterlevel_array,
        'level_count': len(clusterlevel_array),
        'random_seed': seed or 'Unknown'
    }
def format_hierarchy_option(filename):
    """Build the human-readable label shown for a hierarchy file.

    The label is assembled from the metadata returned by ``parse_filename``:
    date and cluster method first, then the remaining fields in parentheses.
    """
    meta = parse_filename(filename)
    levels = "×".join(meta['clusterlevel_array'])
    details = ", ".join([
        f"{meta['embedder']}/{meta['summarizer']}",
        f"{meta['contribution_type']}",
        f"{meta['building_method']}",
        f"{meta['level_count']} levels: {levels}",
        f"seed: {meta['random_seed']}",
    ])
    return f"{meta['date']} - {meta['clustermethod']} ({details})"
def load_hierarchy_data(filename):
    """Load a hierarchy JSON document from the ``hierarchies`` directory.

    The plain file ``hierarchies/<filename>`` is tried first; if it does not
    exist, the gzip-compressed ``hierarchies/<filename>.gz`` is tried. Both
    are decoded as UTF-8 explicitly so the result does not depend on the
    platform's default locale encoding.

    Args:
        filename: File name of the hierarchy (normally ending in ``.json``).

    Returns:
        The parsed JSON content. When neither file exists, or the file that
        exists cannot be read or parsed, an error is shown via ``st.error``
        and ``{"clusters": []}`` is returned.
    """
    filepath = os.path.join('hierarchies', filename)
    gzip_filepath = filepath + '.gz'

    # (path, opener, label used in the error message), in lookup order.
    candidates = (
        (filepath, open, "file"),
        (gzip_filepath, gzip.open, "compressed file"),
    )
    for candidate, opener, label in candidates:
        if not os.path.exists(candidate):
            continue
        try:
            with opener(candidate, 'rt', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            # UI boundary: report the problem in the page instead of
            # crashing the app, and fall back to an empty hierarchy.
            st.error(f"Error loading {label} {candidate}: {str(e)}")
            return {"clusters": []}

    st.error(f"Could not find hierarchy file: {filepath} or {gzip_filepath}")
    return {"clusters": []}
def get_cluster_statistics(clusters):
    """Summarize paper counts for the clusters of one level.

    Args:
        clusters: Sequence of ``(cluster, path)`` pairs; only the cluster
            node of each pair is used.

    Returns:
        A dict mapping each statistic's display name to
        ``{'value': ..., 'tooltip': ...}``, where the tooltip is the hover
        text for that statistic. All paper statistics are 0 when
        ``clusters`` is empty.
    """
    def papers_under(node):
        # A node whose first child carries "paper_id" holds papers directly;
        # otherwise its children are sub-clusters and are counted recursively.
        kids = node["children"] if "children" in node else None
        if not kids:
            return 0
        if "paper_id" in kids[0]:
            return len(kids)
        return sum(papers_under(kid) for kid in kids)

    n_clusters = len(clusters)
    sizes = [papers_under(cluster) for cluster, _ in clusters]

    if sizes:
        total = sum(sizes)
        values = (
            n_clusters,
            total,
            round(total / n_clusters, 2),
            round(np.median(sizes), 2),
            round(np.std(sizes), 2),
            max(sizes),
            min(sizes),
        )
    else:
        values = (n_clusters, 0, 0, 0, 0, 0, 0)

    labels = (
        ('Total Clusters', 'Total number of clusters at this level'),
        ('Total Papers', 'Total number of papers across all clusters at this level'),
        ('Average Papers per Cluster', 'Average number of papers per cluster'),
        ('Median Papers', 'Median number of papers per cluster'),
        ('Standard Deviation', 'Standard deviation of paper counts across clusters'),
        ('Max Papers in Cluster', 'Maximum number of papers in any single cluster'),
        ('Min Papers in Cluster', 'Minimum number of papers in any single cluster'),
    )
    return {
        name: {'value': value, 'tooltip': tooltip}
        for (name, tooltip), value in zip(labels, values)
    }
def calculate_citation_metrics(node):
    """Aggregate citation counts over every paper below a cluster node.

    The tree is walked recursively. A node whose first child is a dict with
    a ``paper_id`` key is treated as holding papers directly; any other
    node's dict children are treated as sub-clusters. Each paper contributes
    ``semantic_scholar['citationCount']`` and
    ``semantic_scholar['influentialCitationCount']``; a missing or ``None``
    ``semantic_scholar`` entry or count is treated as 0.

    Args:
        node: Cluster dict, optionally with a ``children`` list.

    Returns:
        A dict with ``total_citations``, ``avg_citations``,
        ``max_citations``, ``total_influential_citations``,
        ``avg_influential_citations``, ``max_influential_citations`` and
        ``paper_count``. Averages are rounded to two decimals; every value
        is 0 when no papers are found.
    """
    citation_values = []              # citation count of each paper
    influential_citation_values = []  # influential citation count of each paper

    def process_node(n):
        children = n.get("children")
        if not children:
            return
        if isinstance(children[0], dict) and "paper_id" in children[0]:
            # This node contains papers directly.
            for paper in children:
                if not isinstance(paper, dict):
                    continue
                semantic_scholar = paper.get('semantic_scholar') or {}
                # `or 0` also covers counts that are present but null.
                citation_values.append(semantic_scholar.get('citationCount') or 0)
                influential_citation_values.append(
                    semantic_scholar.get('influentialCitationCount') or 0
                )
        else:
            # Recurse into child clusters.
            for child in children:
                if isinstance(child, dict):
                    process_node(child)

    process_node(node)

    paper_count = len(citation_values)
    total_citations = sum(citation_values)
    total_influential_citations = sum(influential_citation_values)

    return {
        'total_citations': total_citations,
        'avg_citations': round(total_citations / paper_count, 2) if paper_count else 0,
        'max_citations': max(citation_values, default=0),
        'total_influential_citations': total_influential_citations,
        'avg_influential_citations': (
            round(total_influential_citations / paper_count, 2) if paper_count else 0
        ),
        'max_influential_citations': max(influential_citation_values, default=0),
        'paper_count': paper_count
    }
def find_clusters_in_path(data, path):
    """Return the items located directly below ``path`` in the hierarchy.

    Args:
        data: Hierarchy dict with a top-level ``clusters`` list.
        path: List of ``cluster_id`` values leading from the top level down
            to the cluster whose children are wanted; empty for the top level.

    Returns:
        A list of ``(item, item_path)`` tuples. For an empty path these are
        the top-level clusters with an empty path. If the target's children
        are papers (first child has ``paper_id``), each paper is paired with
        ``path``. Otherwise each dict child that has a ``cluster_id`` is
        paired with ``path + [cluster_id]``. An empty list is returned when
        ``data`` is unusable, a path segment is not found, or a cluster on
        the path has no children.
    """
    if not data or "clusters" not in data:
        return []

    level = data["clusters"]
    if not path:
        return [(cluster, []) for cluster in level]

    # Walk down one level per path segment.
    for segment in path:
        match = next(
            (node for node in level if node.get("cluster_id") == segment),
            None,
        )
        if match is None:
            return []
        if "children" not in match or not match["children"]:
            return []
        level = match["children"]

    first = level[0]
    if isinstance(first, dict) and "paper_id" in first:
        # The target level holds papers.
        return [(paper, path) for paper in level]

    # The target level holds sub-clusters.
    return [
        (child, path + [child.get("cluster_id")])
        for child in level
        if isinstance(child, dict) and child.get("cluster_id") is not None
    ]
def parse_json_abstract(abstract_text):
    """Turn a JSON-encoded structured abstract into an HTML fragment.

    When ``abstract_text`` is a JSON object whose ``"Problem"`` entry is
    itself an object, an HTML block showing its ``overarching problem
    domain``, ``challenges/difficulties`` and ``research question/goal``
    fields is returned (``N/A`` for missing fields). Field values are
    HTML-escaped because the fragment is rendered as raw HTML by callers.

    Args:
        abstract_text: Abstract string, possibly JSON.

    Returns:
        The formatted HTML fragment, or ``abstract_text`` unchanged when it
        is not valid JSON or does not have the expected structure.
    """
    try:
        abstract_json = json.loads(abstract_text)
    except (json.JSONDecodeError, ValueError, TypeError):
        # Not valid JSON (or not a string at all): keep the original text.
        return abstract_text

    problem = abstract_json.get("Problem") if isinstance(abstract_json, dict) else None
    if not isinstance(problem, dict):
        # Valid JSON but without a structured "Problem" object.
        return abstract_text

    def field(key):
        return html.escape(str(problem.get(key, 'N/A')))

    return f"""
    <div class='section-problem paper-section'>
        <div class='paper-section-title'>Problem</div>
        <div class='label'>Domain:</div>
        <div class='value-box'>{field('overarching problem domain')}</div>
        <div class='label'>Challenges:</div>
        <div class='value-box'>{field('challenges/difficulties')}</div>
        <div class='label'>Goal:</div>
        <div class='value-box'>{field('research question/goal')}</div>
    </div>
    """
def display_path_details(path, data, level_count):
    """Render the "Path Details" panel for the currently selected path.

    For every ``cluster_id`` in ``path`` the matching cluster is looked up
    among the children of the previous one (starting at
    ``data["clusters"]``) and rendered as a title row, a ``Level N`` button
    and a collapsed expander with the cluster abstract. Clicking the button
    truncates ``st.session_state.path`` to the segments above that cluster,
    rewrites the query parameters accordingly and reruns the app.

    Args:
        path: List of cluster ids from the top level down.
        data: Hierarchy dict with a top-level ``clusters`` list.
        level_count: Not used by this function; kept for callers.
    """
    if not path:
        return

    st.markdown("### Path Details")
    current = data["clusters"]

    for i, cluster_id in enumerate(path):
        level_number = i + 1  # levels are numbered from 1; the top level is Level 1
        indent = i * 32       # 32 pixels of indentation per level

        for c in current:
            if c["cluster_id"] != cluster_id:
                continue

            # Indentation placeholder followed by extra vertical spacing.
            st.markdown(f"""
                <div style='margin-left: {indent}px; margin-bottom: 10px;'>
                </div>
            """, unsafe_allow_html=True)
            st.markdown("<div style='margin-bottom: 25px;'></div>", unsafe_allow_html=True)

            # Row with the cluster name on the left and the level button on the right.
            col1, col2 = st.columns([0.85, 0.15])
            with col1:
                st.markdown(f"""
                    <div style='display: flex; align-items: center;'>
                        <div style='width: 12px; height: 12px;
                                    border-radius: 50%; background: #3B82F6;
                                    margin-right: 8px;'></div>
                        <h4 style='font-size: 1.15rem; font-weight: 600;
                                   color: #1F2937; margin: 0;'>
                            Cluster {c["cluster_id"]}: {c["title"]}
                        </h4>
                    </div>
                """, unsafe_allow_html=True)
            with col2:
                button_clicked = st.button(
                    f'Level {level_number}',
                    key=f'level_btn_{i}_{c["cluster_id"]}',
                )
                if button_clicked:
                    st.session_state.path = path[:i]
                    new_params = {}
                    hierarchy = st.query_params.get('hierarchy')
                    if hierarchy is not None:
                        new_params['hierarchy'] = hierarchy
                    if st.session_state.path:
                        # Assign the whole list at once: setting the key once
                        # per element would overwrite it and keep only the
                        # last path segment.
                        new_params['path'] = [str(segment) for segment in st.session_state.path]
                    st.query_params.clear()
                    for key, value in new_params.items():
                        st.query_params[key] = value
                    st.rerun()

            with st.container():
                st.markdown(f"""
                    <div style='margin-left: {indent}px; width: calc(100% - {indent}px);'>
                    </div>
                """, unsafe_allow_html=True)
                # No `key` argument is passed to the expander on purpose.
                with st.expander("📄 Show Cluster Details", expanded=False):
                    # The abstract may be JSON; format it if so.
                    abstract_content = parse_json_abstract(c["abstract"])
                    st.markdown(f"""
                        <div style='color: #374151; line-height: 1.6;'>
                            {abstract_content}
                        </div>
                    """, unsafe_allow_html=True)

            # Descend; a cluster without children ends the walk gracefully.
            current = c.get("children") or []
            break
def _paper_text(value):
    """Return ``value`` as HTML-escaped text for interpolation into markup."""
    return html.escape(str(value))


def _render_paper_heading(title, accent, margin_top=25):
    """Render one section heading of the paper details with an accent underline."""
    st.markdown(f"""
        <div style='margin-top: {margin_top}px; margin-bottom: 20px;'>
            <h4 style='color: #2C3E50; border-bottom: 2px solid {accent}; padding-bottom: 8px;'>
                {title}
            </h4>
        </div>
    """, unsafe_allow_html=True)


def _render_paper_fields(fields, rows):
    """Render ``rows`` of ``(label, key, accent)`` as a label column and a value column."""
    label_col, value_col = st.columns([1, 2])
    with label_col:
        for position, (label, _key, _accent) in enumerate(rows):
            spacing = "" if position == 0 else "margin-top: 15px; "
            st.markdown(f"""
                <div style='font-weight: 600; color: #34495E; {spacing}margin-bottom: 5px;'>
                    {label}
                </div>
            """, unsafe_allow_html=True)
    with value_col:
        for position, (_label, key, accent) in enumerate(rows):
            spacing = "" if position == 0 else " margin-top: 10px;"
            st.markdown(f"""
                <div style='background: #F8F9FA; padding: 10px; border-radius: 5px;
                            border-left: 4px solid {accent};{spacing}'>
                    {_paper_text(fields.get(key, 'Not specified'))}
                </div>
            """, unsafe_allow_html=True)


def display_paper(item):
    """Render one paper: a summary card plus a collapsed details expander.

    The card shows the title, a link to the paper (only when a URL is
    available) and the citation / influential-citation counts from
    ``item['semantic_scholar']``. The expander shows the abstract, the
    ``problem``, ``solution`` and ``results`` sections when present, and the
    author list. All values taken from ``item`` are HTML-escaped before
    being embedded, since the markup is rendered with
    ``unsafe_allow_html=True``.

    Args:
        item: Paper dict. Missing or ``None`` ``semantic_scholar`` data and
            counts fall back to empty / 0.
    """
    semantic_scholar = item.get('semantic_scholar') or {}
    url = semantic_scholar.get('url') or ''
    citation_count = semantic_scholar.get('citationCount') or 0
    influential_citation_count = semantic_scholar.get('influentialCitationCount') or 0

    title_html = _paper_text(item.get('title', 'Untitled Paper'))
    # Only emit the link when there is a target; an empty href would just
    # reload the app in a new tab.
    link_html = (
        f" <a href=\"{html.escape(url, quote=True)}\" target=\"_blank\" "
        "style='font-size: 0.9em; margin-left: 8px; color: #3498DB; "
        "text-decoration: none; transition: all 0.3s;' "
        "title='View paper on Semantic Scholar'>🔗</a>"
    ) if url else ""

    # Always-visible card. The template must not contain blank lines, so
    # the optional link shares a line with the title.
    st.markdown(f"""
        <div class='paper-card'>
            <div style='display: flex; justify-content: space-between; align-items: flex-start;'>
                <div class='paper-title' style='flex-grow: 1;'>
                    {title_html}{link_html}
                </div>
                <div style='display: flex; align-items: center; gap: 12px;'>
                    <div class='citation-badge' title='Number of times this paper has been cited by other papers.'>
                        <span class='citation-icon'>⭐</span> Citations: {citation_count}
                    </div>
                    <div class='influential-badge' title='Number of times this paper has been cited by influential papers. Influential citation means that the cited publication has a significant impact on the citing publication.'>
                        <span class='influential-icon'>🔥</span> Influential Citations: {influential_citation_count}
                    </div>
                </div>
            </div>
    """, unsafe_allow_html=True)

    # (item key, heading, heading accent, rows of (label, field key, accent))
    detail_sections = (
        ('problem', "🔍 Problem Details", "#3498DB", (
            ("Problem Domain", 'overarching problem domain', "#3498DB"),
            ("Challenges/Difficulties", 'challenges/difficulties', "#E74C3C"),
            ("Research Question/Goal", 'research question/goal', "#2ECC71"),
        )),
        ('solution', "💡 Solution Details", "#2ECC71", (
            ("Solution Domain", 'overarching solution domain', "#3498DB"),
            ("Solution Approach", 'solution approach', "#9B59B6"),
            ("Novelty of Solution", 'novelty of the solution', "#F1C40F"),
        )),
        ('results', "📊 Results Details", "#9B59B6", (
            ("Findings/Results", 'findings/results', "#3498DB"),
            ("Potential Impact", 'potential impact of the results', "#E67E22"),
        )),
    )

    # One expander for all detailed information, collapsed by default.
    with st.expander("📑 Show Detailed Information", expanded=False):
        _render_paper_heading("📄 Abstract", "#3498DB", margin_top=15)
        abstract_text = item.get('abstract') or 'No abstract available'
        st.markdown(
            f"<div class='paper-abstract'>{_paper_text(abstract_text)}</div>",
            unsafe_allow_html=True,
        )

        for section_key, heading, accent, rows in detail_sections:
            section = item.get(section_key)
            # Sections are optional; only structured (dict) sections are shown.
            if not section or not isinstance(section, dict):
                continue
            _render_paper_heading(heading, accent)
            _render_paper_fields(section, rows)

        authors = semantic_scholar.get('authors') or []
        if authors:
            _render_paper_heading("👥 Authors", "#E67E22")
            for author in authors:
                if not isinstance(author, dict):
                    continue
                st.markdown(f"""
                    <div style='display: flex; margin-bottom: 15px; padding-bottom: 10px; border-bottom: 1px solid #eee;'>
                        <div style='flex: 1;'>
                            <div style='font-weight: 600; font-size: 1.05rem;'>{_paper_text(author.get('name', 'Unknown'))}</div>
                            <div style='color: #666; margin-top: 3px;'>Author ID: {_paper_text(author.get('authorId', 'N/A'))}</div>
                        </div>
                        <div style='display: flex; gap: 15px;'>
                            <div title='Papers'>
                                <span style='font-size: 0.85rem; color: #666;'>Papers</span>
                                <div style='font-weight: 600; color: #3498DB;'>{_paper_text(author.get('paperCount', 0))}</div>
                            </div>
                            <div title='Citations'>
                                <span style='font-size: 0.85rem; color: #666;'>Citations</span>
                                <div style='font-weight: 600; color: #3498DB;'>{_paper_text(author.get('citationCount', 0))}</div>
                            </div>
                            <div title='h-index'>
                                <span style='font-size: 0.85rem; color: #666;'>h-index</span>
                                <div style='font-weight: 600; color: #3498DB;'>{_paper_text(author.get('hIndex', 0))}</div>
                            </div>
                        </div>
                    </div>
                """, unsafe_allow_html=True)

    # Closing tag for the paper-card div opened in the card markup above.
    # NOTE(review): each st.markdown call is emitted separately, so this may
    # not actually wrap the expander inside the card — confirm in the browser.
    st.markdown("</div>", unsafe_allow_html=True)
def display_cluster(item, path):
    """Render one cluster row: title, paper count, citation stats, summary and drill-down button.

    Args:
        item: Cluster dict with ``cluster_id``, ``title``, ``abstract`` and
            optionally ``children``.
        path: Path associated with this cluster; used only to build a unique
            widget key for the button.
    """
    # Widget key that is unique per cluster and path.
    path_suffix = '-'.join(map(str, path))
    unique_id = f"{item['cluster_id']}_{path_suffix}"

    metrics = calculate_citation_metrics(item)
    summary_html = parse_json_abstract(item['abstract'])

    # The button label depends on whether the children are papers or sub-clusters.
    children = item["children"] if "children" in item else None
    has_children = bool(children)
    if has_children:
        if "paper_id" in children[0]:
            btn_text = f"View Papers ({metrics['paper_count']})"
        else:
            btn_text = f"View Sub-clusters ({len(children)})"

    # Title and paper count, kept on the same horizontal line.
    st.markdown(f"""
        <div style='display: flex; align-items: center;'>
            <div class='cluster-title' style='margin: 0; font-weight: 700; font-size: 1.3rem;'>
                {item['title']}
            </div>
            <div style='display: inline-flex; align-items: center; margin-left: 12px;
                        background: #F4F6F9; color: #566573; padding: 2px 10px;
                        border-radius: 6px; font-size: 0.95rem; font-weight: 500;'>
                <span style='margin-right: 4px;'>📑</span>{metrics['paper_count']} papers
            </div>
        </div>
    """, unsafe_allow_html=True)

    # Two-column layout: statistics and summary on the left, button on the right.
    stats_col, action_col = st.columns([8, 2])

    with stats_col:
        # Citation statistics, separated by pipe characters.
        st.markdown(f"""
            <div>
                <div class='citation-stats'>
                    <span style='font-weight: bold; margin-right: 5px;'>⭐</span> Citations:
                    Total {metrics['total_citations']} <span class='stats-divider'>|</span>
                    Avg {metrics['avg_citations']} <span class='stats-divider'>|</span>
                    Max {metrics['max_citations']}
                </div>
                <div class='influential-stats'>
                    <span style='font-weight: bold; margin-right: 5px;'>🔥</span> Influential Citations:
                    Total {metrics['total_influential_citations']} <span class='stats-divider'>|</span>
                    Avg {metrics['avg_influential_citations']} <span class='stats-divider'>|</span>
                    Max {metrics['max_influential_citations']}
                </div>
            </div>
        """, unsafe_allow_html=True)
        # Collapsed expander holding the cluster summary.
        with st.expander("📄 Cluster Summary", expanded=False):
            st.markdown(f"""
                <div class='cluster-abstract'>{summary_html}</div>
            """, unsafe_allow_html=True)

    with action_col:
        # Drill-down button, only when there is something below this cluster.
        if has_children and st.button(btn_text, key=f"btn_{unique_id}"):
            st.session_state.path.append(item['cluster_id'])
            st.rerun()

    # Separator line below the cluster.
    st.markdown("<hr style='margin: 0.5rem 0; border-color: #eee;'>", unsafe_allow_html=True)
def _metric_card_html(label, value, *, compact=False, card_style=""):
    """Return the HTML fragment for a single ``metric-card`` tile.

    Args:
        label: Heading text placed in the card's ``<h4>``.
        value: Content placed in the card's ``<p>``. It is interpolated
            as-is (no HTML escaping), exactly like the inline markup this
            helper replaces.
        compact: When true, use the tighter heading/value styling of the
            cluster-statistics grid; otherwise use the metadata styling.
        card_style: Optional inline CSS for the outer ``<div>``.

    Returns:
        A string of HTML without blank lines, so several cards can be joined
        with ``"\\n"`` and passed to a single ``st.markdown`` call.
    """
    if compact:
        heading_style = "margin-top: 0; margin-bottom: 5px; color: #2C3E50; font-size: 0.85rem;"
        value_style = "font-size: 0.9rem; font-weight: 600; color: #3498DB; margin: 0;"
    else:
        heading_style = "margin-top: 0; color: #2C3E50; font-size: 0.9rem;"
        value_style = "font-size: 0.9rem; font-weight: 600; color: #3498DB;"
    style_attr = f" style='{card_style}'" if card_style else ""
    return (
        f"<div class='metric-card'{style_attr}>\n"
        f"<h4 style='{heading_style}'>{label}</h4>\n"
        f"<p style='{value_style}'>{value}</p>\n"
        f"</div>"
    )


def main():
    """Render the "Paper Clusters Explorer" Streamlit page.

    The page is built top to bottom on every Streamlit rerun:

    1. Configure the page and inject the custom CSS.
    2. Let the user pick a hierarchy file; the choice is mirrored in the
       ``hierarchy`` query parameter.
    3. Show the hierarchy metadata parsed from the file name.
    4. Restore / maintain the drill-down path in ``st.session_state.path``.
    5. Show cluster statistics, back/breadcrumb navigation and finally the
       clusters or papers at the current path.

    Returns:
        None. Returns early (after showing an error) when no hierarchy files
        are available.
    """
    st.set_page_config(
        layout="wide",
        page_title="Paper Clusters Explorer",
        initial_sidebar_state="expanded",
        menu_items=None
    )

    # Try to force the light theme on the app container.
    # NOTE(review): browsers usually do not execute <script> tags inserted as
    # HTML by st.markdown, so this may have no effect - confirm before relying
    # on it.
    st.markdown("""
    <script>
    var elements = window.parent.document.querySelectorAll('.stApp');
    elements[0].classList.add('light');
    elements[0].classList.remove('dark');
    </script>
    """, unsafe_allow_html=True)
    st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

    hierarchy_files = get_hierarchy_files()
    if not hierarchy_files:
        st.error("No hierarchy files found in /hierarchies directory")
        return

    # Manage file selection via query params.
    current_url = st.query_params.get('hierarchy', None)
    current_file = unquote(current_url) + '.json' if current_url else None
    hierarchy_options = {format_hierarchy_option(f): f for f in hierarchy_files}
    option_files = list(hierarchy_options.values())
    # The query parameter is user-controlled (stale bookmark, hand-edited
    # URL); fall back to the first option instead of letting list.index()
    # raise ValueError for an unknown file.
    default_index = option_files.index(current_file) if current_file in option_files else 0
    selected_option = st.selectbox(
        'Select Hierarchy',
        options=list(hierarchy_options.keys()),
        index=default_index
    )
    selected_file = hierarchy_options[selected_option]

    # Save selected file in query params.
    if selected_file != current_file:
        st.query_params['hierarchy'] = quote(selected_file.replace('.json', ''))

    # A drill-down path only makes sense for the hierarchy it was built in.
    # When the user switches to another file, start again from the root.
    # On the very first run nothing is remembered yet, so a path supplied via
    # query params is still honoured below.
    previous_file = st.session_state.get('_active_hierarchy')
    if previous_file is not None and previous_file != selected_file:
        st.session_state.path = []
    st.session_state['_active_hierarchy'] = selected_file

    data = load_hierarchy_data(selected_file)
    info = parse_filename(selected_file)

    # Hierarchy metadata: three columns with two cards each.
    with st.expander("📋 Hierarchy Metadata", expanded=False):
        metadata_columns = (
            (
                ("Date", info['date']),
                ("Clustering Method", info['clustermethod']),
            ),
            (
                ("Embedder / Summarizer", f"{info['embedder']} / {info['summarizer']}"),
                ("Contribution Type", info['contribution_type']),
            ),
            (
                ("Building Method", info['building_method']),
                ("Cluster Levels", f"{info['clusterlevel']} (Total: {info['level_count']})"),
            ),
        )
        for column, (top_card, bottom_card) in zip(st.columns(3), metadata_columns):
            with column:
                st.markdown(
                    "\n".join((
                        _metric_card_html(*top_card),
                        _metric_card_html(*bottom_card, card_style="margin-top: 10px;"),
                    )),
                    unsafe_allow_html=True,
                )

    # Initialise the navigation path once per session from the URL.
    if 'path' not in st.session_state:
        path_params = st.query_params.get_all('path')
        st.session_state.path = [p for p in path_params if p]

    current_clusters = find_clusters_in_path(data, st.session_state.path)
    current_level = len(st.session_state.path)
    total_levels = info['level_count']
    level_name = f'Level {current_level + 1}' if current_level < total_levels else 'Papers'
    # Paper level is reached either by depth or when the first item at the
    # current path carries a "paper_id" key.
    is_paper_level = current_level >= total_levels or (current_clusters and "paper_id" in current_clusters[0][0])

    if not is_paper_level and current_clusters:
        with st.expander("📊 Cluster Statistics", expanded=False):
            stats = get_cluster_statistics(current_clusters)
            # 3x2 grid of small metric cards: (card heading, key in `stats`).
            stat_rows = (
                (
                    ("Total Clusters", "Total Clusters"),
                    ("Total Papers", "Total Papers"),
                    ("Avg Papers/Cluster", "Average Papers per Cluster"),
                ),
                (
                    ("Median Papers", "Median Papers"),
                    ("Max Papers in Cluster", "Max Papers in Cluster"),
                    ("Min Papers in Cluster", "Min Papers in Cluster"),
                ),
            )
            # The second row gets extra bottom margin.
            row_styles = ("padding: 0.8rem;", "padding: 0.8rem; margin-bottom: 15px;")
            # Create both rows of columns before filling them, as before.
            grid = [st.columns(3) for _ in stat_rows]
            for row_columns, row_cards, row_style in zip(grid, stat_rows, row_styles):
                for column, (heading, stat_key) in zip(row_columns, row_cards):
                    with column:
                        st.markdown(
                            _metric_card_html(
                                heading,
                                stats[stat_key]['value'],
                                compact=True,
                                card_style=row_style,
                            ),
                            unsafe_allow_html=True,
                        )

    # Back navigation button.
    if st.session_state.path:
        if st.button('← Back', key='back_button'):
            st.session_state.path.pop()
            st.rerun()

    # Current path display.
    if st.session_state.path:
        # Collect (level number, title, cluster id) for every cluster on the
        # path by walking down the hierarchy one level per path entry.
        path_info = []
        current = data["clusters"]
        for depth, cid in enumerate(st.session_state.path, start=1):
            for c in current:
                if c["cluster_id"] == cid:
                    path_info.append((depth, c["title"], c["cluster_id"]))
                    current = c.get("children", [])
                    break

        with st.container():
            st.markdown("<h3 style='margin-top: 0.5rem; margin-bottom: 0.8rem;'>🗂️ Current Path</h3>", unsafe_allow_html=True)
            # Root entry: jumps back to the top of the hierarchy.
            col1, col2 = st.columns([0.3, 0.7])
            with col1:
                st.markdown("<div><strong>Root:</strong></div>", unsafe_allow_html=True)
            with col2:
                if st.button("All Papers", key="root_button"):
                    st.session_state.path = []
                    st.rerun()
            # One row per level on the path; each title is a button that
            # truncates the path back to that level.
            for i, (level_num, title, cluster_id) in enumerate(path_info):
                col1, col2 = st.columns([0.3, 0.7])
                with col1:
                    st.markdown(f"<div><strong>Level {level_num}:</strong></div>", unsafe_allow_html=True)
                with col2:
                    if st.button(f"{title}", key=f"lvl_{i}_{cluster_id}"):
                        # Slice by the real depth rather than the index into
                        # path_info: the two differ if an id on the path was
                        # not found while building path_info.
                        st.session_state.path = st.session_state.path[:level_num]
                        st.rerun()

    # Heading for the content list.
    st.markdown(f"""
    <h3 style='margin: 1rem 0 0.5rem 0; color: #2C3E50;'>
    {'📑 Papers' if is_paper_level else '📂 ' + level_name}
    </h3>
    """, unsafe_allow_html=True)

    for item, full_path in current_clusters:
        if is_paper_level:
            display_paper(item)
        else:
            display_cluster(item, full_path)
# Build the page only when this file is run as a script (e.g. by
# `streamlit run`), not when it is imported as a module.
if __name__ == "__main__":
    main()