Spaces:

Agents-MCP-Hackathon
/

KnowledgeBridge

Running

App Files Files Community

KnowledgeBridge / server /seed-documents.ts

fazeel007

Fix indexing issue

a8d30b4 18 days ago

raw

history blame contribute delete

58.3 kB

	/**
	* Seed the database with default research papers for demo purposes
	*/
	import { storage } from './storage';
	import { type InsertDocument } from '@shared/schema';

	const defaultPapers: Omit<InsertDocument, 'id' \| 'createdAt'>[] = [
	// 🧠 Foundation & Scaling Laws
	{
	title: "Attention Is All You Need",
	content: `The Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs.

	We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature.

	The attention mechanism allows the model to make connections between distant elements of the sequence, capturing long-range dependencies that are crucial for understanding context. This makes the Transformer particularly effective for tasks requiring understanding of global context, such as machine translation and text summarization.`,
	source: "Google Research, Vaswani et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/1706.03762",
	metadata: {
	authors: ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N. Gomez", "Lukasz Kaiser", "Illia Polosukhin"],
	year: 2017,
	venue: "NIPS",
	citations: 85000,
	keywords: ["attention mechanism", "transformer", "neural networks", "machine translation", "deep learning"],
	theme: "Foundation & Scaling Laws"
	},
	embedding: null
	},
	{
	title: "GPT-4 Technical Report",
	content: `We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers.

	GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core component of this project was developing infrastructure and optimization methods that behave predictably across a wide range of scales.

	The model demonstrates remarkable capabilities across diverse domains, from creative writing to complex reasoning tasks. It can understand and generate code in multiple programming languages, solve mathematical problems, and engage in sophisticated conversations while maintaining context over long interactions. The multimodal capabilities allow it to analyze images and describe their contents, making it a powerful tool for various applications.`,
	source: "OpenAI",
	sourceType: "research",
	url: "https://arxiv.org/abs/2303.08774",
	metadata: {
	authors: ["OpenAI"],
	year: 2023,
	venue: "arXiv",
	citations: 15000,
	keywords: ["GPT-4", "large language model", "multimodal", "AI safety", "alignment"],
	theme: "Foundation & Scaling Laws"
	},
	embedding: null
	},
	{
	title: "Constitutional AI: Harmlessness from AI Feedback",
	content: `As AI systems become more capable, we would like to enlist their help to supervise other AI systems. We experiment with methods for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, and so we refer to the method as Constitutional AI.

	The process involves both a supervised learning and a reinforcement learning phase. In the supervised phase, we sample from an initial model, use the model itself to critique and revise its response, and then train on the revised responses. In the RL phase, we use the model to evaluate which of two samples is better, and then do RL from these AI feedback labels.

	We find this approach can train a non-evasive and non-manipulative AI assistant that is helpful, harmless, and honest. The constitutional approach enables the training of AI systems that are robust to adversarial prompts and maintain their beneficial behavior even under stress testing. This represents a significant step toward creating AI systems that can be safely deployed in real-world applications.`,
	source: "Anthropic, Bai et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2212.08073",
	metadata: {
	authors: ["Yuntao Bai", "Andy Jones", "Kamal Ndousse", "Amanda Askell", "Anna Chen", "Nova DasSarma", "Dawn Drain", "Stanislav Fort", "Deep Ganguli", "Tom Henighan", "Nicholas Joseph", "Saurav Kadavath", "Jackson Kernion", "Tom Conerly", "Sheer El-Showk", "Nelson Elhage", "Zac Hatfield-Dodds", "Danny Hernandez", "Tristan Hume", "Scott Johnston", "Shauna Kravec", "Liane Lovitt", "Neel Nanda", "Catherine Olsson", "Dario Amodei", "Tom Brown", "Jack Clark", "Sam McCandlish", "Chris Olah", "Ben Mann", "Jared Kaplan"],
	year: 2022,
	venue: "arXiv",
	citations: 8000,
	keywords: ["constitutional AI", "AI safety", "harmlessness", "AI feedback", "alignment"],
	theme: "Alignment & Safety"
	},
	embedding: null
	},
	{
	title: "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
	content: `Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind task-specific architectures.

	Additionally, providing provenance for their decisions and updating their world knowledge remain open research problems. Pre-trained models with a differentiable access mechanism to explicit non-parametric memory can overcome this limitation. We explore a general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained parametric and non-parametric memory for language generation.

	We introduce RAG models where the parametric memory is a pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages across the whole generated sequence, and another which can use different passages for each token.`,
	source: "Facebook AI Research, Lewis et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2005.11401",
	metadata: {
	authors: ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin", "Naman Goyal", "Heinrich Küttler", "Mike Lewis", "Wen-tau Yih", "Tim Rocktäschel", "Sebastian Riedel", "Douwe Kiela"],
	year: 2020,
	venue: "NeurIPS",
	citations: 12000,
	keywords: ["retrieval augmented generation", "RAG", "knowledge-intensive", "question answering", "information retrieval"],
	theme: "Tool Use & Reasoning & Agents"
	},
	embedding: null
	},
	{
	title: "LangChain: Building Applications with LLMs through Composability",
	content: `Large language models (LLMs) are emerging as a transformative technology, enabling developers to build applications that they previously could not. But using these LLMs in isolation is often not enough to create a truly powerful app - the real power comes when you are able to combine them with other sources of computation or knowledge.

	LangChain is a framework for developing applications powered by language models. We believe that the most powerful and differentiated applications will not only call out to a language model via an API, but will also be data-aware and agentic, allowing a language model to interact with its environment.

	The framework enables several key capabilities: connecting LLMs to other data sources, allowing LLMs to interact with their environment through tools and APIs, building chains of LLM calls for complex reasoning tasks, and creating agents that can dynamically decide which tools to use based on high-level directives. LangChain provides modular components for each of these capabilities, as well as pre-built chains and agents for common use cases.`,
	source: "LangChain Inc., Chase et al.",
	sourceType: "framework",
	url: "https://github.com/langchain-ai/langchain",
	metadata: {
	authors: ["Harrison Chase", "LangChain Team"],
	year: 2022,
	venue: "Open Source",
	citations: 5000,
	keywords: ["LangChain", "LLM framework", "agents", "chains", "composability", "tools"],
	theme: "Tool Use & Reasoning & Agents"
	},
	embedding: null
	},
	{
	title: "Training language models to follow instructions with human feedback",
	content: `Making language models bigger does not inherently make them better at following a user's intent. For example, large language models can generate outputs that are untruthful, toxic, or simply not helpful to the user. In other words, these models are not aligned with their users.

	In this paper, we show an avenue for aligning language models with user intent on a wide range of tasks by fine-tuning with human feedback. Starting with a set of labeler-written prompts and prompts submitted through the OpenAI API, we collect a dataset of labeler demonstrations of the desired model behavior, which we use to fine-tune GPT-3 using supervised learning.

	We then collect a dataset of rankings of model outputs, which we use to further fine-tune this supervised model using reinforcement learning from human feedback (RLHF). We call the resulting models InstructGPT. In human evaluations on our prompt distribution, outputs from the 1.3B parameter InstructGPT model are preferred to outputs from the 175B GPT-3, despite having 100x fewer parameters.`,
	source: "OpenAI, Ouyang et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2203.02155",
	metadata: {
	authors: ["Long Ouyang", "Jeff Wu", "Xu Jiang", "Diogo Almeida", "Carroll L. Wainwright", "Pamela Mishkin", "Chong Zhang", "Sandhini Agarwal", "Katarina Slama", "Alex Ray", "John Schulman", "Jacob Hilton", "Fraser Kelton", "Luke Miller", "Maddie Simens", "Amanda Askell", "Peter Welinder", "Paul Christiano", "Jan Leike", "Ryan Lowe"],
	year: 2022,
	venue: "NeurIPS",
	citations: 18000,
	keywords: ["RLHF", "instruction following", "human feedback", "alignment", "InstructGPT"],
	theme: "Alignment & Safety"
	},
	embedding: null
	},
	{
	title: "Emergent Abilities of Large Language Models",
	content: `Scaling up language models has been shown to predictably improve performance and sample efficiency on a wide range of downstream tasks. This paper instead discusses an unpredictable phenomenon that we refer to as emergent abilities of large language models. We consider an ability to be emergent if it is not present in smaller models but is present in larger models.

	Thus, emergent abilities cannot be predicted simply by extrapolating the performance of smaller models. The existence of such emergence raises the question of whether additional scaling could potentially further expand the range of capabilities of language models.

	We survey over 100 papers and find that emergent abilities appear in various domains including few-shot prompting, augmented prompting strategies, and reasoning tasks. For most emergent abilities, we find that they appear at a certain scale threshold, beyond which performance rapidly improves. This suggests that there may be fundamental phase transitions in capability as models scale.`,
	source: "Google Research, Wei et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2206.07682",
	metadata: {
	authors: ["Jason Wei", "Yi Tay", "Rishi Bommasani", "Colin Raffel", "Barret Zoph", "Sebastian Borgeaud", "Dani Yogatama", "Maarten Bosma", "Denny Zhou", "Donald Metzler", "Ed H. Chi", "Tatsunori Hashimoto", "Oriol Vinyals", "Percy Liang", "Jeff Dean", "William Fedus"],
	year: 2022,
	venue: "arXiv",
	citations: 7500,
	keywords: ["emergent abilities", "scaling", "large language models", "few-shot learning", "reasoning"],
	theme: "Foundation & Scaling Laws"
	},
	embedding: null
	},
	// 🧠 Foundation & Scaling Laws - Additional Papers
	{
	title: "Training Compute-Optimal Large Language Models",
	content: `We investigate the optimal model size and number of tokens for training a transformer language model under a given compute budget. We find that current large language models are significantly undertrained. For compute-optimal training, the model size and the number of training tokens should be scaled equally: for every doubling of model size the number of training tokens should also be doubled.

	For training compute-optimal language models, we predict that Chinchilla, a 70-billion parameter model, should outperform Gopher (280B), GPT-3 (175B), Jurassic-1 (178B), and Megatron-Turing NLG (530B), which are all considerably larger. We test this hypothesis by training Chinchilla on the same dataset as Gopher but with four times fewer parameters and four times more data.

	Chinchilla uniformly and significantly outperforms Gopher, GPT-3, Jurassic-1, and Megatron-Turing NLG on a large range of downstream evaluation tasks. This result has profound implications for choosing LLM training strategies going forward.`,
	source: "DeepMind, Hoffmann et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2203.15556",
	metadata: {
	authors: ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch", "Elena Buchatskaya", "Trevor Cai", "Eliza Rutherford", "Diego de Las Casas", "Lisa Anne Hendricks", "Johannes Welbl", "Aidan Clark", "Tom Hennigan", "Eric Noland", "Katie Millican", "George van den Driessche", "Bogdan Damoc", "Aurelia Guy", "Simon Osindero", "Karen Simonyan", "Erich Elsen", "Jack W. Rae", "Oriol Vinyals", "Laurent Sifre"],
	year: 2022,
	venue: "arXiv",
	citations: 12000,
	keywords: ["Chinchilla", "compute-optimal", "scaling laws", "training efficiency", "language models"],
	theme: "Foundation & Scaling Laws"
	},
	embedding: null
	},
	{
	title: "LLaMA: Open and Efficient Foundation Language Models",
	content: `We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train LLaMA on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets.

	In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.

	Unlike Chinchilla, PaLM, or GPT-3, LLaMA only uses publicly available data, making our work compatible with open-source, while most existing models rely on data that is either not publicly available or undocumented (e.g. "Books - 2TB" or "Social media conversations"). There exist some exceptions, notably OPT and GLM, but none that are competitive with PaLM-62B or Chinchilla.`,
	source: "Meta AI, Touvron et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2302.13971",
	metadata: {
	authors: ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard", "Xavier Martinet", "Marie-Anne Lachaux", "Timothée Lacroix", "Baptiste Rozière", "Naman Goyal", "Eric Hambro", "Faisal Azhar", "Aurelien Rodriguez", "Armand Joulin", "Edouard Grave", "Guillaume Lample"],
	year: 2023,
	venue: "arXiv",
	citations: 9000,
	keywords: ["LLaMA", "open models", "foundation models", "efficiency", "democratization"],
	theme: "Open Models & Democratization"
	},
	embedding: null
	},
	{
	title: "DeepSeek-Coder: When the Large Language Model Meets Programming",
	content: `We present DeepSeek-Coder, a series of code language models trained from scratch on 2 trillion tokens sourced from a high-quality programming corpus. We provide models with 1.3B, 5.7B, 6.7B, and 33B parameters. Our evaluation demonstrates that DeepSeek-Coder achieves state-of-the-art performance among open-source code models.

	Notably, DeepSeek-Coder-Base-33B achieves 79.3% pass@1 on HumanEval, while DeepSeek-Coder-Instruct-33B reaches 78.6%. When DeepSeek-Coder-Base models are employed as base models for code completion in practical development environments, they demonstrate remarkable effectiveness.

	DeepSeek-Coder comprises a range of models, each meticulously trained to excel in programming tasks. Beyond conventional code generation and comprehension, these models are optimized for practical software development scenarios, offering advanced code completion and repository-level understanding capabilities.`,
	source: "DeepSeek AI, Guo et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2401.14196",
	metadata: {
	authors: ["Daya Guo", "Qihao Zhu", "Dejian Yang", "Zhenda Xie", "Kai Dong", "Wentao Zhang", "Guanting Chen", "Xiao Bi", "Y. Wu", "YK Li", "Fuli Luo", "Yingfei Xiong", "Wenfeng Liang"],
	year: 2024,
	venue: "arXiv",
	citations: 2500,
	keywords: ["DeepSeek-Coder", "code generation", "programming", "software development", "open models"],
	theme: "Open Models & Democratization"
	},
	embedding: null
	},
	{
	title: "Mixtral of Experts",
	content: `We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, but each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs.

	Despite being a mixture of experts model with 45B total parameters, Mixtral only uses 12B parameters during inference, leading to better throughput at the same batch size and sequence length as Mistral 7B. Mixtral matches or outperforms Llama 2 70B on most benchmarks we tested.

	On the HellaSwag, Arc, MMLU, TruthfulQA, Winogrande, and GSM8k benchmarks, Mixtral significantly outperforms Llama 2 13B and Llama 1 34B on all benchmarks. It either matches or outperforms Llama 2 70B on MMLU, GSM8k, and most other benchmarks.`,
	source: "Mistral AI, Jiang et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2401.04088",
	metadata: {
	authors: ["Albert Q. Jiang", "Alexandre Sablayrolles", "Antoine Roux", "Arthur Mensch", "Blanche Savary", "Chris Bamford", "Devendra Singh Chaplot", "Diego de las Casas", "Emma Bou Hanna", "Florian Bressand", "Gianna Lengyel", "Guillaume Bour", "Guillaume Lample", "Lélio Renard Lavaud", "Lucile Saulnier", "Marie-Anne Lachaux", "Pierre Stock", "Sandeep Subramanian", "Sophia Yang", "Szymon Antoniak", "Teven Le Scao", "Théophile Gervet", "Thibaut Lavril", "Thomas Wang", "Timothée Lacroix", "William El Sayed"],
	year: 2024,
	venue: "arXiv",
	citations: 1800,
	keywords: ["Mixtral", "mixture of experts", "efficiency", "sparse models", "Mistral"],
	theme: "Open Models & Democratization"
	},
	embedding: null
	},
	// 🔐 Alignment & Safety - Additional Papers
	{
	title: "Teaching language models to support answers with verified quotes",
	content: `We present GopherCite, a system for training language models to support their responses with evidence from reliable sources. Our approach involves fine-tuning a pre-trained language model to cite specific quotes from a corpus of reliable sources when answering questions.

	We demonstrate that our approach significantly improves the factual accuracy of model responses while maintaining readability. When citations are provided, users can verify the information and better assess the trustworthiness of the model's claims. We establish new benchmarks for evaluating attribution quality.

	Our work addresses critical challenges in deploying language models safely, particularly around misinformation and hallucination. By requiring models to ground their responses in verifiable sources, we reduce the risk of generating false or misleading information, which is essential for real-world applications.`,
	source: "DeepMind, Menick et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2203.11147",
	metadata: {
	authors: ["Jacob Menick", "Maja Trebacz", "Vladimir Mikulik", "John Aslanides", "Francis Song", "Martin Chadwick", "Mia Glaese", "Susannah Young", "Lucy Campbell-Gillingham", "Geoffrey Irving", "Nat McAleese"],
	year: 2022,
	venue: "arXiv",
	citations: 3500,
	keywords: ["Sparrow", "attribution", "factual accuracy", "citations", "safety"],
	theme: "Alignment & Safety"
	},
	embedding: null
	},
	{
	title: "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models",
	content: `Chain-of-thought prompting has demonstrated remarkable performance on various reasoning tasks. However, it can still struggle with problems that require solving sub-problems sequentially. We introduce least-to-most prompting, a novel prompting strategy that enables large language models to break down complex problems into simpler subproblems and solve them sequentially.

	Our approach consists of two stages: first, we prompt the model to decompose a complex problem into a series of simpler subproblems; then we solve these subproblems sequentially, using answers from previously solved subproblems to help solve the next subproblem.

	We demonstrate that least-to-most prompting can significantly improve performance on tasks that require generalization to harder problems than those seen in the context. In particular, we achieve new state-of-the-art results on the tasks of symbolic manipulation, compositional generalization, and math word problems.`,
	source: "Google Research, Zhou et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2205.10625",
	metadata: {
	authors: ["Denny Zhou", "Nathanael Schärli", "Le Hou", "Jason Wei", "Nathan Scales", "Xuezhi Wang", "Dale Schuurmans", "Claire Cui", "Olivier Bousquet", "Quoc Le", "Ed Chi"],
	year: 2022,
	venue: "ICLR 2023",
	citations: 4200,
	keywords: ["task decomposition", "prompting", "complex reasoning", "problem solving", "compositional generalization"],
	theme: "Alignment & Safety"
	},
	embedding: null
	},
	// 🎨 Multimodality & Vision-Language Models
	{
	title: "Learning Transferable Visual Models From Natural Language Supervision",
	content: `State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This restricted form of supervision limits their generality and usability since additional labeled data is needed to specify any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a much broader source of supervision.

	We demonstrate that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks.

	We study the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without needing any dataset-specific training.`,
	source: "OpenAI, Radford et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2103.00020",
	metadata: {
	authors: ["Alec Radford", "Jong Wook Kim", "Chris Hallacy", "Aditya Ramesh", "Gabriel Goh", "Sandhini Agarwal", "Girish Sastry", "Amanda Askell", "Pamela Mishkin", "Jack Clark", "Gretchen Krueger", "Ilya Sutskever"],
	year: 2021,
	venue: "ICML",
	citations: 25000,
	keywords: ["CLIP", "multimodal", "vision-language", "zero-shot transfer", "contrastive learning"],
	theme: "Multimodality & Vision-Language Models"
	},
	embedding: null
	},
	{
	title: "Multimodal Neurons in Artificial Neural Networks",
	content: `We document the presence of multimodal neurons in CLIP that respond to the same concept whether presented literally, symbolically, or conceptually. This offers one step toward understanding the associations and conceptual reasoning displayed by multimodal systems like CLIP.

	By studying multimodal neurons, we can begin to understand how CLIP performs its remarkable zero-shot capabilities. We show examples of neurons that respond to concepts like "Spiderman" whether the image shows the character, text spelling the name, or even a spider, demonstrating the rich associations learned during training.

	We also uncover neurons that can be exploited to adversarially attack the model. By understanding these failure modes through the lens of multimodal neurons, we provide insights into potential vulnerabilities and how they might be addressed. This work demonstrates the importance of interpretability research for understanding and improving multimodal AI systems.`,
	source: "OpenAI, Goh et al.",
	sourceType: "research",
	url: "https://distill.pub/2021/multimodal-neurons/",
	metadata: {
	authors: ["Gabriel Goh", "Nick Cammarata", "Chelsea Voss", "Shan Carter", "Michael Petrov", "Ludwig Schubert", "Alec Radford", "Chris Olah"],
	year: 2021,
	venue: "Distill",
	citations: 1200,
	keywords: ["multimodal neurons", "CLIP", "interpretability", "concept learning", "adversarial examples"],
	theme: "Multimodality & Vision-Language Models"
	},
	embedding: null
	},
	{
	title: "DeepSeek-VL: Towards Real-World Vision-Language Understanding",
	content: `We present DeepSeek-VL, an open-source Vision-Language (VL) Model designed for real-world vision and language understanding applications. DeepSeek-VL possesses general multimodal understanding capabilities, experiencing significant improvements when fine-tuned for specific tasks.

	DeepSeek-VL family consists of 1.3B and 7B models, both trained from scratch with a carefully designed data curriculum that includes both text-only and vision-text data. For the vision encoder, we explore different approaches and introduce an efficient hybrid vision encoder that balances performance and efficiency.

	In most benchmarks, DeepSeek-VL shows superior or competitive performance compared to existing open-source dense and MoE vision-language models with similar model sizes, and even surpasses some larger models. We also provide comprehensive analysis on training strategies, model architecture choices, and scaling effects to facilitate future research in this direction.`,
	source: "DeepSeek AI, Lu et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2403.05525",
	metadata: {
	authors: ["Haoyu Lu", "Wen Liu", "Bo Zhang", "Bingxuan Wang", "Kai Dong", "Bo Liu", "Jingxiang Sun", "Tongzheng Ren", "Zhuoshu Li", "Hao Yang", "Yaofeng Sun", "Chengqi Deng", "Hanwei Xu", "Zhenda Xie", "Chong Ruan"],
	year: 2024,
	venue: "arXiv",
	citations: 800,
	keywords: ["DeepSeek-VL", "vision-language", "multimodal understanding", "open-source", "real-world applications"],
	theme: "Multimodality & Vision-Language Models"
	},
	embedding: null
	},
	{
	title: "Gemini: A Family of Highly Capable Multimodal Models",
	content: `This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. Gemini models are trained on a diverse dataset of text, code, and multimodal data, using both supervised and reinforcement learning from human feedback (RLHF).

	Gemini Ultra's performance exceeds current state-of-the-art results on 30 of 32 widely-used academic benchmarks used in large language model (LLM) research. We believe that the new capabilities of Gemini models in cross-modal reasoning and language understanding will enable a wide variety of use cases and we discuss our approach toward deploying them responsibly to users.

	The largest model, Gemini Ultra, achieves new state-of-the-art performance on challenging benchmarks like MMLU, where it becomes the first model to exceed human expert level performance. Gemini models also demonstrate strong performance on multimodal reasoning tasks that require understanding and reasoning over images, videos, and audio in combination with text.`,
	source: "Google DeepMind, Team et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2312.11805",
	metadata: {
	authors: ["Gemini Team", "Rohan Anil", "Sebastian Borgeaud", "Yonghui Wu", "Jean-Baptiste Alayrac", "Jiahui Yu", "Radu Soricut", "Johan Schalkwyk", "Andrew M. Dai", "Anja Hauth", "Katie Millican", "David Silver", "Slav Petrov", "Melvin Johnson", "Ioannis Antonoglou", "Julian Schrittwieser", "Amelia Glaese", "Jilin Chen", "Emily Pitler", "Timothy Lillicrap", "Angeliki Lazaridou", "Orhan Firat", "James Molloy", "Michael Isard", "Paul R. Barham", "Tom Hennigan", "Benjamin Lee", "Fabio Viola", "Malcolm Reynolds", "Yuanzhong Xu", "Ryan Doherty", "Eli Collins", "Clemens Meyer", "Eliza Rutherford", "Erica Moreira", "Kareem Ayoub", "Megha Goel", "George Tucker", "Enrique Piqueras", "Maxim Krikun", "Iain Barr", "Nikolay Savinov", "Ivo Danihelka", "Becca Roelofs", "Anaïs White", "Anders Andreassen", "Tamara von Glehn", "Lakshman Yagati", "Mehran Kazemi", "Lucas Gonzalez", "Misha Khalman", "Jakub Sygnowski", "Alexandre Frechette", "Charlotte Smith", "Laura Culp", "Lev Proleev", "Yi Luan", "Xi Chen", "James Lottes", "Nathan Schucher", "Federico Lebron", "Alban Rrustemi", "Natasha Clay", "Phil Crone", "Tomas Kocisky", "Jeffrey Zhao", "Bartek Perz", "Dian Yu", "Heidi Howard", "Adam Bloniarz", "Jack W. Rae", "Han Lu", "Laurent Sifre", "Marcello Maggioni", "Fred Alcober", "Dan Garrette", "Megan Barnes", "Shantanu Thakoor", "Jacob Austin", "Gabriel Barth-Maron", "William Wong", "Rishabh Joshi", "Rahma Chaabouni", "Deeni Fatiha", "Arun Ahuja", "Ruibo Liu", "Yunxuan Li", "Sarah Cogan", "Jeremy Chen", "Chao Jia", "Chenjie Gu", "Qiao Zhang", "Jordan Grimstad", "Ale Jakse Hartman", "Martin Chadwick", "Gaurav Singh Tomar", "Xavier Garcia", "Evan Senter", "Emanuel Taropa", "Thanumalayan Sankaranarayana Pillai", "Jacob Devlin", "Michael Laskin", "Diego de Las Casas", "Dasha Valter", "Connie Tao", "Lorenzo Blanco", "Adrià Puigdomènech Badia", "David Reitter", "Mianna Chen", "Jenny Brennan", "Clara Rivera", "Sergey Brin", "Shariq Hashme", "Mario Garrido", "Justin Gilmer", "Carl Saroufim", "James Molloy", "Cosmo Du", "Eli Bixby", "Orhan Firat", "Matthew Kelcey", "Sushant Prakash", "Huaixiu Steven Zheng", "Bradley Green", "Ewa Olecka", "Petko Georgiev", "Nate Attaluri", "Matthew Lamm", "Luyu Wang", "Chenkai Kuang", "Jason Riesa", "Abhanshu Sharma", "Nick Fernando", "Behnam Neyshabur", "Noah Fiedel", "Erica Oliveira", "Lem Asaba", "Alexander Chen", "Aakanksha Chowdhery", "Marie Pellat", "Jacob Eisenstein", "Adam Roberts", "Hyung Won Chung", "Henryk Michalewski", "Charlie Chen", "Ankesh Anand", "Shibo Wang", "Anton Ruiz", "Honglei Liu", "Libin Bai", "Andre Saraiva", "Andrew Dai", "Diogo de Freitas Amorim", "Ben Hutchinson", "Reiner Pope", "James Bradbury", "Jacob Austin", "Michael Isard", "Guy Gur-Ari", "Pengcheng Yin", "Toju Duke", "Anselm Levskaya", "Sanjay Ghemawat", "Sunipa Dev", "Henryk Michalewski", "Xavier Garcia", "Vedant Misra", "Kevin Robinson", "Liam Fedus", "Denny Zhou", "Daphne Ippolito", "David Luan", "Hyeontaek Lim", "Barret Zoph", "Alexander Spiridonov", "Ryan Sepassi", "David Dohan", "Shivani Agrawal", "Mark Omernick", "Andrew M. Dai", "Thanumalayan Sankaranarayana Pillai", "Marie Pellat", "Aida Nematzadeh", "Dmitry Lepikhin", "Henryk Michalewski", "Aakanksha Chowdhery", "Sharan Narang", "Jacob Menick", "Sebastian Borgeaud", "Andy Brock", "Aidan Clark", "Karen Simonyan", "Melvin Johnson", "Ioannis Antonoglou", "Rohan Anil", "Tom Hennigan", "Jacob Menick", "Sharan Narang", "Arthur Mensch", "Saffron Huang", "Liam Fedus", "Adam Roberts", "Jascha Sohl-Dickstein", "Dani Yogatama", "James Bradbury", "Ioannis Antonoglou", "Tom Hennigan", "Omar Shaikh", "Shivani Agrawal", "Ryan Sepassi", "Alexander Ratner", "Tom Hennigan", "Peter J. Liu", "Sharan Narang", "Hyung Won Chung", "Michael Fink", "Noah Constant", "Adam Roberts", "Colin Raffel"],
	year: 2023,
	venue: "arXiv",
	citations: 6000,
	keywords: ["Gemini", "multimodal", "state-of-the-art", "MMLU", "cross-modal reasoning"],
	theme: "Multimodality & Vision-Language Models"
	},
	embedding: null
	},
	// 🤖 Tool Use, Reasoning & Agents - Additional Papers
	{
	title: "Toolformer: Language Models Can Teach Themselves to Use Tools",
	content: `Language models (LMs) exhibit remarkable abilities to solve new tasks from just a few examples or textual instructions, especially at scale. They also, paradoxically, struggle with basic functionality, such as arithmetic or factual lookup, where much simpler and more reliable alternatives exist. In this paper, we show that LMs can teach themselves to use external tools via simple APIs and achieve the best of both worlds.

	We introduce Toolformer, a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction. This is done in a self-supervised way, requiring nothing more than a handful of demonstrations for each API. We incorporate a range of tools, including a calculator, a Q&A system, two different search engines, a translation system, and a calendar.

	Toolformer achieves substantially improved zero-shot performance across a variety of downstream tasks, often competitive with much larger models, without sacrificing its core language modeling abilities. Our approach represents an important step toward LMs that can use external tools in a more sophisticated and autonomous way.`,
	source: "Meta AI, Schick et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2302.04761",
	metadata: {
	authors: ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì", "Roberta Raileanu", "Maria Lomeli", "Luke Zettlemoyer", "Nicola Cancedda", "Thomas Scialom"],
	year: 2023,
	venue: "arXiv",
	citations: 3800,
	keywords: ["Toolformer", "tool use", "API integration", "language models", "self-supervised learning"],
	theme: "Tool Use & Reasoning & Agents"
	},
	embedding: null
	},
	{
	title: "ReAct: Synergizing Reasoning and Acting in Language Models",
	content: `While large language models (LLMs) have demonstrated impressive capabilities across tasks in language understanding and interactive decision making, their abilities for reasoning (e.g. chain-of-thought prompting) and acting (e.g. action plan generation) have primarily been studied as separate topics. In this paper, we explore the use of LLMs to generate both reasoning traces and task-specific actions in an interleaved manner.

	We present ReAct, a general paradigm that combines reasoning and acting with language models. ReAct prompts LLMs to generate verbal reasoning traces and actions for a task, which allows for greater synergy between the two: reasoning traces help the model induce, track, and update action plans as well as handle exceptions, while actions allow it to interface with external sources, such as knowledge bases or environments, to gather additional information.

	We apply ReAct to a diverse set of language and decision making tasks and demonstrate its effectiveness over state-of-the-art baselines, as well as improved human interpretability and trustworthiness over methods without reasoning or acting components. Concretely, on question answering (HotpotQA) and fact verification (Fever), ReAct overcomes issues of hallucination and error propagation prevalent in chain-of-thought reasoning by interacting with a simple Wikipedia API, and generates human-like task-solving trajectories that are more interpretable than baselines without reasoning traces.`,
	source: "Princeton University, Yao et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2210.03629",
	metadata: {
	authors: ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"],
	year: 2022,
	venue: "ICLR 2023",
	citations: 5200,
	keywords: ["ReAct", "reasoning", "acting", "language models", "decision making"],
	theme: "Tool Use & Reasoning & Agents"
	},
	embedding: null
	},
	{
	title: "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
	content: `Large "instruction-tuned" language models (i.e., finetuned to respond to instructions) have demonstrated a remarkable ability to generalize zero-shot to new tasks. Nevertheless, they depend heavily on human-written instruction data that is often limited in quantity, diversity, and creativity, therefore hindering the generality of the tuned model. We introduce Self-Instruct, a framework for improving the instruction-following capabilities of pretrained language models by bootstrapping off their own generations.

	Our pipeline generates instructions, input, and output samples from a language model, then filters invalid or similar ones before using them to finetune the original model. Applying our method to the vanilla GPT3, we demonstrate a 33% absolute improvement over the original model on Super-NaturalInstructions, on par with the performance of InstructGPT_{001}, which was trained with private user data and human annotations.

	For further evaluation, we curate a set of expert-written instructions for novel tasks, and show through human evaluation that tuning GPT3 with Self-Instruct outperforms using existing public instruction datasets by a large margin, leaving only a 5% absolute gap behind InstructGPT_{001}. Self-Instruct provides an almost annotation-free method for aligning language models with instructions, and we release our large synthetic dataset to facilitate future work on instruction tuning.`,
	source: "University of Washington, Wang et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2212.10560",
	metadata: {
	authors: ["Yizhong Wang", "Yeganeh Kordi", "Swaroop Mishra", "Alisa Liu", "Noah A. Smith", "Daniel Khashabi", "Hannaneh Hajishirzi"],
	year: 2022,
	venue: "ACL 2023",
	citations: 4500,
	keywords: ["Self-Instruct", "instruction tuning", "bootstrapping", "synthetic data", "alignment"],
	theme: "Tool Use & Reasoning & Agents"
	},
	embedding: null
	},
	// 🔍 RAG & Vector Databases - Core Papers
	{
	title: "Dense Passage Retrieval for Open-Domain Question Answering",
	content: `Open-domain question answering relies on efficient passage retrieval to select candidate contexts, where traditional sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can be practically implemented using dense representations alone, where embeddings are learned from a small number of questions and passages by a simple dual-encoder framework.

	When evaluated on a wide range of open-domain QA datasets, our dense retriever outperforms a strong Lucene-BM25 system largely by 9%-19% absolute in terms of top-20 passage retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA benchmarks.

	Our approach demonstrates that dense retrieval can be more effective than traditional sparse retrieval methods for knowledge-intensive tasks. The key insight is that dense representations can capture semantic similarity more effectively than keyword-based approaches, leading to better retrieval of relevant passages even when there is limited lexical overlap between queries and documents.`,
	source: "Facebook AI Research, Karpukhin et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2004.04906",
	metadata: {
	authors: ["Vladimir Karpukhin", "Barlas Oğuz", "Sewon Min", "Patrick Lewis", "Ledell Wu", "Sergey Edunov", "Danqi Chen", "Wen-tau Yih"],
	year: 2020,
	venue: "EMNLP",
	citations: 8500,
	keywords: ["DPR", "dense passage retrieval", "question answering", "semantic search", "embeddings"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "REALM: Retrieval-Augmented Language Model Pre-Training",
	content: `Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we augment language model pre-training with a learned textual knowledge retriever, which allows the model to retrieve and attend over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference.

	We show that for the challenging task of Open-domain Question Answering (Open-QA), REALM significantly outperforms all previous methods by 4+ absolute accuracy points, while also providing qualitative benefits such as interpretability and modularity compared to alternatively parameterized approaches.

	REALM demonstrates that augmenting language models with external knowledge through retrieval can be more effective than simply scaling model parameters. This approach allows for more interpretable reasoning as the retrieved documents provide explicit evidence for the model's predictions, addressing one of the key limitations of large parametric models.`,
	source: "Google Research, Guu et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2002.08909",
	metadata: {
	authors: ["Kelvin Guu", "Kenton Lee", "Zora Tung", "Panupong Pasupat", "Ming-Wei Chang"],
	year: 2020,
	venue: "ICML",
	citations: 6200,
	keywords: ["REALM", "retrieval augmented", "language model pre-training", "knowledge retrieval", "interpretability"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "FiD: Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering",
	content: `Generative models for open domain question answering have proven to be competitive, without resorting to external knowledge. While promising, this approach requires to use models with billions of parameters, which are expensive to train and query. In this paper, we investigate how much these models can benefit from retrieving text passages, potentially containing evidence.

	We obtain state-of-the-art results on the Natural Questions and TriviaQA open benchmarks. Interestingly, we find that the performance of this method significantly improves when the number of retrieved passages increases, up to 100 passages. Our best performing model produces better results than models using significantly more parameters but no retrieval.

	Fusion-in-Decoder (FiD) demonstrates that generative models can effectively leverage multiple retrieved passages by processing them jointly in the decoder. This approach allows for better integration of retrieved information compared to approaches that process passages independently, leading to more coherent and accurate responses.`,
	source: "Facebook AI Research, Izacard et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2007.01282",
	metadata: {
	authors: ["Gautier Izacard", "Edouard Grave"],
	year: 2020,
	venue: "EACL 2021",
	citations: 4100,
	keywords: ["FiD", "fusion in decoder", "passage retrieval", "generative models", "open domain QA"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "Improving Language Models by Retrieving from Trillions of Tokens",
	content: `We enhance auto-regressive language models by conditioning on document chunks retrieved from a large corpus, based on local similarity with preceding tokens. With a 2 trillion token database, our Retrieval-Enhanced Transformer (RETRO) obtains comparable performance to GPT-3 and Jurassic-1 on the Pile, despite using 25× fewer parameters.

	After fine-tuning, RETRO performance translates to downstream knowledge-intensive tasks such as question answering. RETRO combines a frozen BERT retriever, a differentiable encoder and a chunked cross-attention mechanism to predict tokens based on an order of magnitude more data than what is typically consumed during training.

	RETRO demonstrates that retrieval can be a more parameter-efficient alternative to scaling model size for improving language model performance. By accessing external knowledge through retrieval, smaller models can achieve competitive performance with much larger parametric models, offering a more sustainable approach to building capable language systems.`,
	source: "DeepMind, Borgeaud et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2112.04426",
	metadata: {
	authors: ["Sebastian Borgeaud", "Arthur Mensch", "Jordan Hoffmann", "Trevor Cai", "Eliza Rutherford", "Katie Millican", "George van den Driessche", "Jean-Baptiste Lespiau", "Bogdan Damoc", "Aidan Clark", "Diego de Las Casas", "Aurelia Guy", "Jacob Menick", "Roman Ring", "Tom Hennigan", "Saffron Huang", "Loren Maggiore", "Chris Jones", "Albin Cassirer", "Andy Brock", "Michela Paganini", "Geoffrey Irving", "Oriol Vinyals", "Simon Osindero", "Karen Simonyan", "Jack W. Rae", "Erich Elsen", "Laurent Sifre"],
	year: 2021,
	venue: "arXiv",
	citations: 3800,
	keywords: ["RETRO", "retrieval enhanced transformer", "parameter efficiency", "external knowledge", "chunked attention"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
	content: `BERT and RoBERTa have set a new state-of-the-art performance on sentence-pair regression tasks like semantic textual similarity (STS). However, it requires that both sentences are fed into the network, which causes a massive computational overhead: Finding the most similar pair in a collection of 10,000 sentences requires about 50 million inference computations (~65 hours) with BERT.

	We present Sentence-BERT (SBERT), a modification of the pretrained BERT network that use siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine-similarity. This reduces the effort for finding the most similar pair from 65 hours with BERT to about 5 seconds with SBERT, while maintaining the accuracy from BERT.

	SBERT has become a foundational model for semantic search and similarity tasks. Its ability to generate meaningful sentence embeddings has enabled efficient semantic search across large document collections, making it a key component in many retrieval-augmented generation systems and vector databases.`,
	source: "UKP Lab, Reimers et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/1908.10084",
	metadata: {
	authors: ["Nils Reimers", "Iryna Gurevych"],
	year: 2019,
	venue: "EMNLP",
	citations: 12000,
	keywords: ["Sentence-BERT", "SBERT", "sentence embeddings", "semantic similarity", "siamese networks"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "ColBERT: Efficient and Effective Passage Search via Contextualized Late Interaction",
	content: `Recent progress in Natural Language Understanding (NLU) is driving fast-paced advances in Information Retrieval (IR), largely owed to fine-tuning deep language models (LMs) for document ranking. While remarkably effective, the ranking models based on these LMs increase computational cost by orders of magnitude over prior approaches.

	We propose ColBERT, a ranking model that adapts deep LMs (in particular, BERT) for efficient retrieval. ColBERT introduces a late interaction architecture that independently encodes the query and the document using BERT and then employs a cheap yet powerful interaction step to model their fine-grained similarity.

	ColBERT's late interaction design enables an order-of-magnitude speedup (tens to hundreds of milliseconds per query) relative to existing BERT-based models, while often establishing state-of-the-art effectiveness. Our approach offers a new paradigm for dense retrieval that balances efficiency and effectiveness, making it practical for real-world search applications.`,
	source: "Stanford University, Khattab et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2004.12832",
	metadata: {
	authors: ["Omar Khattab", "Matei Zaharia"],
	year: 2020,
	venue: "SIGIR",
	citations: 2800,
	keywords: ["ColBERT", "late interaction", "efficient retrieval", "contextualized embeddings", "passage search"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "Vector Database Systems: A Comprehensive Survey",
	content: `Vector databases have emerged as a critical infrastructure component for modern AI applications, particularly those involving large language models and semantic search. This survey provides a comprehensive overview of vector database systems, their architectures, indexing strategies, and performance characteristics.

	We examine the key design principles behind popular vector databases including Pinecone, Weaviate, Qdrant, Chroma, and Milvus. Each system makes different trade-offs between search accuracy, latency, scalability, and storage efficiency. We analyze approximate nearest neighbor (ANN) algorithms such as HNSW, IVF, and LSH that form the core of these systems.

	The survey covers emerging trends in vector database technology including GPU acceleration, distributed indexing, hybrid sparse-dense retrieval, and integration with streaming data pipelines. As AI applications increasingly rely on semantic search and retrieval-augmented generation, understanding the capabilities and limitations of vector databases becomes crucial for system designers and practitioners.`,
	source: "Vector Database Research Consortium",
	sourceType: "survey",
	url: "https://arxiv.org/abs/2310.11703",
	metadata: {
	authors: ["Multiple Authors"],
	year: 2023,
	venue: "VLDB",
	citations: 1200,
	keywords: ["vector databases", "semantic search", "ANN algorithms", "HNSW", "system design"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "BGE: Making Text Embeddings by Contrastive Learning and LLM-based Reranker",
	content: `We present BGE (BAAI General Embedding), a series of text embedding models that achieve state-of-the-art performance on various embedding evaluation benchmarks. Our approach combines contrastive learning with hard negatives mining and leverages large language models for reranking to further improve retrieval quality.

	BGE models are trained on a diverse corpus of text pairs using a curriculum learning strategy that progressively increases the difficulty of negative examples. We introduce novel techniques for hard negative mining that help the model learn more discriminative embeddings. Additionally, we develop LLM-based rerankers that can further refine the initial retrieval results.

	Our experimental results demonstrate that BGE models consistently outperform existing embedding models across multiple domains and languages. The combination of high-quality embeddings with LLM-based reranking establishes new state-of-the-art results on the MTEB benchmark, making BGE a valuable tool for practitioners building search and retrieval systems.`,
	source: "Beijing Academy of AI, Xiao et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2309.07597",
	metadata: {
	authors: ["Shitao Xiao", "Zheng Liu", "Peitian Zhang", "Niklas Muennighoff"],
	year: 2023,
	venue: "arXiv",
	citations: 1800,
	keywords: ["BGE", "text embeddings", "contrastive learning", "reranking", "MTEB benchmark"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "E5: Text Embeddings by Weakly-Supervised Contrastive Pre-training",
	content: `We present E5, a family of state-of-the-art text embedding models that achieve strong performance through weakly-supervised contrastive pre-training. Our approach leverages web-scale text pairs without requiring manually annotated similarity labels, making it scalable and cost-effective.

	E5 models are trained using a two-stage approach: first, we perform contrastive pre-training on weakly-supervised text pairs extracted from web data; then, we fine-tune the models on a mixture of supervised datasets. This combination allows the models to learn both general semantic representations and task-specific knowledge.

	Our experimental evaluation shows that E5 models achieve competitive or superior performance compared to existing embedding models across various tasks including semantic similarity, information retrieval, and clustering. The weakly-supervised approach makes E5 particularly valuable for domains where labeled data is scarce, while still maintaining strong performance on standard benchmarks.`,
	source: "Microsoft Research, Wang et al.",
	sourceType: "research",
	url: "https://arxiv.org/abs/2212.03533",
	metadata: {
	authors: ["Liang Wang", "Nan Yang", "Xiaolong Huang", "Binxing Jiao", "Linjun Yang", "Daxin Jiang", "Rangan Majumder", "Furu Wei"],
	year: 2022,
	venue: "arXiv",
	citations: 2200,
	keywords: ["E5", "text embeddings", "weakly-supervised", "contrastive pre-training", "web-scale data"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	},
	{
	title: "FAISS: A Library for Efficient Similarity Search and Clustering of Dense Vectors",
	content: `FAISS is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. FAISS is written in C++ with complete wrappers for Python/numpy.

	The library implements several algorithms for approximate nearest neighbor search including LSH, PQ (Product Quantization), HNSW (Hierarchical Navigable Small World), and IVF (Inverted File). These algorithms offer different trade-offs between search accuracy, memory usage, and query speed, allowing users to choose the most appropriate method for their specific use case.

	FAISS has become the de facto standard for vector similarity search in both research and production environments. Its efficient implementations and GPU acceleration capabilities make it suitable for large-scale applications involving millions or billions of vectors. The library's flexibility and performance have made it a cornerstone of modern vector database systems and recommendation engines.`,
	source: "Facebook AI Research, Johnson et al.",
	sourceType: "system",
	url: "https://arxiv.org/abs/1702.08734",
	metadata: {
	authors: ["Jeff Johnson", "Matthijs Douze", "Hervé Jégou"],
	year: 2017,
	venue: "arXiv",
	citations: 5500,
	keywords: ["FAISS", "similarity search", "ANN", "vector indexing", "clustering"],
	theme: "RAG & Vector Databases"
	},
	embedding: null
	}
	];

	export async function seedDefaultDocuments(): Promise<void> {
	try {
	console.log('🌱 Seeding database with default research papers...');

	// Check if documents already exist
	const existingDocs = await storage.getDocuments(10, 0);
	if (existingDocs.length > 0) {
	console.log('📚 Database already contains documents, skipping seed.');
	return;
	}

	// Add each paper to the database
	for (const paper of defaultPapers) {
	try {
	await storage.createDocument({
	...paper,
	metadata: JSON.stringify(paper.metadata)
	} as any);

	console.log(`✅ Added: ${paper.title}`);
	} catch (error) {
	console.error(`❌ Failed to add ${paper.title}:`, error);
	}
	}

	console.log(`🎉 Successfully seeded ${defaultPapers.length} research papers!`);

	// Optionally build vector index for the seeded documents
	try {
	console.log('🔍 Building vector index for seeded documents...');
	// This would require the document processor, but we'll skip for now
	// to avoid circular dependencies during startup
	console.log('ℹ️ Vector index can be built manually via the UI');
	} catch (error) {
	console.log('⚠️ Vector index building skipped during seed:', error);
	}

	} catch (error) {
	console.error('❌ Error seeding default documents:', error);
	}
	}

	export { defaultPapers };