From a9b21c47d71ecf3cf7f930cd70a5dcdd6d340da5 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Mon, 4 Nov 2024 05:27:52 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 23309 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 23704 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..0e219175 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-11-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2403.04182v3","updated":"2024-11-01T17:57:01Z","published":"2024-03-07T03:24:34Z","title":"Regression-aware Inference with LLMs","summary":" Large language models (LLMs) have shown strong results on a range of\napplications, including regression and scoring tasks. Typically, one obtains\noutputs from an LLM via autoregressive sampling from the model's output\ndistribution. We show that this inference strategy can be sub-optimal for\ncommon regression and scoring evaluation metrics. As a remedy, we build on\nprior work on Minimum Bayes Risk decoding, and propose alternate inference\nstrategies that estimate the Bayes-optimal solution for regression and scoring\nmetrics in closed-form from sampled responses. We show that our proposal\nsignificantly improves over baselines across datasets and models.\n","authors":["Michal Lukasik","Harikrishna Narasimhan","Aditya Krishna Menon","Felix Yu","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2403.04182v3.pdf","comment":"EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2407.12749v2","updated":"2024-11-01T17:31:11Z","published":"2024-07-17T17:11:13Z","title":"HDLCopilot: Natural Language Exploration of Hardware Designs and\n Libraries","summary":" Hardware design workflows often involve working with Process Design Kits\n(PDKs) from various fabrication labs, each containing its own set of standard\ncell libraries optimized for metrics such as speed, power, or density. These\nlibraries include multiple views for information on timing and electrical\nproperties of cells, cell layout details, and process design rules. Engineers\ntypically navigate between the design and the target technology to make\ninformed decisions on different design scenarios, such as selecting specific\ngates for area optimization or enhancing critical path speed. Navigating this\ncomplex landscape to retrieve specific information about gates or design rules\nis often time-consuming and error-prone. To address this, we present\nHDLCopilot, a multi-agent collaborative framework powered by large language\nmodels that enables engineers to streamline interactions with hardware design\nand PDKs through natural language queries. HDLCopilot enables engineers to\nquickly access relevant information on gates and design rules, evaluate\ntradeoffs related to area, speed, and power in order to make informed decisions\nefficiently and accurately. The framework achieves an execution accuracy of\n96.33\\% on a diverse set of complex natural language queries. 
HDLCopilot\npositions itself as a powerful assistant in hardware design workflows,\nenhancing productivity and reducing potential human errors.\n","authors":["Manar Abdelatty","Jacob Rosenstein","Sherief Reda"],"pdf_url":"https://arxiv.org/pdf/2407.12749v2.pdf","comment":"7 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.01721v3","updated":"2024-11-01T17:12:53Z","published":"2024-06-03T18:27:44Z","title":"DuQuant: Distributing Outliers via Dual Transformation Makes Stronger\n Quantized LLMs","summary":" Quantization of large language models (LLMs) faces significant challenges,\nparticularly due to the presence of outlier activations that impede efficient\nlow-bit representation. Traditional approaches predominantly address Normal\nOutliers, which are activations across all tokens with relatively large\nmagnitudes. However, these methods struggle with smoothing Massive Outliers\nthat display significantly larger values, which leads to significant\nperformance degradation in low-bit quantization. In this paper, we introduce\nDuQuant, a novel approach that utilizes rotation and permutation\ntransformations to more effectively mitigate both massive and normal outliers.\nFirst, DuQuant starts by constructing the rotation matrix, using specific\noutlier dimensions as prior knowledge, to redistribute outliers to adjacent\nchannels by block-wise rotation. Second, we further employ a zigzag permutation\nto balance the distribution of outliers across blocks, thereby reducing\nblock-wise variance. A subsequent rotation further smooths the activation\nlandscape, enhancing model performance. DuQuant simplifies the quantization\nprocess and excels in managing outliers, outperforming the state-of-the-art\nbaselines across various sizes and types of LLMs on multiple tasks, even with\n4-bit weight-activation quantization. Our code is available at\nhttps://github.com/Hsu1023/DuQuant.\n","authors":["Haokun Lin","Haobo Xu","Yichen Wu","Jingzhi Cui","Yingtao Zhang","Linzhan Mou","Linqi Song","Zhenan Sun","Ying Wei"],"pdf_url":"https://arxiv.org/pdf/2406.01721v3.pdf","comment":"NeurIPS 2024 Oral, Website at https://duquant.github.io"},{"id":"http://arxiv.org/abs/2410.19499v2","updated":"2024-11-01T16:45:29Z","published":"2024-10-25T11:58:12Z","title":"Introducing MAPO: Momentum-Aided Gradient Descent Prompt Optimization","summary":" Momentum-Aided Prompt Optimization (MAPO) enhances the efficiency and\nefficacy of prompt optimization for Large Language Models (LLMs). Building on\nProTeGi, MAPO uses positive natural language \"gradients\" and a momentum-based\nextension to refine prompts effectively. By tracking gradient history, MAPO\navoids local minima and oscillations. It also utilizes beam search and an Upper\nConfidence Bound (UCB) algorithm for balanced candidate expansion and\nselection. Benchmark testing shows that MAPO achieves faster convergence time\nwith fewer API calls and higher F1 scores than ProTeGi, proving it to be a robust\nand scalable solution for automated prompt engineering in LLMs.\n","authors":["Anthony Cui","Pranav Nandyalam","Ethan Cheung","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.19499v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04559v4","updated":"2024-11-01T16:10:41Z","published":"2024-02-07T03:37:19Z","title":"Can Large Language Model Agents Simulate Human Trust Behavior?","summary":" Large Language Model (LLM) agents have been increasingly adopted as\nsimulation tools to model humans in social science and role-playing\napplications. 
However, one fundamental question remains: can LLM agents really\nsimulate human behavior? In this paper, we focus on one critical and elemental\nbehavior in human interactions, trust, and investigate whether LLM agents can\nsimulate human trust behavior. We first find that LLM agents generally exhibit\ntrust behavior, referred to as agent trust, under the framework of Trust Games,\nwhich are widely recognized in behavioral economics. Then, we discover that\nGPT-4 agents manifest high behavioral alignment with humans in terms of trust\nbehavior, indicating the feasibility of simulating human trust behavior with\nLLM agents. In addition, we probe the biases of agent trust and differences in\nagent trust towards other LLM agents and humans. We also explore the intrinsic\nproperties of agent trust under conditions including external manipulations and\nadvanced reasoning strategies. Our study provides new insights into the\nbehaviors of LLM agents and the fundamental analogy between LLMs and humans\nbeyond value alignment. We further illustrate broader implications of our\ndiscoveries for applications where trust is paramount.\n","authors":["Chengxing Xie","Canyu Chen","Feiran Jia","Ziyu Ye","Shiyang Lai","Kai Shu","Jindong Gu","Adel Bibi","Ziniu Hu","David Jurgens","James Evans","Philip Torr","Bernard Ghanem","Guohao Li"],"pdf_url":"https://arxiv.org/pdf/2402.04559v4.pdf","comment":"Accepted to Proceedings of NeurIPS 2024. The first two authors\n contributed equally. 10 pages for main paper, 56 pages including appendix.\n Project website: https://agent-trust.camel-ai.org"},{"id":"http://arxiv.org/abs/2410.24198v2","updated":"2024-11-01T16:06:10Z","published":"2024-10-31T17:55:13Z","title":"SelfCodeAlign: Self-Alignment for Code Generation","summary":" Instruction tuning is a supervised fine-tuning approach that significantly\nimproves the ability of large language models (LLMs) to follow human\ninstructions. We propose SelfCodeAlign, the first fully transparent and\npermissive pipeline for self-aligning code LLMs without extensive human\nannotations or distillation. SelfCodeAlign employs the same base model for\ninference throughout the data generation process. It first extracts diverse\ncoding concepts from high-quality seed snippets to generate new tasks. It then\nsamples multiple responses per task, pairs each with test cases, and validates\nthem in a sandbox environment. Finally, passing examples are selected for\ninstruction tuning. In our primary experiments, we use SelfCodeAlign with\nCodeQwen1.5-7B to generate a dataset of 74k instruction-response pairs.\nFinetuning on this dataset leads to a model that achieves a 67.1 pass@1 on\nHumanEval+, surpassing CodeLlama-70B-Instruct despite being ten times smaller.\nAcross all benchmarks, this finetuned model consistently outperforms the\noriginal version trained with OctoPack, the previous state-of-the-art method\nfor instruction tuning without human annotations or distillation. Additionally,\nwe show that SelfCodeAlign is effective across LLMs of various sizes, from 3B\nto 33B, and that the base models can benefit more from alignment with their own\ndata distribution. We further validate each component's effectiveness in our\npipeline, showing that SelfCodeAlign outperforms both direct distillation from\nGPT-4o and leading GPT-3.5-based distillation methods, such as OSS-Instruct and\nEvol-Instruct. 
SelfCodeAlign has also led to the creation of\nStarCoder2-Instruct, the first fully transparent, permissively licensed, and\nself-aligned code LLM that achieves state-of-the-art coding performance.\n","authors":["Yuxiang Wei","Federico Cassano","Jiawei Liu","Yifeng Ding","Naman Jain","Zachary Mueller","Harm de Vries","Leandro von Werra","Arjun Guha","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.24198v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.08964v2","updated":"2024-11-01T15:53:08Z","published":"2024-10-11T16:32:05Z","title":"Language Imbalance Driven Rewarding for Multilingual Self-improving","summary":" Large Language Models (LLMs) have achieved state-of-the-art performance\nacross numerous tasks. However, these advancements have predominantly benefited\n\"first-class\" languages such as English and Chinese, leaving many other\nlanguages underrepresented. This imbalance, while limiting broader\napplications, generates a natural preference ranking between languages,\noffering an opportunity to bootstrap the multilingual capabilities of LLM in a\nself-improving manner. Thus, we propose $\\textit{Language Imbalance Driven\nRewarding}$, where the inherent imbalance between dominant and non-dominant\nlanguages within LLMs is leveraged as a reward signal. Iterative DPO training\ndemonstrates that this approach not only enhances LLM performance in\nnon-dominant languages but also improves the dominant language's capacity,\nthereby yielding an iterative reward signal. Fine-tuning\nMeta-Llama-3-8B-Instruct over two iterations of this approach results in\ncontinuous improvements in multilingual performance across\ninstruction-following and arithmetic reasoning tasks, evidenced by an average\nimprovement of 7.46% win rate on the X-AlpacaEval leaderboard and 13.9%\naccuracy on the MGSM benchmark. This work serves as an initial exploration,\npaving the way for multilingual self-improvement of LLMs.\n","authors":["Wen Yang","Junhong Wu","Chen Wang","Chengqing Zong","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.08964v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2410.18050v2","updated":"2024-11-01T15:36:59Z","published":"2024-10-23T17:24:58Z","title":"LongRAG: A Dual-Perspective Retrieval-Augmented Generation Paradigm for\n Long-Context Question Answering","summary":" Long-Context Question Answering (LCQA), a challenging task, aims to reason\nover long-context documents to yield accurate answers to questions. Existing\nlong-context Large Language Models (LLMs) for LCQA often struggle with the\n\"lost in the middle\" issue. Retrieval-Augmented Generation (RAG) mitigates this\nissue by providing external factual evidence. However, its chunking strategy\ndisrupts the global long-context information, and its low-quality retrieval in\nlong contexts hinders LLMs from identifying effective factual details due to\nsubstantial noise. To this end, we propose LongRAG, a general,\ndual-perspective, and robust LLM-based RAG system paradigm for LCQA to enhance\nRAG's understanding of complex long-context knowledge (i.e., global information\nand factual details). We design LongRAG as a plug-and-play paradigm,\nfacilitating adaptation to various domains and LLMs. Extensive experiments on\nthree multi-hop datasets demonstrate that LongRAG significantly outperforms\nlong-context LLMs (up by 6.94%), advanced RAG (up by 6.16%), and Vanilla RAG\n(up by 17.25%). 
Furthermore, we conduct quantitative ablation studies and\nmulti-dimensional analyses, highlighting the effectiveness of the system's\ncomponents and fine-tuning strategies. Data and code are available at\nhttps://github.com/QingFei1/LongRAG.\n","authors":["Qingfei Zhao","Ruobing Wang","Yukuo Cen","Daren Zha","Shicheng Tan","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2410.18050v2.pdf","comment":"EMNLP 2024 Main, Final"},{"id":"http://arxiv.org/abs/2404.15420v3","updated":"2024-11-01T14:56:52Z","published":"2024-04-23T18:10:42Z","title":"XC-Cache: Cross-Attending to Cached Context for Efficient LLM Inference","summary":" In-context learning (ICL) approaches typically leverage prompting to\ncondition decoder-only language model generation on reference information.\nJust-in-time processing of a context is inefficient due to the quadratic cost\nof self-attention operations, and caching is desirable. However, caching\ntransformer states can easily require almost as much space as the model\nparameters. When the right context isn't known in advance, caching ICL can be\nchallenging. This work addresses these limitations by introducing models that,\ninspired by the encoder-decoder architecture, use cross-attention to condition\ngeneration on reference text without the prompt. More precisely, we leverage\npre-trained decoder-only models and only train a small number of added layers.\nWe use Question-Answering (QA) as a testbed to evaluate the ability of our\nmodels to perform conditional generation and observe that they outperform ICL,\nare comparable to fine-tuned prompted LLMs, and drastically reduce the space\nfootprint relative to standard KV caching by two orders of magnitude.\n","authors":["João Monteiro","Étienne Marcotte","Pierre-André Noël","Valentina Zantedeschi","David Vázquez","Nicolas Chapados","Christopher Pal","Perouz Taslakian"],"pdf_url":"https://arxiv.org/pdf/2404.15420v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19381v3","updated":"2024-11-01T14:51:38Z","published":"2024-09-28T15:12:55Z","title":"INC-Math: Integrating Natural Language and Code for Enhanced\n Mathematical Reasoning in Large Language Models","summary":" Large Language Models (LLMs) are commonly used to generate solutions for\nmathematical reasoning problems in the following formats: natural language,\ncode, or a combination of both. In this paper, we explore fundamental questions\nrelated to solving mathematical reasoning problems using natural language and\ncode with state-of-the-art LLMs, including GPT-4o-mini and LLama-3.1-8b-Turbo.\nOur findings show that LLMs are better at reasoning in natural language\ncompared to code. Additionally, although natural language and code serve as\ncomplementary forms of reasoning, they can affect each other in a negative way\nin certain scenarios. These insights motivate our development of a new\nprompting method, INC-Math, which leverages an LLM to dynamically select the\nmost appropriate reasoning form, resulting in improved performance over\ncomparable baselines with GPT-4o-mini.\n","authors":["Xuyuan Xiong","Simeng Han","Ziyue Zhou","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2409.19381v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14940v3","updated":"2024-11-01T14:49:44Z","published":"2024-10-19T02:07:33Z","title":"Nova: A Practical and Advanced Alignment","summary":" We introduce Nova, a suite of practical alignment techniques employed in a\nseries of empirically validated high-performing models. 
This represents the\nfirst comprehensive account of alignment methodologies, offering valuable\ninsights for advancing AI research. We investigate the critical components that\nenhance model performance during the alignment process, including optimization\nmethods, data strategies, capability enhancements, and evaluation processes.\nThe process spans three key stages: Prompt Augmentation System (PAS), Supervised\nFine-Tuning (SFT), and Preference Alignment. The problems encountered, the\nsolutions applied, and the improvements made are thoroughly recorded.\n Through comparisons across well-established benchmarks, we highlight the\ntechnological advancements enabled by Nova Alignment. Importantly,\nQwen2-Nova-72B and Llama3-PBM-Nova-70B are instruct versions of the Qwen2-72B\nand Llama-3-70B base models, optimized through Nova. The Nova models show\nsignificant core improvements, with user experience gains of 17% to 28%, and\nexcel on specialized benchmarks. In open-source benchmark evaluations, both\nQwen2-Nova-72B and Llama3-PBM-Nova-70B consistently outperform their respective\nofficial instruct versions across nearly all datasets. This report aims to\nclarify the key technologies behind the alignment process, fostering a deeper\nunderstanding within the community. The Llama3-PBM-Nova-70B model is available at\nhttps://huggingface.co/PKU-Baichuan-MLSystemLab/Llama3-PBM-Nova-70B.\n","authors":["Mingan Lin","Fan Yang","Yanjun Shen","Haoze Sun","Tianpeng Li","Tao Zhang","Chenzheng Zhu","Tao Zhang","Miao Zheng","Xu Li","Yijie Zhou","Mingyang Chen","Yanzhao Qin","Youquan Li","Hao Liang","Fei Li","Yadong Li","Mang Wang","Guosheng Dong","Kun Fang","Jianhua Xu","Bin Cui","Wentao Zhang","Zenan Zhou","Weipeng Chen"],"pdf_url":"https://arxiv.org/pdf/2410.14940v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18760v4","updated":"2024-11-01T14:37:37Z","published":"2023-11-30T18:02:44Z","title":"TaskBench: Benchmarking Large Language Models for Task Automation","summary":" In recent years, the remarkable progress of large language models (LLMs) has\nsparked interest in task automation, which involves decomposing complex tasks\ndescribed by user instructions into sub-tasks and invoking external tools to\nexecute them, playing a central role in autonomous agents. However, there is a\nlack of systematic and standardized benchmarks to promote the development of\nLLMs in task automation. To address this, we introduce TaskBench, a\ncomprehensive framework to evaluate the capability of LLMs in task automation.\nSpecifically, task automation can be divided into three critical stages: task\ndecomposition, tool selection, and parameter prediction. To tackle the\ncomplexities inherent in these stages, we introduce the concept of Tool Graph\nto represent decomposed tasks and adopt a back-instruct method to generate\nhigh-quality user instructions. We propose TaskEval, a multi-faceted evaluation\nmethodology that assesses LLM performance across these three stages. Our\napproach combines automated construction with rigorous human verification,\nensuring high consistency with human evaluation. Experimental results\ndemonstrate that TaskBench effectively reflects the capabilities of various\nLLMs in task automation. It provides insights into model performance across\ndifferent task complexities and domains, pushing the boundaries of what current\nmodels can achieve. 
TaskBench offers a scalable, adaptable, and reliable\nbenchmark for advancing LLM-based autonomous agents.\n","authors":["Yongliang Shen","Kaitao Song","Xu Tan","Wenqi Zhang","Kan Ren","Siyu Yuan","Weiming Lu","Dongsheng Li","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2311.18760v4.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23528v2","updated":"2024-11-01T14:27:42Z","published":"2024-10-31T00:29:52Z","title":"Large Language Models for Patient Comments Multi-Label Classification","summary":" Patient experience and care quality are crucial for a hospital's\nsustainability and reputation. The analysis of patient feedback offers valuable\ninsight into patient satisfaction and outcomes. However, the unstructured\nnature of these comments poses challenges for traditional machine learning\nmethods following a supervised learning paradigm. This is due to the\nunavailability of labeled data and the nuances these texts encompass. This\nresearch explores leveraging Large Language Models (LLMs) in conducting\nMulti-label Text Classification (MLTC) of inpatient comments shared after a\nstay in the hospital. GPT-4 Turbo was leveraged to conduct the classification.\nHowever, given the sensitive nature of patients' comments, a security layer is\nintroduced before feeding the data to the LLM through a Protected Health\nInformation (PHI) detection framework, which ensures patients'\nde-identification. Additionally, using the prompt engineering framework,\nzero-shot learning, in-context learning, and chain-of-thought prompting were\nexperimented with. Results demonstrate that GPT-4 Turbo, whether following a\nzero-shot or few-shot setting, outperforms traditional methods and Pre-trained\nLanguage Models (PLMs) and achieves the highest overall performance with an\nF1-score of 76.12% and a weighted F1-score of 73.61% followed closely by the\nfew-shot learning results. Subsequently, the results' association with other\npatient experience structured variables (e.g., rating) was conducted. The study\nenhances MLTC through the application of LLMs, offering healthcare\npractitioners an efficient method to gain deeper insights into patient feedback\nand deliver prompt, appropriate responses.\n","authors":["Hajar Sakai","Sarah S. Lam","Mohammadsadegh Mikaeili","Joshua Bosire","Franziska Jovin"],"pdf_url":"https://arxiv.org/pdf/2410.23528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10691v2","updated":"2024-11-01T14:08:31Z","published":"2024-07-15T13:04:09Z","title":"$\\texttt{MixGR}$: Enhancing Retriever Generalization for Scientific\n Domain through Complementary Granularity","summary":" Recent studies show the growing significance of document retrieval in the\ngeneration of LLMs, i.e., RAG, within the scientific domain by bridging their\nknowledge gap. However, dense retrievers often struggle with domain-specific\nretrieval and complex query-document relationships, particularly when query\nsegments correspond to various parts of a document. To alleviate such prevalent\nchallenges, this paper introduces $\\texttt{MixGR}$, which improves dense\nretrievers' awareness of query-document matching across various levels of\ngranularity in queries and documents using a zero-shot approach.\n$\\texttt{MixGR}$ fuses various metrics based on these granularities to a united\nscore that reflects a comprehensive query-document similarity. 
Our experiments\ndemonstrate that $\texttt{MixGR}$ outperforms previous document retrieval by\n24.7%, 9.8%, and 6.9% on nDCG@5 with unsupervised, supervised, and LLM-based\nretrievers, respectively, averaged on queries containing multiple subqueries\nfrom five scientific retrieval datasets. Moreover, the efficacy of two\ndownstream scientific question-answering tasks highlights the advantage of\n$\texttt{MixGR}$ to boost the application of LLMs in the scientific domain. The\ncode and experimental datasets are available.\n","authors":["Fengyu Cai","Xinran Zhao","Tong Chen","Sihao Chen","Hongming Zhang","Iryna Gurevych","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2407.10691v2.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2406.00976v2","updated":"2024-11-01T13:54:48Z","published":"2024-06-03T04:16:30Z","title":"Generative Pre-trained Speech Language Model with Efficient Hierarchical\n Transformer","summary":" While recent advancements in speech language models have achieved significant\nprogress, they face remarkable challenges in modeling the long acoustic\nsequences of neural audio codecs. In this paper, we introduce\n\textbf{G}enerative \textbf{P}re-trained \textbf{S}peech \textbf{T}ransformer\n(GPST), a hierarchical transformer designed for efficient speech language\nmodeling. GPST quantizes audio waveforms into two distinct types of discrete\nspeech representations and integrates them within a hierarchical transformer\narchitecture, allowing for a unified one-stage generation process and enhancing\nHi-Res audio generation capabilities. By training on large corpora of speeches\nin an end-to-end unsupervised manner, GPST can generate syntactically\nconsistent speech with diverse speaker identities. Given a brief 3-second\nprompt, GPST can produce natural and coherent personalized speech,\ndemonstrating in-context learning abilities. Moreover, our approach can be\neasily extended to spoken cross-lingual speech generation by incorporating\nmulti-lingual semantic tokens and universal acoustic tokens. Experimental\nresults indicate that GPST significantly outperforms the existing speech\nlanguage models in terms of word error rate, speech quality, and speaker\nsimilarity. The code is available at \url{https://github.com/youngsheen/GPST}.\n","authors":["Yongxin Zhu","Dan Su","Liqiang He","Linli Xu","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2406.00976v2.pdf","comment":"Accept in ACL2024-main"},{"id":"http://arxiv.org/abs/2405.13845v3","updated":"2024-11-01T13:25:52Z","published":"2024-05-22T17:13:49Z","title":"Semantic Density: Uncertainty Quantification for Large Language Models\n through Confidence Measurement in Semantic Space","summary":" With the widespread application of Large Language Models (LLMs) to various\ndomains, concerns regarding the trustworthiness of LLMs in safety-critical\nscenarios have been raised, due to their unpredictable tendency to hallucinate\nand generate misinformation. Existing LLMs do not have an inherent\nfunctionality to provide the users with an uncertainty/confidence metric for\neach response they generate, making it difficult to evaluate trustworthiness.\nAlthough several studies aim to develop uncertainty quantification methods for\nLLMs, they have fundamental limitations, such as being restricted to\nclassification tasks, requiring additional training and data, considering only\nlexical instead of semantic information, and being prompt-wise but not\nresponse-wise. 
A new framework is proposed in this paper to address these\nissues. Semantic density extracts uncertainty/confidence information for each\nresponse from a probability distribution perspective in semantic space. It has\nno restriction on task types and is \"off-the-shelf\" for new models and tasks.\nExperiments on seven state-of-the-art LLMs, including the latest Llama 3 and\nMixtral-8x22B models, on four free-form question-answering benchmarks\ndemonstrate the superior performance and robustness of semantic density\ncompared to prior approaches.\n","authors":["Xin Qiu","Risto Miikkulainen"],"pdf_url":"https://arxiv.org/pdf/2405.13845v3.pdf","comment":"Accepted to Neurips 2024"},{"id":"http://arxiv.org/abs/2312.05061v4","updated":"2024-11-01T12:40:45Z","published":"2023-12-08T14:30:08Z","title":"LaCour!: Enabling Research on Argumentation in Hearings of the European\n Court of Human Rights","summary":" Why does an argument end up in the final court decision? Was it deliberated\nor questioned during the oral hearings? Was there something in the hearings\nthat triggered a particular judge to write a dissenting opinion? Despite the\navailability of the final judgments of the European Court of Human Rights\n(ECHR), none of these legal research questions can currently be answered as the\nECHR's multilingual oral hearings are not transcribed, structured, or\nspeaker-attributed. We address this fundamental gap by presenting LaCour!, the\nfirst corpus of textual oral arguments of the ECHR, consisting of 154 full\nhearings (2.1 million tokens from over 267 hours of video footage) in English,\nFrench, and other court languages, each linked to the corresponding final\njudgment documents. In addition to the transcribed and partially manually\ncorrected text from the video, we provide sentence-level timestamps and\nmanually annotated role and language labels. We also showcase LaCour! in a set\nof preliminary experiments that explore the interplay between questions and\ndissenting opinions. Apart from the use cases in legal NLP, we hope that law\nstudents or other interested parties will also use LaCour! as a learning\nresource, as it is freely available in various formats at\nhttps://huggingface.co/datasets/TrustHLT/LaCour.\n","authors":["Lena Held","Ivan Habernal"],"pdf_url":"https://arxiv.org/pdf/2312.05061v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22932v2","updated":"2024-11-01T12:37:10Z","published":"2024-10-30T11:38:13Z","title":"Multi-Agent Large Language Models for Conversational Task-Solving","summary":" In an era where single large language models have dominated the landscape of\nartificial intelligence for years, multi-agent systems arise as new\nprotagonists in conversational task-solving. While previous studies have\nshowcased their potential in reasoning tasks and creative endeavors, an\nanalysis of their limitations concerning the conversational paradigms and the\nimpact of individual agents is missing. It remains unascertained how\nmulti-agent discussions perform across tasks of varying complexity and how the\nstructure of these conversations influences the process. To fill that gap, this\nwork systematically evaluates multi-agent systems across various discussion\nparadigms, assessing their strengths and weaknesses in both generative tasks\nand question-answering tasks. 
Alongside the experiments, I propose a taxonomy\nof 20 multi-agent research studies from 2022 to 2024, followed by the\nintroduction of a framework for deploying multi-agent LLMs in conversational\ntask-solving. I demonstrate that while multi-agent systems excel in complex\nreasoning tasks, outperforming a single model by leveraging expert personas,\nthey fail on basic tasks. Concretely, I identify three challenges that arise:\n1) While longer discussions enhance reasoning, agents fail to maintain\nconformity to strict task requirements, which leads to problem drift, making\nshorter conversations more effective for basic tasks. 2) Prolonged discussions\nrisk alignment collapse, raising new safety concerns for these systems. 3) I\nshowcase discussion monopolization through long generations, posing the problem\nof fairness in decision-making for tasks like summarization. This work uncovers\nboth the potential and challenges that arise with multi-agent interaction and\nvarying conversational paradigms, providing insights into how future research\ncould improve the efficiency, performance, and safety of multi-agent LLMs.\n","authors":["Jonas Becker"],"pdf_url":"https://arxiv.org/pdf/2410.22932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10188v5","updated":"2024-11-01T10:57:37Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models,\nespecially for long video understanding. We introduce LongVILA, a full-stack\nsolution for long-context visual-language models by co-designing the algorithm\nand system. For model training, we upgrade existing VLMs to support long video\nunderstanding by incorporating two additional stages, i.e., long context\nextension and long video supervised fine-tuning. However, training on long\nvideo is computationally and memory intensive. We introduce the long-context\nMulti-Modal Sequence Parallelism (MM-SP) system that efficiently parallelizes\nlong video training and inference, enabling 2M context length training on 256\nGPUs without any gradient checkpointing. LongVILA efficiently extends the\nnumber of video frames of VILA from 8 to 2048, improving the long video\ncaptioning score from 2.00 to 3.26 (out of 5), achieving 99.8% accuracy in\n6,000-frame (more than 1 million tokens) video needle-in-a-haystack.\nLongVILA-7B demonstrates strong accuracy on the VideoMME benchmark, i.e., 61.8%\nwith subtitle. Besides, MM-SP is 2.1x - 5.7x faster than ring style sequence\nparallelism and 1.1x - 1.4x faster than Megatron with a hybrid context and\ntensor parallelism. Moreover, it seamlessly integrates with Hugging Face\nTransformers.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v5.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2311.08385v4","updated":"2024-11-01T10:28:12Z","published":"2023-11-14T18:48:27Z","title":"Aligning Large Language Models with Human Opinions through Persona\n Selection and Value--Belief--Norm Reasoning","summary":" Reasoning and predicting human opinions with large language models (LLMs) is\nessential yet challenging. 
Current methods employ role-playing with personae\nbut face two major issues: LLMs are sensitive to even a single irrelevant\npersona, skewing predictions by up to 30%, and LLMs fail to reason\nstrategically over personae. We propose Chain-of-Opinion (COO), a simple\nfour-step solution modeling which and how to reason with personae, inspired by\nthe Value--Belief--Norm (VBN) theory. COO differentiates between explicit\npersonae (demographics and ideology) and implicit personae (historical\nopinions), and involves: (1) filtering irrelevant attributes from explicit\npersonae, (2) ranking implicit personae into a preferential list for selecting\ntop-k, (3) applying novel VBN reasoning to extract user environmental and\npersonal value, belief, and norm variables for accurate and reliable\npredictions, and (4) iterating VBN reasoning with progressively larger lists of\nimplicit personae to handle potential persona insufficiency. COO efficiently\nachieves new state-of-the-art opinion prediction via prompting with only 5\ninference calls, improving prior techniques by up to 4%. Notably, fine-tuning\nLMs with COO data results in significantly better opinion-aligned models, by up\nto 23%.\n","authors":["Do Xuan Long","Kenji Kawaguchi","Min-Yen Kan","Nancy F. Chen"],"pdf_url":"https://arxiv.org/pdf/2311.08385v4.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2405.11459v3","updated":"2024-11-01T09:55:48Z","published":"2024-05-19T06:00:36Z","title":"Du-IN: Discrete units-guided mask modeling for decoding speech from\n Intracranial Neural signals","summary":" Invasive brain-computer interfaces with Electrocorticography (ECoG) have\nshown promise for high-performance speech decoding in medical applications, but\nless damaging methods like intracranial stereo-electroencephalography (sEEG)\nremain underexplored. With rapid advances in representation learning,\nleveraging abundant recordings to enhance speech decoding is increasingly\nattractive. However, popular methods often pre-train temporal models based on\nbrain-level tokens, overlooking that brain activities in different regions are\nhighly desynchronized during tasks. Alternatively, they pre-train\nspatial-temporal models based on channel-level tokens but fail to evaluate them\non challenging tasks like speech decoding, which requires intricate processing\nin specific language-related areas. To address this issue, we collected a\nwell-annotated Chinese word-reading sEEG dataset targeting language-related\nbrain networks from 12 subjects. Using this benchmark, we developed the Du-IN\nmodel, which extracts contextual embeddings based on region-level tokens\nthrough discrete codex-guided mask modeling. Our model achieves\nstate-of-the-art performance on the 61-word classification task, surpassing all\nbaselines. Model comparisons and ablation studies reveal that our design\nchoices, including (i) temporal modeling based on region-level tokens by\nutilizing 1D depthwise convolution to fuse channels in the ventral sensorimotor\ncortex (vSMC) and superior temporal gyrus (STG) and (ii) self-supervision\nthrough discrete codex-guided mask modeling, significantly contribute to this\nperformance. 
Overall, our approach -- inspired by neuroscience findings and\ncapitalizing on region-level representations from specific brain regions -- is\nsuitable for invasive brain modeling and represents a promising neuro-inspired\nAI approach in brain-computer interfaces.\n","authors":["Hui Zheng","Hai-Teng Wang","Wei-Bang Jiang","Zhong-Tao Chen","Li He","Pei-Yang Lin","Peng-Hu Wei","Guo-Guang Zhao","Yun-Zhe Liu"],"pdf_url":"https://arxiv.org/pdf/2405.11459v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14716v3","updated":"2024-11-01T09:38:59Z","published":"2024-10-11T13:17:19Z","title":"A Systematic Survey on Large Language Models for Algorithm Design","summary":" Algorithm Design (AD) is crucial for effective problem-solving across various\ndomains. The advent of Large Language Models (LLMs) has notably enhanced the\nautomation and innovation within this field, offering new perspectives and\npromising solutions. Over the past three years, the integration of LLMs into AD\n(LLM4AD) has seen substantial progress, with applications spanning\noptimization, machine learning, mathematical reasoning, and scientific\ndiscovery. Given the rapid advancements and expanding scope of this field, a\nsystematic review is both timely and necessary. This paper provides a\nsystematic review of LLM4AD. First, we offer an overview and summary of\nexisting studies. Then, we introduce a taxonomy and review the literature\nacross four dimensions: the roles of LLMs, search methods, prompt methods, and\napplication domains with a discussion of potential and achievements of LLMs in\nAD. Finally, we identify current challenges and highlight several promising\ndirections for future research.\n","authors":["Fei Liu","Yiming Yao","Ping Guo","Zhiyuan Yang","Zhe Zhao","Xi Lin","Xialiang Tong","Mingxuan Yuan","Zhichao Lu","Zhenkun Wang","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.14716v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05019v2","updated":"2024-11-01T08:55:43Z","published":"2024-04-07T17:17:23Z","title":"Shortcut-connected Expert Parallelism for Accelerating\n Mixture-of-Experts","summary":" Expert parallelism has been introduced as a strategy to distribute the\ncomputational workload of sparsely-gated mixture-of-experts (MoE) models across\nmultiple computing devices, facilitating the execution of these increasingly\nlarge-scale models. However, the All-to-All communication intrinsic to expert\nparallelism constitutes a significant overhead, diminishing the MoE models'\nefficiency. Current optimization approaches offer some relief, yet they are\nconstrained by the sequential interdependence of communication and computation\noperations. To address this limitation, we present a novel shortcut-connected\nMoE (ScMoE) architecture with an overlapping parallel strategy, which\neffectively decouples communication from its conventional sequence, allowing\nfor a substantial overlap of 70% to 100% with computation. When compared with\nthe prevalent top-2 MoE architecture, ScMoE demonstrates training speed\nimprovements of 30% and 11%, and inference improvements of 40% and 15%, in our\ndistributed environments with PCIe and NVLink hardware, respectively, where\ncommunication constitutes 60% and 15% of the total MoE time consumption.\nBuilding on the ScMoE architecture, we further implement an expert offloading\nstrategy to facilitate memory-limited inference, optimizing latency through the\noverlap of expert migration. 
Additionally, extensive experiments and\ntheoretical analyses indicate that ScMoE not only achieves comparable but in\nsome instances surpasses the model quality of existing approaches.\n","authors":["Weilin Cai","Juyong Jiang","Le Qin","Junwei Cui","Sunghun Kim","Jiayi Huang"],"pdf_url":"https://arxiv.org/pdf/2404.05019v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02657v2","updated":"2024-11-01T08:52:18Z","published":"2024-06-04T17:45:26Z","title":"Block Transformer: Global-to-Local Language Modeling for Fast Inference","summary":" We introduce the Block Transformer which adopts hierarchical global-to-local\nmodeling to autoregressive transformers to mitigate the inference bottlenecks\nassociated with self-attention. Self-attention requires the key-value (KV)\ncache of all previous sequences to be retrieved from memory at every decoding\nstep to retrieve context information, leading to two primary bottlenecks during\nbatch inference. First, there is a significant delay in obtaining the first\ntoken, as the information of the entire prompt must first be processed to\nprefill the KV cache. Second, computation of subsequent tokens is bottlenecked\nby the high memory I/O demand of fetching the entire KV cache, which grows\nlinearly with sequence length, incurring quadratic memory reads overall. We\ndesign the Block Transformer to strategically mitigate these costs, by\nincorporating coarsity and locality into an integrated global-to-local\narchitecture. At the lower layers, we aggregate tokens into fixed size blocks\nto apply attention across the entire sequence at coarse-grained detail, to\ncapture the global context while minimizing KV cache overhead. At upper layers,\nwe apply attention within each block to decode individual tokens, to model\nfine-grained details with a lightweight local KV cache. We pretrain vanilla and\nBlock Transformers from scratch and demonstrate that Block Transformers reach\n10--20x inference throughput compared to vanilla transformers with equivalent\nperplexity and zero-shot task performance. Code is available at\nhttps://github.com/itsnamgyu/block-transformer.\n","authors":["Namgyu Ho","Sangmin Bae","Taehyeon Kim","Hyunjik Jo","Yireun Kim","Tal Schuster","Adam Fisch","James Thorne","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2406.02657v2.pdf","comment":"37 pages, 24 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.20012v2","updated":"2024-11-01T08:40:28Z","published":"2024-09-30T07:14:31Z","title":"Towards Robust Multimodal Sentiment Analysis with Incomplete Data","summary":" The field of Multimodal Sentiment Analysis (MSA) has recently witnessed an\nemerging direction seeking to tackle the issue of data incompleteness.\nRecognizing that the language modality typically contains dense sentiment\ninformation, we consider it as the dominant modality and present an innovative\nLanguage-dominated Noise-resistant Learning Network (LNLN) to achieve robust\nMSA. The proposed LNLN features a dominant modality correction (DMC) module and\ndominant modality based multimodal learning (DMML) module, which enhances the\nmodel's robustness across various noise scenarios by ensuring the quality of\ndominant modality representations. 
Aside from the methodical design, we perform\ncomprehensive experiments under random data missing scenarios, utilizing\ndiverse and meaningful settings on several popular datasets (\\textit{e.g.,}\nMOSI, MOSEI, and SIMS), providing additional uniformity, transparency, and\nfairness compared to existing evaluations in the literature. Empirically, LNLN\nconsistently outperforms existing baselines, demonstrating superior performance\nacross these challenging and extensive evaluation metrics.\n","authors":["Haoyu Zhang","Wenbin Wang","Tianshu Yu"],"pdf_url":"https://arxiv.org/pdf/2409.20012v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2309.08648v4","updated":"2024-11-01T08:06:25Z","published":"2023-09-15T13:15:54Z","title":"MAPLE: Mobile App Prediction Leveraging Large Language Model Embeddings","summary":" In recent years, predicting mobile app usage has become increasingly\nimportant for areas like app recommendation, user behaviour analysis, and\nmobile resource management. Existing models, however, struggle with the\nheterogeneous nature of contextual data and the user cold start problem. This\nstudy introduces a novel prediction model, Mobile App Prediction Leveraging\nLarge Language Model Embeddings (MAPLE), which employs Large Language Models\n(LLMs) and installed app similarity to overcome these challenges. MAPLE\nutilises the power of LLMs to process contextual data and discern intricate\nrelationships within it effectively. Additionally, we explore the use of\ninstalled app similarity to address the cold start problem, facilitating the\nmodelling of user preferences and habits, even for new users with limited\nhistorical data. In essence, our research presents MAPLE as a novel, potent,\nand practical approach to app usage prediction, making significant strides in\nresolving issues faced by existing models. MAPLE stands out as a comprehensive\nand effective solution, setting a new benchmark for more precise and\npersonalised app usage predictions. In tests on two real-world datasets, MAPLE\nsurpasses contemporary models in both standard and cold start scenarios. These\noutcomes validate MAPLE's capacity for precise app usage predictions and its\nresilience against the cold start problem. This enhanced performance stems from\nthe model's proficiency in capturing complex temporal patterns and leveraging\ncontextual information. As a result, MAPLE can potentially improve personalised\nmobile app usage predictions and user experiences markedly.\n","authors":["Yonchanok Khaokaew","Hao Xue","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2309.08648v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06190v3","updated":"2024-11-01T07:53:10Z","published":"2023-06-09T18:42:19Z","title":"$FastDoc$: Domain-Specific Fast Continual Pre-training Technique using\n Document-Level Metadata and Taxonomy","summary":" In this paper, we propose $FastDoc$ (Fast Continual Pre-training Technique\nusing Document Level Metadata and Taxonomy), a novel, compute-efficient\nframework that utilizes Document metadata and Domain-Specific Taxonomy as\nsupervision signals to continually pre-train transformer encoder on a\ndomain-specific corpus. The main innovation is that during domain-specific\npretraining, an open-domain encoder is continually pre-trained using\nsentence-level embeddings as inputs (to accommodate long documents), however,\nfine-tuning is done with token-level embeddings as inputs to this encoder. 
We\nperform such domain-specific pre-training on three different domains namely\ncustomer support, scientific, and legal domains, and compare performance on 6\ndifferent downstream tasks and 9 different datasets. The novel use of\ndocument-level supervision along with sentence-level embedding input for\npre-training reduces pre-training compute by around $1,000$, $4,500$, and $500$\ntimes compared to MLM and/or NSP in Customer Support, Scientific, and Legal\nDomains, respectively. The reduced training time does not lead to a\ndeterioration in performance. In fact we show that $FastDoc$ either outperforms\nor performs on par with several competitive transformer-based baselines in\nterms of character-level F1 scores and other automated metrics in the Customer\nSupport, Scientific, and Legal Domains. Moreover, reduced training aids in\nmitigating the risk of catastrophic forgetting. Thus, unlike baselines,\n$FastDoc$ shows a negligible drop in performance on open domain.\n","authors":["Abhilash Nandy","Manav Nitin Kapadnis","Sohan Patnaik","Yash Parag Butala","Pawan Goyal","Niloy Ganguly"],"pdf_url":"https://arxiv.org/pdf/2306.06190v3.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR), 36\n pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.13752v3","updated":"2024-11-01T07:51:36Z","published":"2024-04-21T19:24:15Z","title":"Adversarial Representation Engineering: A General Model Editing\n Framework for Large Language Models","summary":" Since the rapid development of Large Language Models (LLMs) has achieved\nremarkable success, understanding and rectifying their internal complex\nmechanisms has become an urgent issue. Recent research has attempted to\ninterpret their behaviors through the lens of inner representation. However,\ndeveloping practical and efficient methods for applying these representations\nfor general and flexible model editing remains challenging. In this work, we\nexplore how to leverage insights from representation engineering to guide the\nediting of LLMs by deploying a representation sensor as an editing oracle. We\nfirst identify the importance of a robust and reliable sensor during editing,\nthen propose an Adversarial Representation Engineering (ARE) framework to\nprovide a unified and interpretable approach for conceptual model editing\nwithout compromising baseline performance. Experiments on multiple tasks\ndemonstrate the effectiveness of ARE in various model editing scenarios. Our\ncode and data are available at\nhttps://github.com/Zhang-Yihao/Adversarial-Representation-Engineering.\n","authors":["Yihao Zhang","Zeming Wei","Jun Sun","Meng Sun"],"pdf_url":"https://arxiv.org/pdf/2404.13752v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.07832v4","updated":"2024-11-01T07:41:04Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. 
Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (e.g., CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \textbf{five} image classification datasets. The code is\navailable (https://github.com/batmanlab/Ladder).\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Shyam Visweswaran","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11406v3","updated":"2024-11-01T06:25:15Z","published":"2024-07-16T05:48:24Z","title":"Revisiting the Impact of Pursuing Modularity for Code Generation","summary":" Modular programming, which aims to construct the final program by integrating\nsmaller, independent building blocks, has been regarded as a desirable practice\nin software development. However, with the rise of recent code generation\nagents built upon large language models (LLMs), a question emerges: is this\ntraditional practice equally effective for these new tools? In this work, we\nassess the impact of modularity in code generation by introducing a novel\nmetric for its quantitative measurement. Surprisingly, unlike conventional\nwisdom on the topic, we find that modularity is not a core factor for improving\nthe performance of code generation models. We also explore potential\nexplanations for why LLMs do not exhibit a preference for modular code compared\nto non-modular code.\n","authors":["Deokyeong Kang","Ki Jung Seo","Taeuk Kim"],"pdf_url":"https://arxiv.org/pdf/2407.11406v3.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2410.14155v2","updated":"2024-11-01T06:19:47Z","published":"2024-10-18T03:45:42Z","title":"Towards Faithful Natural Language Explanations: A Study Using Activation\n Patching in Large Language Models","summary":" Large Language Models (LLMs) are capable of generating persuasive Natural\nLanguage Explanations (NLEs) to justify their answers. However, the\nfaithfulness of these explanations should not be readily trusted at face value.\nRecent studies have proposed various methods to measure the faithfulness of\nNLEs, typically by inserting perturbations at the explanation or feature level.\nWe argue that these approaches are neither comprehensive nor correctly designed\naccording to the established definition of faithfulness. Moreover, we highlight\nthe risks of grounding faithfulness findings on out-of-distribution samples. In\nthis work, we leverage a causal mediation technique called activation patching,\nto measure the faithfulness of an explanation towards supporting the explained\nanswer. Our proposed metric, Causal Faithfulness, quantifies the consistency of\ncausal attributions between explanations and the corresponding model outputs as\nthe indicator of faithfulness. 
We experimented across models varying from 2B to\n27B parameters and found that models that underwent alignment tuning tend to\nproduce more faithful and plausible explanations. We find that Causal\nFaithfulness is a promising improvement over existing faithfulness tests by\ntaking into account the model's internal computations and avoiding out of\ndistribution concerns that could otherwise undermine the validity of\nfaithfulness assessments. We release the code in\n\\url{https://github.com/wj210/Causal-Faithfulness}\n","authors":["Wei Jie Yeo","Ranjan Satapathy","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2410.14155v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2405.16247v3","updated":"2024-11-01T06:13:12Z","published":"2024-05-25T14:11:44Z","title":"AutoManual: Generating Instruction Manuals by LLM Agents via Interactive\n Environmental Learning","summary":" Large Language Models (LLM) based agents have shown promise in autonomously\ncompleting tasks across various domains, e.g., robotics, games, and web\nnavigation. However, these agents typically require elaborate design and expert\nprompts to solve tasks in specific domains, which limits their adaptability. We\nintroduce AutoManual, a framework enabling LLM agents to autonomously build\ntheir understanding through interaction and adapt to new environments.\nAutoManual categorizes environmental knowledge into diverse rules and optimizes\nthem in an online fashion by two agents: 1) The Planner codes actionable plans\nbased on current rules for interacting with the environment. 2) The Builder\nupdates the rules through a well-structured rule system that facilitates online\nrule management and essential detail retention. To mitigate hallucinations in\nmanaging rules, we introduce a *case-conditioned prompting* strategy for the\nBuilder. Finally, the Formulator agent compiles these rules into a\ncomprehensive manual. The self-generated manual can not only improve the\nadaptability but also guide the planning of smaller LLMs while being\nhuman-readable. Given only one simple demonstration, AutoManual significantly\nimproves task success rates, achieving 97.4\\% with GPT-4-turbo and 86.2\\% with\nGPT-3.5-turbo on ALFWorld benchmark tasks. The code is available at\nhttps://github.com/minghchen/automanual.\n","authors":["Minghao Chen","Yihang Li","Yanting Yang","Shiyu Yu","Binbin Lin","Xiaofei He"],"pdf_url":"https://arxiv.org/pdf/2405.16247v3.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.01548v2","updated":"2024-11-01T06:12:33Z","published":"2024-10-02T13:37:54Z","title":"In-Context Transfer Learning: Demonstration Synthesis by Transferring\n Similar Tasks","summary":" In-context learning (ICL) is an effective approach to help large language\nmodels (LLMs) adapt to various tasks by providing demonstrations of the target\ntask. Considering the high cost of labeling demonstrations, many methods\npropose synthesizing demonstrations from scratch using LLMs. However, the\nquality of the demonstrations synthesized from scratch is limited by the\ncapabilities and knowledge of LLMs. To address this, inspired by transfer\nlearning, we propose In-Context Transfer Learning (ICTL), which synthesizes\ntarget task demonstrations by transferring labeled demonstrations from similar\nsource tasks. ICTL consists of two steps: source sampling and target transfer.\nFirst, we define an optimization objective, which minimizes transfer error to\nsample source demonstrations similar to the target task. 
Then, we employ LLMs\nto transfer the sampled source demonstrations to the target task, matching the\ndefinition and format of the target task. Experiments on Super-NI show that\nICTL outperforms synthesis from scratch by 2.0% on average, demonstrating the\neffectiveness of our method.\n","authors":["Dingzirui Wang","Xuanliang Zhang","Qiguang Chen","Longxu Dou","Xiao Xu","Rongyu Cao","Yingwei Ma","Qingfu Zhu","Wanxiang Che","Binhua Li","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2410.01548v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13698v2","updated":"2024-11-01T06:08:08Z","published":"2024-09-05T02:24:18Z","title":"Lightweight Transducer Based on Frame-Level Criterion","summary":" The transducer model trained with a sequence-level criterion requires a lot\nof memory due to the generation of the large probability matrix. We propose a\nlightweight transducer model based on a frame-level criterion, which uses the\nresults of the CTC forced alignment algorithm to determine the label for each\nframe. Then the encoder output can be combined with the decoder output at the\ncorresponding time, rather than adding each element output by the encoder to\neach element output by the decoder as in the transducer. This significantly\nreduces memory and computation requirements. To address the problem of\nimbalanced classification caused by excessive blanks in the label, we decouple\nthe blank and non-blank probabilities and truncate the gradient of the blank\nclassifier to the main network. Experiments on AISHELL-1 demonstrate that\nthis enables the lightweight transducer to achieve similar results to\nthe standard transducer. Additionally, we use richer information to predict the probability\nof blank, achieving superior results to the standard transducer.\n","authors":["Genshun Wan","Mengzhi Wang","Tingzhi Mao","Hang Chen","Zhongfu Ye"],"pdf_url":"https://arxiv.org/pdf/2409.13698v2.pdf","comment":"Accepted by Interspeech 2024, code repository:\n https://github.com/wangmengzhi/Lightweight-Transducer"},{"id":"http://arxiv.org/abs/2406.04744v2","updated":"2024-11-01T05:30:17Z","published":"2024-06-07T08:43:07Z","title":"CRAG -- Comprehensive RAG Benchmark","summary":" Retrieval-Augmented Generation (RAG) has recently emerged as a promising\nsolution to alleviate the knowledge deficiencies of Large Language Models\n(LLMs). Existing RAG datasets, however, do not adequately represent the\ndiverse and dynamic nature of real-world Question Answering (QA) tasks. To\nbridge this gap, we introduce the Comprehensive RAG Benchmark (CRAG), a factual\nquestion answering benchmark of 4,409 question-answer pairs and mock APIs to\nsimulate web and Knowledge Graph (KG) search. CRAG is designed to encapsulate a\ndiverse array of questions across five domains and eight question categories,\nreflecting varied entity popularity from popular to long-tail, and temporal\ndynamism ranging from years to seconds. Our evaluation of this benchmark\nhighlights the gap to fully trustworthy QA. Whereas most advanced LLMs achieve\n<=34% accuracy on CRAG, adding RAG in a straightforward manner improves the\naccuracy only to 44%. State-of-the-art industry RAG solutions answer only 63%\nof questions without any hallucination. CRAG also reveals much lower accuracy\nin answering questions regarding facts with higher dynamism, lower popularity,\nor higher complexity, suggesting future research directions. The CRAG benchmark\nlaid the groundwork for a KDD Cup 2024 challenge and attracted thousands of\nparticipants and submissions.
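For intuition on why the frame-level criterion in the lightweight-transducer abstract above saves memory: the sequence-level transducer loss materializes a (batch, frames, label length, vocab) probability tensor, while a frame-level criterion needs only (batch, frames, vocab). A back-of-envelope comparison, with shapes that are assumptions for illustration rather than values from the paper:

```python
# fp32, batch of 8; shapes are illustrative, not from the paper.
T, U, V, B = 500, 50, 6000, 8      # frames, label length, vocab size, batch size
BYTES = 4                          # bytes per fp32 value
joint = B * T * U * V * BYTES      # sequence-level transducer joint: (B, T, U, V)
frame = B * T * V * BYTES          # frame-level criterion: (B, T, V)
print(f"joint lattice: {joint / 2**30:.2f} GiB")   # ~4.47 GiB
print(f"frame-level:   {frame / 2**30:.3f} GiB")   # ~0.089 GiB, a factor-U reduction
```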
We commit to maintaining CRAG to serve research\ncommunities in advancing RAG solutions and general QA solutions. CRAG is\navailable at https://github.com/facebookresearch/CRAG/.\n","authors":["Xiao Yang","Kai Sun","Hao Xin","Yushi Sun","Nikita Bhalla","Xiangsen Chen","Sajal Choudhary","Rongze Daniel Gui","Ziran Will Jiang","Ziyu Jiang","Lingkun Kong","Brian Moran","Jiaqi Wang","Yifan Ethan Xu","An Yan","Chenyu Yang","Eting Yuan","Hanwen Zha","Nan Tang","Lei Chen","Nicolas Scheffer","Yue Liu","Nirav Shah","Rakesh Wanga","Anuj Kumar","Wen-tau Yih","Xin Luna Dong"],"pdf_url":"https://arxiv.org/pdf/2406.04744v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2409.11724v2","updated":"2024-11-01T04:19:21Z","published":"2024-09-18T06:19:59Z","title":"TART: An Open-Source Tool-Augmented Framework for Explainable\n Table-based Reasoning","summary":" Current Large Language Models (LLMs) exhibit limited ability to understand\ntable structures and to apply precise numerical reasoning, which is crucial for\ntasks such as table question answering (TQA) and table-based fact verification\n(TFV). To address these challenges, we introduce our Tool-Augmented Reasoning\nframework for Tables (TART), which integrates LLMs with specialized tools. TART\ncontains three key components: a table formatter to ensure accurate data\nrepresentation, a tool maker to develop specific computational tools, and an\nexplanation generator to maintain explainability. We also present the TOOLTAB\ndataset, a new benchmark designed specifically for training LLMs in table-tool\nintegration. Our experiments indicate that TART achieves substantial\nimprovements over existing methods (e.g., Chain-of-Thought) by improving both\nthe precision of data processing and the clarity of the reasoning process.\nNotably, TART paired with CodeLlama achieves 90.0% of the accuracy of the\nclosed-sourced LLM GPT-3.5-turbo, highlighting its robustness in diverse\nreal-world scenarios. All the code and data are available at\nhttps://github.com/XinyuanLu00/TART.\n","authors":["Xinyuan Lu","Liangming Pan","Yubo Ma","Preslav Nakov","Min-Yen Kan"],"pdf_url":"https://arxiv.org/pdf/2409.11724v2.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2402.01763v3","updated":"2024-11-01T03:49:59Z","published":"2024-01-30T23:35:28Z","title":"When Large Language Models Meet Vector Databases: A Survey","summary":" This survey explores the synergistic potential of Large Language Models\n(LLMs) and Vector Databases (VecDBs), a burgeoning but rapidly evolving\nresearch area. With the proliferation of LLMs comes a host of challenges,\nincluding hallucinations, outdated knowledge, prohibitive commercial\napplication costs, and memory issues. VecDBs emerge as a compelling solution to\nthese issues by offering an efficient means to store, retrieve, and manage the\nhigh-dimensional vector representations intrinsic to LLM operations. Through\nthis nuanced review, we delineate the foundational principles of LLMs and\nVecDBs and critically analyze their integration's impact on enhancing LLM\nfunctionalities. 
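The store/retrieve/manage loop that the survey above credits to VecDBs reduces, at its smallest, to nearest-neighbor search over embeddings. A minimal in-memory sketch; the class and the random stand-in embeddings are illustrative, not any surveyed system:

```python
# Minimal in-memory vector store: add embeddings, retrieve nearest documents
# as context for an LLM prompt.
import numpy as np

class TinyVecDB:
    def __init__(self, dim):
        self.vecs = np.empty((0, dim))
        self.docs = []

    def add(self, vec, doc):
        self.vecs = np.vstack([self.vecs, vec])
        self.docs.append(doc)

    def search(self, query, k=2):
        # Cosine similarity against every stored vector (real VecDBs use ANN indexes).
        sims = self.vecs @ query / (
            np.linalg.norm(self.vecs, axis=1) * np.linalg.norm(query)
        )
        return [self.docs[i] for i in np.argsort(-sims)[:k]]

rng = np.random.default_rng(0)
db = TinyVecDB(dim=4)
for doc in ["fact A", "fact B", "fact C"]:
    db.add(rng.normal(size=4), doc)          # stand-in embeddings
context = db.search(rng.normal(size=4))      # retrieved context for the LLM prompt
print(context)
```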
This discourse extends to a discussion of speculative\nfuture developments in this domain, aiming to catalyze further research into\noptimizing the confluence of LLMs and VecDBs for advanced data handling and\nknowledge extraction capabilities.\n","authors":["Zhi Jing","Yongye Su","Yikun Han"],"pdf_url":"https://arxiv.org/pdf/2402.01763v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02428v3","updated":"2024-11-01T03:47:51Z","published":"2024-09-04T04:15:14Z","title":"Large Language Models as Efficient Reward Function Searchers for\n Custom-Environment Multi-Objective Reinforcement Learning","summary":" Achieving the effective design and improvement of reward functions in\nreinforcement learning (RL) tasks with complex custom environments and multiple\nrequirements presents considerable challenges. In this paper, we propose ERFSL,\nan efficient reward function searcher using LLMs, which enables LLMs to be\neffective white-box searchers and highlights their advanced semantic\nunderstanding capabilities. Specifically, we generate reward components for\neach numerically explicit user requirement and employ a reward critic to\nidentify the correct code form. Then, LLMs assign weights to the reward\ncomponents to balance their values and iteratively adjust the weights without\nambiguity and redundant adjustments by flexibly adopting directional mutation\nand crossover strategies, similar to genetic algorithms, based on the context\nprovided by the training log analyzer. We applied the framework to an\nunderwater data collection RL task without direct human feedback or reward\nexamples (zero-shot learning). The reward critic successfully corrects the\nreward code with only one feedback instance for each requirement, effectively\npreventing unrectifiable errors. The initialization of weights enables the\nacquisition of different reward functions within the Pareto solution set\nwithout the need for weight search. Even in cases where a weight is 500 times\noff, on average, only 5.2 iterations are needed to meet user requirements. ERFSL\nalso works well with most prompts when using GPT-4o mini, as we decompose\nthe weight-searching process to reduce the requirement for numerical and\nlong-context understanding capabilities.\n","authors":["Guanwen Xie","Jingzehua Xu","Yiyuan Yang","Yimian Ding","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02428v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.04501v3","updated":"2024-11-01T03:42:37Z","published":"2024-10-06T14:45:01Z","title":"Leveraging Large Language Models for Suicide Detection on Social Media\n with Limited Labels","summary":" The increasing frequency of suicidal thoughts highlights the importance of\nearly detection and intervention. Social media platforms, where users often\nshare personal experiences and seek help, could be utilized to identify\nindividuals at risk. However, the large volume of daily posts makes manual\nreview impractical. This paper explores the use of Large Language Models (LLMs)\nto automatically detect suicidal content in text-based social media posts. We\npropose a novel method for generating pseudo-labels for unlabeled data by\nprompting LLMs, along with traditional classification fine-tuning techniques to\nenhance label accuracy. To create a strong suicide detection model, we develop\nan ensemble approach involving prompting with Qwen2-72B-Instruct and using\nfine-tuned models such as Llama3-8B, Llama3.1-8B, and Gemma2-9B.
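As a concrete illustration of how such an ensemble might combine per-model decisions (the exact combination rule is an assumption here; the abstract does not specify it), a simple majority vote over binary predictions:

```python
# Majority vote over per-model binary predictions for a single post.
from collections import Counter

def majority_vote(labels):
    """labels: one 0/1 prediction per model for a single post."""
    return Counter(labels).most_common(1)[0][0]

per_model = {"qwen2-72b": 1, "llama3-8b": 1, "llama3.1-8b": 0, "gemma2-9b": 1}
print(majority_vote(list(per_model.values())))  # -> 1 (post flagged for review)
```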
We evaluate\nour approach on the dataset of the Suicide Ideation Detection on Social Media\nChallenge, a track of the IEEE Big Data 2024 Big Data Cup. Additionally, we\nconduct a comprehensive analysis to assess the impact of different models and\nfine-tuning strategies on detection performance. Experimental results show that\nthe ensemble model significantly improves the detection accuracy by 5 percentage points\ncompared with the individual models. It achieves a weighted F1 score of 0.770 on\nthe public test set and 0.731 on the private test set, providing a promising\nsolution for identifying suicidal content in social media. Our analysis shows\nthat the choice of LLMs affects the prompting performance, with larger models\nproviding better accuracy. Our code and checkpoints are publicly available at\nhttps://github.com/khanhvynguyen/Suicide_Detection_LLMs.\n","authors":["Vy Nguyen","Chau Pham"],"pdf_url":"https://arxiv.org/pdf/2410.04501v3.pdf","comment":"Accepted at IEEE International Conference on Big Data 2024"},{"id":"http://arxiv.org/abs/2410.19878v2","updated":"2024-11-01T03:26:07Z","published":"2024-10-24T13:58:59Z","title":"Parameter-Efficient Fine-Tuning in Large Models: A Survey of\n Methodologies","summary":" Large models, as predicted by scaling-law forecasts, have made\ngroundbreaking progress in many fields, particularly in natural language\ngeneration tasks, where they have approached or even surpassed human levels.\nHowever, the unprecedented scale of their parameters brings significant\ncomputational and storage costs. These large models require substantial\ncomputational resources and GPU memory to operate. When adapting large models\nto specific downstream tasks, their massive parameter scale poses a significant\nchallenge in fine-tuning on hardware platforms with limited computational power\nand GPU memory. To address this issue, Parameter-Efficient Fine-Tuning (PEFT)\noffers a practical solution by efficiently adjusting the parameters of large\npre-trained models to suit various downstream tasks. Specifically, PEFT adjusts\nthe parameters of pre-trained large models to adapt to specific tasks or\ndomains, minimizing the introduction of additional parameters and the\ncomputational resources required. This review mainly introduces the preliminary\nknowledge of PEFT, the core ideas and principles of various PEFT algorithms,\nthe applications of PEFT, and potential future research directions. By reading\nthis review, we believe that interested parties can quickly grasp the PEFT\nmethodology, thereby accelerating its development and innovation.\n","authors":["Luping Wang","Sheng Chen","Linnan Jiang","Shu Pan","Runze Cai","Sen Yang","Fei Yang"],"pdf_url":"https://arxiv.org/pdf/2410.19878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13056v2","updated":"2024-11-01T03:16:30Z","published":"2024-10-16T21:34:41Z","title":"Channel-Wise Mixed-Precision Quantization for Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable success across a\nwide range of language tasks, but their deployment on edge devices remains\nchallenging due to the substantial memory requirements imposed by their large\nparameter sizes. Weight-only quantization presents a promising solution to\nreduce the memory footprint of LLMs. However, existing approaches primarily\nfocus on integer-bit quantization, limiting their adaptability to\nfractional-bit quantization tasks and preventing the full utilization of\navailable storage space on devices.
In this paper, we introduce Channel-Wise\nMixed-Precision Quantization (CMPQ), a novel mixed-precision quantization\nmethod that allocates quantization precision in a channel-wise pattern based on\nactivation distributions. By assigning different precision levels to different\nweight channels, CMPQ can adapt to any bit-width constraint. CMPQ employs a\nnon-uniform quantization strategy and incorporates two outlier extraction\ntechniques that collaboratively preserve the critical information, thereby\nminimizing the quantization loss. Experiments on different sizes of LLMs\ndemonstrate that CMPQ not only enhances performance in integer-bit quantization\ntasks but also achieves significant performance gains with a modest increase in\nmemory usage. CMPQ thus represents an adaptive and effective approach to LLM\nquantization, offering substantial benefits across diverse device capabilities.\n","authors":["Zihan Chen","Bike Xie","Jundong Li","Cong Shen"],"pdf_url":"https://arxiv.org/pdf/2410.13056v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20646v2","updated":"2024-11-01T03:12:44Z","published":"2024-05-31T07:24:42Z","title":"LLM-ESR: Large Language Models Enhancement for Long-tailed Sequential\n Recommendation","summary":" Sequential recommender systems (SRS) aim to predict users' subsequent choices\nbased on their historical interactions and have found applications in diverse\nfields such as e-commerce and social media. However, in real-world systems,\nmost users interact with only a handful of items, while the majority of items\nare seldom consumed. These two issues, known as the long-tail user and\nlong-tail item challenges, often pose difficulties for existing SRS. These\nchallenges can adversely affect user experience and seller benefits, making\nthem crucial to address. Though a few works have addressed the challenges, they\nstill struggle with the seesaw or noisy issues due to the intrinsic scarcity of\ninteractions. The advancements in large language models (LLMs) present a\npromising solution to these problems from a semantic perspective. As one of the\npioneers in this field, we propose the Large Language Models Enhancement\nframework for Sequential Recommendation (LLM-ESR). This framework utilizes\nsemantic embeddings derived from LLMs to enhance SRS without adding extra\ninference load from LLMs. To address the long-tail item challenge, we design a\ndual-view modeling framework that combines semantics from LLMs and\ncollaborative signals from conventional SRS. For the long-tail user challenge,\nwe propose a retrieval augmented self-distillation method to enhance user\npreference representation using more informative interactions from similar\nusers. To verify the effectiveness and versatility of our proposed enhancement\nframework, we conduct extensive experiments on three real-world datasets using\nthree popular SRS models. The results show that our method surpasses existing\nbaselines consistently, and benefits long-tail users and items especially. 
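The dual-view modeling idea in the LLM-ESR abstract above lends itself to a compact sketch: a frozen LLM-derived item embedding concatenated with a trainable collaborative embedding, so no extra LLM inference is needed at serving time. Dimensions, names, and the concatenation fusion are illustrative assumptions, not LLM-ESR's actual implementation:

```python
# Dual-view item representation: frozen semantic view + trainable collaborative view.
import torch
import torch.nn as nn

n_items, d_sem, d_cf = 1000, 32, 16
llm_emb = nn.Embedding(n_items, d_sem)   # precomputed from LLM text encodings
llm_emb.weight.requires_grad_(False)     # frozen: no LLM inference load at serving time
cf_emb = nn.Embedding(n_items, d_cf)     # learned from interaction data

def item_repr(item_ids):
    # Concatenate the two views; any sequential recommender can consume this.
    return torch.cat([llm_emb(item_ids), cf_emb(item_ids)], dim=-1)

seq = torch.tensor([3, 17, 256])         # a user's interaction history
print(item_repr(seq).shape)              # torch.Size([3, 48])
```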
The\nimplementation code is available at\nhttps://github.com/Applied-Machine-Learning-Lab/LLM-ESR.\n","authors":["Qidong Liu","Xian Wu","Yejing Wang","Zijian Zhang","Feng Tian","Yefeng Zheng","Xiangyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.20646v2.pdf","comment":"accepted by NeurIPS'24 (Spotlight)"},{"id":"http://arxiv.org/abs/2302.04391v8","updated":"2024-11-01T02:49:24Z","published":"2023-02-09T01:09:57Z","title":"The Re-Label Method For Data-Centric Machine Learning","summary":" In industrial deep learning applications, our manually labeled data contains a\ncertain amount of noisy data. To solve this problem and achieve a score of more\nthan 90 on the dev dataset, we present a simple method to find the noisy data and\nhave humans re-label it, given the model predictions as references during\nhuman labeling. In this paper, we illustrate our idea for a broad set of deep\nlearning tasks, including classification, sequence tagging, object detection,\nsequence generation, and click-through rate prediction. The dev dataset evaluation\nresults and human evaluation results verify our idea.\n","authors":["Tong Guo"],"pdf_url":"https://arxiv.org/pdf/2302.04391v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09131v4","updated":"2024-11-01T02:43:34Z","published":"2024-03-14T06:49:16Z","title":"ProSwitch: Knowledge-Guided Instruction Tuning to Switch Between\n Professional and Non-Professional Answers","summary":" Large Language Models (LLMs) have demonstrated efficacy in various linguistic\napplications, including text summarization and controlled text generation.\nHowever, their capacity to switch between styles via\ninstruction tuning remains underexplored. This study concentrates on the\nstyle-switching abilities of LLMs and introduces a novel approach, named\nProSwitch, which enables a language model to switch between professional and\nnon-professional answers by tuning and evaluating with the guidance of\ndomain and style knowledge. ProSwitch unfolds across three phases:\nLLM-augmented preparation to collect domain knowledge and QA pairs, instruction\ntuning to optimize LLMs with multiple levels of knowledge, and comprehensive\nevaluation to assess both style discrimination and reference-based quality of\ngenerated text. Comparative analysis of ProSwitch against general and\nspecialized LLMs reveals that our approach outperforms baselines in switching\nbetween professional and non-professional answers.\n","authors":["Chang Zong","Yuyan Chen","Weiming Lu","Jian Shao","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2403.09131v4.pdf","comment":"8 pages main body, 16 pages total"},{"id":"http://arxiv.org/abs/2407.13623v3","updated":"2024-11-01T02:41:36Z","published":"2024-07-18T15:58:54Z","title":"Scaling Laws with Vocabulary: Larger Models Deserve Larger Vocabularies","summary":" Research on scaling large language models (LLMs) has primarily focused on\nmodel parameters and training data size, overlooking the role of vocabulary\nsize. We investigate how vocabulary size impacts LLM scaling laws by training\nmodels ranging from 33M to 3B parameters on up to 500B characters with various\nvocabulary configurations. We propose three complementary approaches for\npredicting the compute-optimal vocabulary size: IsoFLOPs analysis, derivative\nestimation, and parametric fit of the loss function. Our approaches converge on\nthe conclusion that the optimal vocabulary size depends on the compute budget,\nwith larger models requiring larger vocabularies.
Most LLMs, however, use\ninsufficient vocabulary sizes. For example, we predict that the optimal\nvocabulary size of Llama2-70B should have been at least 216K, 7 times larger\nthan its vocabulary of 32K. We validate our predictions empirically by training\nmodels with 3B parameters across different FLOPs budgets. Adopting our\npredicted optimal vocabulary size consistently improves downstream performance\nover commonly used vocabulary sizes. By increasing the vocabulary size from the\nconventional 32K to 43K, we improve performance on ARC-Challenge from 29.1 to\n32.0 with the same 2.3e21 FLOPs. Our work highlights the importance of jointly\nconsidering tokenization and model scaling for efficient pre-training. The code\nand demo are available at https://github.com/sail-sg/scaling-with-vocab and\nhttps://hf.co/spaces/sail/scaling-with-vocab-demo.\n","authors":["Chaofan Tao","Qian Liu","Longxu Dou","Niklas Muennighoff","Zhongwei Wan","Ping Luo","Min Lin","Ngai Wong"],"pdf_url":"https://arxiv.org/pdf/2407.13623v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.14909v2","updated":"2024-11-01T02:26:18Z","published":"2024-06-21T06:58:37Z","title":"MoA: Mixture of Sparse Attention for Automatic Large Language Model\n Compression","summary":" Sparse attention can effectively mitigate the significant memory and\nthroughput demands of Large Language Models (LLMs) in long contexts. Existing\nmethods typically employ a uniform sparse attention mask, applying the same\nsparse pattern across different attention heads and input lengths. However,\nthis uniform approach fails to capture the diverse attention patterns inherent\nin LLMs, ignoring their distinct accuracy-latency trade-offs. To address this\nchallenge, we propose the Mixture of Attention (MoA), which automatically\ntailors distinct sparse attention configurations to different heads and layers.\nMoA constructs and navigates a search space of various attention patterns and\ntheir scaling rules relative to input sequence lengths. It profiles the model,\nevaluates potential configurations, and pinpoints the optimal sparse attention\ncompression plan. MoA adapts to varying input sizes, revealing that some\nattention heads expand their focus to accommodate longer sequences, while other\nheads consistently concentrate on fixed-length local contexts. Experiments show\nthat MoA increases the effective context length by $3.9\\times$ with the same\naverage attention span, boosting retrieval accuracy by $1.5-7.1\\times$ over the\nuniform-attention baseline across Vicuna-{7B,13B}, and Llama3-{8B,70B} models.\nMoreover, MoA narrows the capability gaps between sparse and dense models,\nreducing the maximum relative performance drop from $9\\%-36\\%$ to within $5\\%$\nacross two long-context understanding benchmarks. MoA achieves a\n$1.2-1.4\\times$ GPU memory reduction, boosting decode throughput by\n$6.6-8.2\\times$ and $1.7-1.9\\times$ compared to FlashAttention2 and vLLM, with\nminimal impact on performance. 
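A minimal illustration of the heterogeneity MoA exploits: giving each attention head its own causal window. The concrete windows below are invented for demonstration; MoA searches such per-head, per-layer configurations and their scaling rules rather than fixing them by hand:

```python
# Heterogeneous sparse attention masks: one causal window per head.
import torch

def causal_window_mask(seq_len, window):
    idx = torch.arange(seq_len)
    diff = idx[:, None] - idx[None, :]
    # Attend to self and up to window-1 previous tokens.
    return (diff >= 0) & (diff < window)

seq_len, windows = 8, [2, 4, 8]        # one window per head; 8 == dense causal here
masks = torch.stack([causal_window_mask(seq_len, w) for w in windows])
print(masks.float().mean(dim=(1, 2)))  # per-head attention density
```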
Our code is available at\n\url{https://github.com/thu-nics/MoA}.\n","authors":["Tianyu Fu","Haofeng Huang","Xuefei Ning","Genghan Zhang","Boju Chen","Tianqi Wu","Hongyi Wang","Zixiao Huang","Shiyao Li","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2406.14909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02834v3","updated":"2024-11-01T02:21:13Z","published":"2024-09-04T16:00:21Z","title":"CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the\n Mathematics Reasoning of Large Multimodal Models","summary":" Large language models (LLMs) have obtained promising results in mathematical\nreasoning, which is a foundational skill for human intelligence. Most previous\nstudies focus on improving and measuring the performance of LLMs based on\ntextual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few\nresearchers have released English multimodal math datasets (e.g., MATHVISTA and\nMATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In\nthis paper, we release a Chinese multimodal math (CMM-Math) dataset, including\nbenchmark and training parts, to evaluate and enhance the mathematical\nreasoning of LMMs. CMM-Math contains over 28,000 high-quality samples,\nfeaturing a variety of problem types (e.g., multiple-choice, fill-in-the-blank,\nand so on) with detailed solutions across 12 grade levels from elementary to\nhigh school in China. Specifically, the visual context may be present in the\nquestions or options, which makes this dataset more challenging. Through\ncomprehensive analysis, we discover that state-of-the-art LMMs on the CMM-Math\ndataset face challenges, emphasizing the necessity for further improvements in\nLMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to\nhandle the problems with mixed input of multiple images and text segments. We\ntrain our model in three stages, including foundational pre-training,\nfoundational fine-tuning, and mathematical fine-tuning. Extensive\nexperiments comparing our model with SOTA LMMs on three multimodal\nmathematical datasets indicate that it effectively improves math reasoning\nperformance.\n","authors":["Wentao Liu","Qianjun Pan","Yi Zhang","Zhuo Liu","Ji Wu","Jie Zhou","Aimin Zhou","Qin Chen","Bo Jiang","Liang He"],"pdf_url":"https://arxiv.org/pdf/2409.02834v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17213v4","updated":"2024-11-01T02:08:03Z","published":"2024-09-25T17:38:39Z","title":"Plurals: A System for Guiding LLMs Via Simulated Social Ensembles","summary":" Recent debates raised concerns that language models may favor certain\nviewpoints. But what if the solution is not to aim for a 'view from nowhere'\nbut rather to leverage different viewpoints? We introduce Plurals, a system and\nPython library for pluralistic AI deliberation. Plurals consists of Agents\n(LLMs, optionally with personas) which deliberate within customizable\nStructures, with Moderators overseeing deliberation. Plurals is a generator of\nsimulated social ensembles. Plurals integrates with government datasets to\ncreate nationally representative personas, includes deliberation templates\ninspired by democratic deliberation theory, and allows users to customize both\ninformation-sharing structures and deliberation behavior within Structures. Six\ncase studies demonstrate fidelity to theoretical constructs and efficacy.
Three\nrandomized experiments show simulated focus groups produced output resonant\nwith an online sample of the relevant audiences (chosen over zero-shot\ngeneration in 75% of trials). Plurals is both a paradigm and a concrete system\nfor pluralistic AI. The Plurals library is available at\nhttps://github.com/josh-ashkinaze/plurals and will be continually updated.\n","authors":["Joshua Ashkinaze","Emily Fry","Narendra Edara","Eric Gilbert","Ceren Budak"],"pdf_url":"https://arxiv.org/pdf/2409.17213v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24126v2","updated":"2024-11-01T01:49:56Z","published":"2024-10-31T16:50:39Z","title":"Multi-environment Topic Models","summary":" Probabilistic topic models are a powerful tool for extracting latent themes\nfrom large text datasets. In many text datasets, we also observe per-document\ncovariates (e.g., source, style, political affiliation) that act as\nenvironments that modulate a \"global\" (environment-agnostic) topic\nrepresentation. Accurately learning these representations is important for\nprediction on new documents in unseen environments and for estimating the\ncausal effect of topics on real-world outcomes. To this end, we introduce the\nMulti-environment Topic Model (MTM), an unsupervised probabilistic model that\nseparates global and environment-specific terms. Through experimentation on\nvarious political content, from ads to tweets and speeches, we show that the\nMTM produces interpretable global topics with distinct environment-specific\nwords. On multi-environment data, the MTM outperforms strong baselines in and\nout-of-distribution. It also enables the discovery of accurate causal effects.\n","authors":["Dominic Sobhani","Amir Feder","David Blei"],"pdf_url":"https://arxiv.org/pdf/2410.24126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16342v2","updated":"2024-11-01T01:16:28Z","published":"2024-06-24T06:27:47Z","title":"Is your benchmark truly adversarial? AdvScore: Evaluating Human-Grounded\n Adversarialness","summary":" Adversarial datasets should ensure AI robustness that matches human\nperformance. However, as models evolve, datasets can become obsolete. Thus,\nadversarial datasets should be periodically updated based on their degradation\nin adversarialness. Given the lack of a standardized metric for measuring\nadversarialness, we propose AdvScore, a human-grounded evaluation metric.\nAdvScore assesses a dataset's true adversarialness by capturing models' and\nhumans' varying abilities, while also identifying poor examples. AdvScore then\nmotivates a new dataset creation pipeline for realistic and high-quality\nadversarial samples, enabling us to collect an adversarial question answering\n(QA) dataset, AdvQA. We apply AdvScore using 9,347 human responses and ten\nlanguage model predictions to track the models' improvement over five years\n(from 2020 to 2024). 
AdvScore assesses whether adversarial datasets remain\nsuitable for model evaluation, measures model improvements, and provides\nguidance for better alignment with human capabilities.\n","authors":["Yoo Yeon Sung","Maharshi Gor","Eve Fleisig","Ishani Mondal","Jordan Lee Boyd-Graber"],"pdf_url":"https://arxiv.org/pdf/2406.16342v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2401.11185"},{"id":"http://arxiv.org/abs/2408.02128v2","updated":"2024-11-01T00:34:19Z","published":"2024-08-04T19:54:12Z","title":"Table Transformers for Imputing Textual Attributes","summary":" Missing data in tabular datasets is a common issue, as the performance of\ndownstream tasks usually depends on the completeness of the training dataset.\nPrevious missing data imputation methods focus on numeric and categorical\ncolumns, but we propose a novel end-to-end approach called Table Transformers\nfor Imputing Textual Attributes (TTITA) based on the transformer to impute\nunstructured textual columns using other columns in the table. We conduct\nextensive experiments on three datasets, and our approach shows competitive\nperformance, outperforming baseline models such as recurrent neural networks and\nLlama2. The performance improvement is more significant when the target\nsequence has a longer length. Additionally, we incorporate multi-task learning\nto simultaneously impute heterogeneous columns, boosting the performance\nof text imputation. We also qualitatively compare with ChatGPT for realistic\napplications.\n","authors":["Ting-Ruen Wei","Yuan Wang","Yoshitaka Inoue","Hsin-Tai Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2408.02128v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.16016v2","updated":"2024-11-01T17:44:34Z","published":"2024-09-24T12:19:31Z","title":"VascX Models: Model Ensembles for Retinal Vascular Analysis from Color\n Fundus Images","summary":" We introduce VascX models, a comprehensive set of model ensembles for\nanalyzing retinal vasculature from color fundus images (CFIs). Annotated CFIs\nwere aggregated from public datasets. Additional CFIs, mainly from the\npopulation-based Rotterdam Study, were annotated by graders for arteries and\nveins at pixel level, resulting in a dataset diverse in patient demographics\nand imaging conditions. VascX models demonstrated superior segmentation\nperformance across datasets, image quality levels, and anatomic regions when\ncompared to existing, publicly available models, likely due to the increased\nsize and variety of our training set. Important improvements were observed in\nartery-vein and disc segmentation performance, particularly in segmentations of\nthese structures on CFIs of intermediate quality, common in large cohorts and\nclinical datasets. Importantly, these improvements translated into\nsignificantly more accurate vascular features when we compared features\nextracted from VascX segmentation masks with features extracted from\nsegmentation masks generated by previous models. With VascX models we provide a\nrobust, ready-to-use set of model ensembles and inference code aimed at\nsimplifying the implementation and enhancing the quality of automated retinal\nvasculature analyses.
The precise vessel parameters generated by the model can\nserve as starting points for the identification of disease patterns in and\noutside of the eye.\n","authors":["Jose Vargas Quiros","Bart Liefers","Karin van Garderen","Jeroen Vermeulen","Eyened Reading Center","Sinergia Consortium","Caroline Klaver"],"pdf_url":"https://arxiv.org/pdf/2409.16016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24211v2","updated":"2024-11-01T17:23:01Z","published":"2024-10-31T17:59:01Z","title":"DELTA: Dense Efficient Long-range 3D Tracking for any video","summary":" Tracking dense 3D motion from monocular videos remains challenging,\nparticularly when aiming for pixel-level precision over long sequences. We\nintroduce DELTA, a novel method that efficiently tracks every pixel in 3D\nspace, enabling accurate motion estimation across entire videos. Our approach\nleverages a joint global-local attention mechanism for reduced-resolution\ntracking, followed by a transformer-based upsampler to achieve high-resolution\npredictions. Unlike existing methods, which are limited by computational\ninefficiency or sparse tracking, DELTA delivers dense 3D tracking at scale,\nrunning over 8x faster than previous methods while achieving state-of-the-art\naccuracy. Furthermore, we explore the impact of depth representation on\ntracking performance and identify log-depth as the optimal choice. Extensive\nexperiments demonstrate the superiority of DELTA on multiple benchmarks,\nachieving new state-of-the-art results in both 2D and 3D dense tracking tasks.\nOur method provides a robust solution for applications requiring fine-grained,\nlong-term motion tracking in 3D space.\n","authors":["Tuan Duc Ngo","Peiye Zhuang","Chuang Gan","Evangelos Kalogerakis","Sergey Tulyakov","Hsin-Ying Lee","Chaoyang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.24211v2.pdf","comment":"Project Page: https://snap-research.github.io/DELTA/"},{"id":"http://arxiv.org/abs/2402.01335v3","updated":"2024-11-01T16:51:01Z","published":"2024-02-02T11:40:27Z","title":"BehAVE: Behaviour Alignment of Video Game Encodings","summary":" Domain randomisation enhances the transferability of vision models across\nvisually distinct domains with similar content. However, current methods\nheavily depend on intricate simulation engines, hampering feasibility and\nscalability. This paper introduces BehAVE, a video understanding framework that\nutilises existing commercial video games for domain randomisation without\naccessing their simulation engines. BehAVE taps into the visual diversity of\nvideo games for randomisation and uses textual descriptions of player actions\nto align videos with similar content. We evaluate BehAVE across 25 first-person\nshooter (FPS) games using various video and text foundation models,\ndemonstrating its robustness in domain randomisation. BehAVE effectively aligns\nplayer behavioural patterns and achieves zero-shot transfer to multiple unseen\nFPS games when trained on just one game. In a more challenging scenario, BehAVE\nenhances the zero-shot transferability of foundation models to unseen FPS\ngames, even when trained on a game of a different genre, with improvements of\nup to 22%. BehAVE is available online at https://github.com/nrasajski/BehAVE.\n","authors":["Nemanja Rašajski","Chintan Trivedi","Konstantinos Makantasis","Antonios Liapis","Georgios N. 
Yannakakis"],"pdf_url":"https://arxiv.org/pdf/2402.01335v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07410v2","updated":"2024-11-01T16:34:04Z","published":"2024-10-09T20:21:43Z","title":"Aligning Motion-Blurred Images Using Contrastive Learning on\n Overcomplete Pixels","summary":" We propose a new contrastive objective for learning overcomplete pixel-level\nfeatures that are invariant to motion blur. Other invariances (e.g., pose,\nillumination, or weather) can be learned by applying the corresponding\ntransformations on unlabeled images during self-supervised training. We\nshowcase that a simple U-Net trained with our objective can produce local\nfeatures useful for aligning the frames of an unseen video captured with a\nmoving camera under realistic and challenging conditions. Using a carefully\ndesigned toy example, we also show that the overcomplete pixels can encode the\nidentity of objects in an image and the pixel coordinates relative to these\nobjects.\n","authors":["Leonid Pogorelyuk","Stefan T. Radev"],"pdf_url":"https://arxiv.org/pdf/2410.07410v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.24204v2","updated":"2024-11-01T16:31:22Z","published":"2024-10-31T17:57:07Z","title":"GeoSplatting: Towards Geometry Guided Gaussian Splatting for\n Physically-based Inverse Rendering","summary":" We consider the problem of physically-based inverse rendering using 3D\nGaussian Splatting (3DGS) representations. While recent 3DGS methods have\nachieved remarkable results in novel view synthesis (NVS), accurately capturing\nhigh-fidelity geometry, physically interpretable materials and lighting remains\nchallenging, as it requires precise geometry modeling to provide accurate\nsurface normals, along with physically-based rendering (PBR) techniques to\nensure correct material and lighting disentanglement. Previous 3DGS methods\nresort to approximating surface normals, but often struggle with noisy local\ngeometry, leading to inaccurate normal estimation and suboptimal\nmaterial-lighting decomposition. In this paper, we introduce GeoSplatting, a\nnovel hybrid representation that augments 3DGS with explicit geometric guidance\nand differentiable PBR equations. Specifically, we bridge isosurface and 3DGS\ntogether, where we first extract isosurface mesh from a scalar field, then\nconvert it into 3DGS points and formulate PBR equations for them in a fully\ndifferentiable manner. In GeoSplatting, 3DGS is grounded on the mesh geometry,\nenabling precise surface normal modeling, which facilitates the use of PBR\nframeworks for material decomposition. This approach further maintains the\nefficiency and quality of NVS from 3DGS while ensuring accurate geometry from\nthe isosurface. 
Comprehensive evaluations across diverse datasets demonstrate\nthe superiority of GeoSplatting, consistently outperforming existing methods\nboth quantitatively and qualitatively.\n","authors":["Kai Ye","Chong Gao","Guanbin Li","Wenzheng Chen","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2410.24204v2.pdf","comment":"Project page: https://pku-vcl-geometry.github.io/GeoSplatting/"},{"id":"http://arxiv.org/abs/2406.00307v4","updated":"2024-11-01T16:26:40Z","published":"2024-06-01T05:41:12Z","title":"HENASY: Learning to Assemble Scene-Entities for Egocentric\n Video-Language Model","summary":" Current video-language models (VLMs) rely extensively on instance-level\nalignment between video and language modalities, which presents two major\nlimitations: (1) visual reasoning disobeys the natural perception that humans\ndo in first-person perspective, leading to a lack of reasoning interpretation;\nand (2) learning is limited in capturing inherent fine-grained relationships\nbetween two modalities.\n In this paper, we take an inspiration from human perception and explore a\ncompositional approach for egocentric video representation. We introduce HENASY\n(Hierarchical ENtities ASsemblY), which includes a spatiotemporal token\ngrouping mechanism to explicitly assemble dynamically evolving scene entities\nthrough time and model their relationship for video representation. By\nleveraging compositional structure understanding, HENASY possesses strong\ninterpretability via visual grounding with free-form text queries. We further\nexplore a suite of multi-grained contrastive losses to facilitate\nentity-centric understandings. This comprises three alignment types:\nvideo-narration, noun-entity, verb-entities alignments.\n Our method demonstrates strong interpretability in both quantitative and\nqualitative experiments; while maintaining competitive performances on five\ndownstream tasks via zero-shot transfer or as video/text representation,\nincluding video/text retrieval, action recognition, multi-choice query, natural\nlanguage query, and moments query.\n","authors":["Khoa Vo","Thinh Phan","Kashu Yamazaki","Minh Tran","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2406.00307v4.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2312.14556v3","updated":"2024-11-01T16:12:52Z","published":"2023-12-22T09:29:45Z","title":"CaptainCook4D: A Dataset for Understanding Errors in Procedural\n Activities","summary":" Following step-by-step procedures is an essential component of various\nactivities carried out by individuals in their daily lives. These procedures\nserve as a guiding framework that helps to achieve goals efficiently, whether\nit is assembling furniture or preparing a recipe. However, the complexity and\nduration of procedural activities inherently increase the likelihood of making\nerrors. Understanding such procedural activities from a sequence of frames is a\nchallenging task that demands an accurate interpretation of visual information\nand the ability to reason about the structure of the activity. To this end, we\ncollect a new egocentric 4D dataset, CaptainCook4D, comprising 384 recordings\n(94.5 hours) of people performing recipes in real kitchen environments. This\ndataset consists of two distinct types of activity: one in which participants\nadhere to the provided recipe instructions and another in which they deviate\nand induce errors. 
We provide 5.3K step annotations and 10K fine-grained action\nannotations and benchmark the dataset for the following tasks: supervised error\nrecognition, multistep localization, and procedure learning.\n","authors":["Rohith Peddi","Shivvrat Arya","Bharath Challa","Likhitha Pallapothula","Akshay Vyas","Bhavya Gouripeddi","Jikai Wang","Qifan Zhang","Vasundhara Komaragiri","Eric Ragan","Nicholas Ruozzi","Yu Xiang","Vibhav Gogate"],"pdf_url":"https://arxiv.org/pdf/2312.14556v3.pdf","comment":"Accepted to the 2024 Neural Information Processing Systems Datasets\n and Benchmarks Track, Project Page:\n https://captaincook4d.github.io/captain-cook/"},{"id":"http://arxiv.org/abs/2410.19869v2","updated":"2024-11-01T16:02:47Z","published":"2024-10-24T00:12:20Z","title":"Comparing YOLO11 and YOLOv8 for instance segmentation of occluded and\n non-occluded immature green fruits in complex orchard environment","summary":" This study conducted a comprehensive performance evaluation of YOLO11 and\nYOLOv8, the latest in the \"You Only Look Once\" (YOLO) series, focusing on their\ninstance segmentation capabilities for immature green apples in orchard\nenvironments. YOLO11n-seg achieved the highest mask precision across all\ncategories with a notable score of 0.831, highlighting its effectiveness in\nfruit detection. YOLO11m-seg and YOLO11l-seg excelled in non-occluded and\noccluded fruitlet segmentation with scores of 0.851 and 0.829, respectively.\nAdditionally, YOLO11x-seg led in mask recall for all categories, achieving a\nscore of 0.815, with YOLO11m-seg performing best for non-occluded immature\ngreen fruitlets at 0.858 and YOLOv8x-seg leading the occluded category with\n0.800. In terms of mean average precision at a 50\% intersection over union\n(mAP@50), YOLO11m-seg consistently outperformed, registering the highest scores\nfor both box and mask segmentation, at 0.876 and 0.860 for the \"All\" class and\n0.908 and 0.909 for non-occluded immature fruitlets, respectively. YOLO11l-seg\nand YOLOv8l-seg shared the top box mAP@50 for occluded immature fruitlets at\n0.847, while YOLO11m-seg achieved the highest mask mAP@50 of 0.810. Despite the\nadvancements in YOLO11, YOLOv8n surpassed its counterparts in image processing\nspeed, with an impressive inference speed of 3.3 milliseconds, compared to the\nfastest YOLO11 series model at 4.8 milliseconds, underscoring its suitability\nfor real-time agricultural applications related to complex green fruit\nenvironments.\n","authors":["Ranjan Sapkota","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2410.19869v2.pdf","comment":"16 Pages, 10 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2409.00877v2","updated":"2024-11-01T15:41:56Z","published":"2024-09-02T00:11:48Z","title":"Digital Twins in Additive Manufacturing: A Systematic Review","summary":" Digital Twins (DTs) are becoming popular in Additive Manufacturing (AM) due\nto their ability to create virtual replicas of physical components of AM\nmachines, which helps in real-time production monitoring. Advanced techniques\nsuch as Machine Learning (ML), Augmented Reality (AR), and simulation-based\nmodels play key roles in developing intelligent and adaptable DTs in\nmanufacturing processes. However, questions remain regarding scalability, the\nintegration of high-quality data, and the computational power required for\nreal-time applications in developing DTs. Understanding the current state of\nDTs in AM is essential to address these challenges and fully utilize their\npotential in advancing AM processes.
Considering this opportunity, this work\naims to provide a comprehensive overview of DTs in AM by addressing the\nfollowing four research questions: (1) What are the key types of DTs used in AM\nand their specific applications? (2) What are the recent developments and\nimplementations of DTs? (3) How are DTs employed in process improvement and\nhybrid manufacturing? (4) How are DTs integrated with Industry 4.0\ntechnologies? By discussing current applications and techniques, we aim to\noffer a better understanding and potential future research directions for\nresearchers and practitioners in AM and DTs.\n","authors":["Md Manjurul Ahsan","Yingtao Liu","Shivakumar Raman","Zahed Siddique"],"pdf_url":"https://arxiv.org/pdf/2409.00877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15615v4","updated":"2024-11-01T15:13:01Z","published":"2023-07-28T15:22:34Z","title":"A survey on deep learning in medical image registration: new\n technologies, uncertainty, evaluation metrics, and beyond","summary":" Deep learning technologies have dramatically reshaped the field of medical\nimage registration over the past decade. The initial developments, such as\nregression-based and U-Net-based networks, established the foundation for deep\nlearning in image registration. Subsequent progress has been made in various\naspects of deep learning-based registration, including similarity measures,\ndeformation regularizations, network architectures, and uncertainty estimation.\nThese advancements have not only enriched the field of image registration but\nhave also facilitated its application in a wide range of tasks, including atlas\nconstruction, multi-atlas segmentation, motion estimation, and 2D-3D\nregistration. In this paper, we present a comprehensive overview of the most\nrecent advancements in deep learning-based image registration. We begin with a\nconcise introduction to the core concepts of deep learning-based image\nregistration. Then, we delve into innovative network architectures, loss\nfunctions specific to registration, and methods for estimating registration\nuncertainty. Additionally, this paper explores appropriate evaluation metrics\nfor assessing the performance of deep learning models in registration tasks.\nFinally, we highlight the practical applications of these novel techniques in\nmedical imaging and discuss the future prospects of deep learning-based image\nregistration.\n","authors":["Junyu Chen","Yihao Liu","Shuwen Wei","Zhangxing Bian","Shalini Subramanian","Aaron Carass","Jerry L. Prince","Yong Du"],"pdf_url":"https://arxiv.org/pdf/2307.15615v4.pdf","comment":"Accepted to Medical Image Analysis ((c) MedIA). A list of\n open-sourced code from the papers reviewed has been organized and is\n available at https://bit.ly/3QgFJ9z"},{"id":"http://arxiv.org/abs/2406.08773v3","updated":"2024-11-01T14:55:50Z","published":"2024-06-13T03:05:36Z","title":"DenoiseRep: Denoising Model for Representation Learning","summary":" The denoising model has been proven a powerful generative model but has\nlittle exploration of discriminative tasks. Representation learning is\nimportant in discriminative tasks, which is defined as \"learning\nrepresentations (or features) of the data that make it easier to extract useful\ninformation when building classifiers or other predictors\". 
In this paper, we\npropose a novel Denoising Model for Representation Learning (DenoiseRep) to\nimprove feature discrimination with joint feature extraction and denoising.\nDenoiseRep views each embedding layer in a backbone as a denoising layer,\nprocessing the cascaded embedding layers as if recursively denoising\nfeatures step-by-step. This unifies the frameworks of feature extraction and\ndenoising, where the former progressively embeds features from low-level to\nhigh-level, and the latter recursively denoises features step-by-step. After\nthat, DenoiseRep fuses the parameters of feature extraction and denoising\nlayers, and theoretically demonstrates its equivalence before and after the\nfusion, thus making feature denoising computation-free. DenoiseRep is a\nlabel-free algorithm that incrementally improves features and is also\ncomplementary to labels when available. Experimental results on various\ndiscriminative vision tasks, including re-identification (Market-1501,\nDukeMTMC-reID, MSMT17, CUHK-03, vehicleID), image classification (ImageNet,\nUB200, Oxford-Pet, Flowers), object detection (COCO), and image segmentation\n(ADE20K) show stability and impressive improvements. We also validate its\neffectiveness on the CNN (ResNet) and Transformer (ViT, Swin, Vmamba)\narchitectures.\n","authors":["Zhengrui Xu","Guan'an Wang","Xiaowen Huang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2406.08773v3.pdf","comment":"Accepted by NeurIPS 2024, oral"},{"id":"http://arxiv.org/abs/2312.03701v4","updated":"2024-11-01T14:48:57Z","published":"2023-12-06T18:59:31Z","title":"Return of Unconditional Generation: A Self-supervised Representation\n Generation Method","summary":" Unconditional generation -- the problem of modeling data distribution without\nrelying on human-annotated labels -- is a long-standing and fundamental\nchallenge in generative models, creating the potential of learning from\nlarge-scale unlabeled data. In the literature, the generation quality of an\nunconditional method has been much worse than that of its conditional\ncounterpart. This gap can be attributed to the lack of semantic information\nprovided by labels. In this work, we show that one can close this gap by\ngenerating semantic representations in the representation space produced by a\nself-supervised encoder. These representations can be used to condition the\nimage generator. This framework, called Representation-Conditioned Generation\n(RCG), provides an effective solution to the unconditional generation problem\nwithout using labels. Through comprehensive experiments, we observe that RCG\nsignificantly improves unconditional generation quality: e.g., it achieves a\nnew state-of-the-art FID of 2.15 on ImageNet 256x256, largely reducing the\nprevious best of 5.91 by a relative 64%. Our unconditional results are situated\nin the same tier as the leading class-conditional ones. We hope these\nencouraging observations will attract the community's attention to the\nfundamental problem of unconditional generation.
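The RCG pipeline described above can be caricatured in a few lines: sample a semantic representation from a representation generator, then condition the image generator on it. All modules below are untrained stand-ins that only show the data flow, not the paper's actual diffusion-based components:

```python
# Data-flow sketch of representation-conditioned generation.
import torch
import torch.nn as nn

d_rep, d_img = 64, 3 * 8 * 8
# Stand-in for a generator trained on self-supervised representations.
rep_generator = nn.Sequential(nn.Linear(16, 128), nn.ReLU(), nn.Linear(128, d_rep))
# Stand-in for an image generator conditioned on the sampled representation.
image_generator = nn.Sequential(nn.Linear(d_rep + 16, 256), nn.ReLU(), nn.Linear(256, d_img))

z1, z2 = torch.randn(1, 16), torch.randn(1, 16)
rep = rep_generator(z1)                                   # unconditional sample in rep space
img = image_generator(torch.cat([rep, z2], dim=-1)).view(1, 3, 8, 8)
print(img.shape)  # torch.Size([1, 3, 8, 8])
```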
Code is available at\nhttps://github.com/LTH14/rcg.\n","authors":["Tianhong Li","Dina Katabi","Kaiming He"],"pdf_url":"https://arxiv.org/pdf/2312.03701v4.pdf","comment":"Neurips 2024 (Oral)"},{"id":"http://arxiv.org/abs/2312.07955v2","updated":"2024-11-01T14:45:44Z","published":"2023-12-13T08:01:15Z","title":"Erasing Self-Supervised Learning Backdoor by Cluster Activation Masking","summary":" Self-Supervised Learning (SSL) is an effective paradigm for learning\nrepresentations from unlabeled data, such as text, images, and videos. However,\nresearchers have recently found that SSL is vulnerable to backdoor attacks. The\nattacker can embed hidden SSL backdoors via a few poisoned examples in the\ntraining dataset and maliciously manipulate the behavior of downstream models.\nTo defend against SSL backdoor attacks, a feasible route is to detect and\nremove the poisonous samples in the training set. However, the existing SSL\nbackdoor defense method fails to detect the poisonous samples precisely. In\nthis paper, we propose to erase the SSL backdoor by cluster activation masking\nand propose a novel PoisonCAM method. After obtaining the threat model trained\non the poisoned dataset, our method can precisely detect poisonous samples\nbased on the assumption that masking the backdoor trigger can effectively\nchange the activation of a downstream clustering model. In experiments, our\nPoisonCAM achieves 96\\% accuracy for backdoor trigger detection compared to 3\\%\nof the state-of-the-art method on poisoned ImageNet-100. Moreover, our proposed\nPoisonCAM significantly improves the performance of the trained SSL model under\nbackdoor attacks compared to the state-of-the-art method. Our code, data, and\ntrained models will be open once this paper is accepted.\n","authors":["Shengsheng Qian","Dizhan Xue","Yifei Wang","Shengjie Zhang","Huaiwen Zhang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11838v3","updated":"2024-11-01T14:45:36Z","published":"2024-06-17T17:59:58Z","title":"Autoregressive Image Generation without Vector Quantization","summary":" Conventional wisdom holds that autoregressive models for image generation are\ntypically accompanied by vector-quantized tokens. We observe that while a\ndiscrete-valued space can facilitate representing a categorical distribution,\nit is not a necessity for autoregressive modeling. In this work, we propose to\nmodel the per-token probability distribution using a diffusion procedure, which\nallows us to apply autoregressive models in a continuous-valued space. Rather\nthan using categorical cross-entropy loss, we define a Diffusion Loss function\nto model the per-token probability. This approach eliminates the need for\ndiscrete-valued tokenizers. We evaluate its effectiveness across a wide range\nof cases, including standard autoregressive models and generalized masked\nautoregressive (MAR) variants. By removing vector quantization, our image\ngenerator achieves strong results while enjoying the speed advantage of\nsequence modeling. We hope this work will motivate the use of autoregressive\ngeneration in other continuous-valued domains and applications. Code is\navailable at: https://github.com/LTH14/mar.\n","authors":["Tianhong Li","Yonglong Tian","He Li","Mingyang Deng","Kaiming He"],"pdf_url":"https://arxiv.org/pdf/2406.11838v3.pdf","comment":"Neurips 2024 (Spotlight). 
Code: https://github.com/LTH14/mar"},{"id":"http://arxiv.org/abs/2407.15794v4","updated":"2024-11-01T14:19:14Z","published":"2024-07-22T16:52:32Z","title":"Disentangling spatio-temporal knowledge for weakly supervised object\n detection and segmentation in surgical video","summary":" Weakly supervised video object segmentation (WSVOS) enables the\nidentification of segmentation maps without requiring an extensive training\ndataset of object masks, relying instead on coarse video labels indicating\nobject presence. Current state-of-the-art methods either require multiple\nindependent stages of processing that employ motion cues or, in the case of\nend-to-end trainable networks, lack segmentation accuracy, in part due to\nthe difficulty of learning segmentation maps from videos with transient object\npresence. This limits the application of WSVOS for semantic annotation of\nsurgical videos where multiple surgical tools frequently move in and out of the\nfield of view, a problem that is more difficult than typically encountered in\nWSVOS. This paper introduces Video Spatio-Temporal Disentanglement Networks\n(VDST-Net), a framework to disentangle spatiotemporal information using\nsemi-decoupled knowledge distillation to predict high-quality class activation\nmaps (CAMs). A teacher network designed to resolve temporal conflicts when\nspecifics about object location and timing in the video are not provided works\nwith a student network that integrates information over time by leveraging\ntemporal dependencies. We demonstrate the efficacy of our framework on a public\nreference dataset and on a more challenging surgical video dataset where\nobjects are, on average, present in less than 60\% of annotated frames. Our\nmethod outperforms state-of-the-art techniques and generates superior\nsegmentation masks under video-level weak supervision.\n","authors":["Guiqiu Liao","Matjaz Jogan","Sai Koushik","Eric Eaton","Daniel A. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2407.15794v4.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV)"},{"id":"http://arxiv.org/abs/2310.16020v3","updated":"2024-11-01T13:59:05Z","published":"2023-10-24T17:30:26Z","title":"ConvBKI: Real-Time Probabilistic Semantic Mapping Network with\n Quantifiable Uncertainty","summary":" In this paper, we develop a modular neural network for real-time\n(> 10 Hz) semantic mapping in uncertain environments, which\nexplicitly updates per-voxel probabilistic distributions within a neural\nnetwork layer. Our approach combines the reliability of classical probabilistic\nalgorithms with the performance and efficiency of modern neural networks.\nAlthough robotic perception is often divided between modern differentiable\nmethods and classical explicit methods, a union of both is necessary for\nreal-time and trustworthy performance. We introduce a novel Convolutional\nBayesian Kernel Inference (ConvBKI) layer which incorporates semantic\nsegmentation predictions online into a 3D map through a depthwise convolution\nlayer by leveraging conjugate priors. We compare ConvBKI against\nstate-of-the-art deep learning approaches and probabilistic algorithms for\nmapping to evaluate reliability and performance.
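A toy 2D analogue of the ConvBKI update just described, assuming Dirichlet-distributed per-cell class probabilities; the actual layer operates on 3D voxel maps with learned kernels, so this is a simplified illustration only:

```python
# Accumulate semantic probabilities into per-cell Dirichlet concentration
# parameters via a depthwise convolution (conjugate update).
import torch
import torch.nn.functional as F

C, H, W = 3, 8, 8                                      # classes, map height/width
alpha = torch.ones(1, C, H, W)                          # Dirichlet prior per cell
preds = torch.softmax(torch.randn(1, C, H, W), dim=1)   # incoming semantic predictions
kernel = torch.ones(C, 1, 3, 3) / 9                     # depthwise spatial smoothing kernel

alpha = alpha + F.conv2d(preds, kernel, padding=1, groups=C)  # conjugate Dirichlet update
posterior_mean = alpha / alpha.sum(dim=1, keepdim=True)       # per-cell class probabilities
evidence = alpha.sum(dim=1)                                   # higher = more observations
print(posterior_mean.shape, evidence.mean().item())
```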
We also create a Robot\nOperating System (ROS) package of ConvBKI and test it on real-world\nperceptually challenging off-road driving data.\n","authors":["Joey Wilson","Yuewei Fu","Joshua Friesen","Parker Ewen","Andrew Capodieci","Paramsothy Jayakumar","Kira Barton","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2310.16020v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.10663"},{"id":"http://arxiv.org/abs/2311.12056v3","updated":"2024-11-01T12:54:28Z","published":"2023-11-18T13:55:05Z","title":"Kuro Siwo: 33 billion $m^2$ under the water. A global multi-temporal\n satellite dataset for rapid flood mapping","summary":" Global floods, exacerbated by climate change, pose severe threats to human\nlife, infrastructure, and the environment. Recent catastrophic events in\nPakistan and New Zealand underscore the urgent need for precise flood mapping\nto guide restoration efforts, understand vulnerabilities, and prepare for\nfuture occurrences. While Synthetic Aperture Radar (SAR) remote sensing offers\nday-and-night, all-weather imaging capabilities, its application in deep\nlearning for flood segmentation is limited by the lack of large annotated\ndatasets. To address this, we introduce Kuro Siwo, a manually annotated\nmulti-temporal dataset, spanning 43 flood events globally. Our dataset maps\nmore than 338 billion $m^2$ of land, with 33 billion designated as either\nflooded areas or permanent water bodies. Kuro Siwo includes a highly processed\nproduct optimized for flood mapping based on SAR Ground Range Detected, and a\nprimal SAR Single Look Complex product with minimal preprocessing, designed to\npromote research on the exploitation of both the phase and amplitude\ninformation and to offer maximum flexibility for downstream task preprocessing.\nTo leverage advances in large scale self-supervised pretraining methods for\nremote sensing data, we augment Kuro Siwo with a large unlabeled set of SAR\nsamples. Finally, we provide an extensive benchmark, namely BlackBench,\noffering strong baselines for a diverse set of flood events from Europe,\nAmerica, Africa, Asia and Australia.\n","authors":["Nikolaos Ioannis Bountos","Maria Sdraka","Angelos Zavras","Ilektra Karasante","Andreas Karavias","Themistocles Herekakis","Angeliki Thanasou","Dimitrios Michail","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2311.12056v3.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2409.15246v3","updated":"2024-11-01T12:49:19Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems are crucial for cartography, disaster\nsurveillance, and resource administration. Nonetheless, they encounter\nconsiderable obstacles in the processing and transmission of extensive data,\nespecially in specialized domains such as precision agriculture and real-time\ndisaster response. Earth observation satellites, outfitted with remote sensing\ntechnology, gather data from onboard sensors and IoT-enabled terrestrial\nobjects, delivering important information remotely. Domain-adapted Large\nLanguage Models (LLMs) provide a solution by enabling the integration of raw\nand processed EO data. 
Through domain adaptation, LLMs improve the assimilation\nand analysis of many data sources, tackling the intricacies of specialized\ndatasets in agriculture and disaster response. This data synthesis, directed by\nLLMs, enhances the precision and pertinence of conveyed information. This study\nprovides a thorough examination of using semantic inference and deep learning\nfor sophisticated EO systems. It presents an innovative architecture for\nsemantic communication in EO satellite networks, designed to improve data\ntransmission efficiency using semantic processing methodologies. Recent\nadvancements in onboard processing technologies enable dependable, adaptable,\nand energy-efficient data management in orbit. These improvements guarantee\nreliable performance in adverse space circumstances using radiation-hardened\nand reconfigurable technology. Collectively, these advancements enable\nnext-generation satellite missions with improved processing capabilities,\ncrucial for operational flexibility and real-time decision-making in 6G\nsatellite communication.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v3.pdf","comment":"17 pages, 7 figures, Journal"},{"id":"http://arxiv.org/abs/2405.14864v2","updated":"2024-11-01T12:46:26Z","published":"2024-05-23T17:59:40Z","title":"Video Diffusion Models are Training-free Motion Interpreter and\n Controller","summary":" Video generation primarily aims to model authentic and customized motion\nacross frames, making understanding and controlling the motion a crucial topic.\nMost diffusion-based studies on video motion focus on motion customization with\ntraining-based paradigms, which, however, demands substantial training\nresources and necessitates retraining for diverse models. Crucially, these\napproaches do not explore how video diffusion models encode cross-frame motion\ninformation in their features, lacking interpretability and transparency in\ntheir effectiveness. To answer this question, this paper introduces a novel\nperspective to understand, localize, and manipulate motion-aware features in\nvideo diffusion models. Through analysis using Principal Component Analysis\n(PCA), our work discloses that robust motion-aware feature already exists in\nvideo diffusion models. We present a new MOtion FeaTure (MOFT) by eliminating\ncontent correlation information and filtering motion channels. MOFT provides a\ndistinct set of benefits, including the ability to encode comprehensive motion\ninformation with clear interpretability, extraction without the need for\ntraining, and generalizability across diverse architectures. Leveraging MOFT,\nwe propose a novel training-free video motion control framework. 
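A hedged sketch of the MOFT-style extraction described above: remove content correlation by centering features over time, then keep the most dynamic channels. The variance ranking here is a simple stand-in for the paper's PCA-based analysis; all shapes are illustrative.

```python
import numpy as np

def motion_feature(feats, keep_ratio=0.25):
    """feats: (T, C, H, W) intermediate diffusion features for T frames."""
    T, C, H, W = feats.shape
    centered = feats - feats.mean(axis=0, keepdims=True)  # drop static content
    channel_var = centered.var(axis=(0, 2, 3))            # motion energy per channel
    k = max(1, int(keep_ratio * C))
    motion_channels = np.argsort(channel_var)[-k:]        # most dynamic channels
    return centered[:, motion_channels]                   # (T, k, H, W) motion feature

feats = np.random.default_rng(2).standard_normal((8, 64, 16, 16))
moft = motion_feature(feats)
print(moft.shape)  # (8, 16, 16, 16)
```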
Our method\ndemonstrates competitive performance in generating natural and faithful motion,\nproviding architecture-agnostic insights and applicability in a variety of\ndownstream tasks.\n","authors":["Zeqi Xiao","Yifan Zhou","Shuai Yang","Xingang Pan"],"pdf_url":"https://arxiv.org/pdf/2405.14864v2.pdf","comment":"Project Page: https://xizaoqu.github.io/moft/"},{"id":"http://arxiv.org/abs/2410.20883v2","updated":"2024-11-01T12:42:49Z","published":"2024-10-28T10:04:40Z","title":"Improving Generalization in Visual Reasoning via Self-Ensemble","summary":" The cognitive faculty of visual reasoning necessitates the integration of\nmultimodal perceptual processing and commonsense and external knowledge of the\nworld. In recent years, a plethora of large vision-language models (LVLMs) have\nbeen proposed, demonstrating outstanding power and exceptional proficiency in\ncommonsense reasoning across diverse domains and tasks. Nevertheless, training\nsuch LVLMs requires a lot of costly resources. Recent approaches, instead of\ntraining LVLMs from scratch on various large datasets, focus on exploring ways\nto take advantage of the capabilities of many different LVLMs, such as ensemble\nmethods. In this work, we propose self-ensemble, a novel method that improves\nthe generalization and visual reasoning of the model without updating any\nparameters, a training-free method. Our key insight is that we realized that\nLVLM itself can ensemble without the need for any other LVLMs, which helps to\nunlock their internal capabilities. Extensive experiments on various benchmarks\ndemonstrate the effectiveness of our method in achieving state-of-the-art\n(SOTA) performance on SketchyVQA, Outside Knowledge VQA, and\nout-of-distribution VQA tasks.\n","authors":["Tien-Huy Nguyen","Quang-Khai Tran","Anh-Tuan Quang-Hoang"],"pdf_url":"https://arxiv.org/pdf/2410.20883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23831v2","updated":"2024-11-01T12:11:29Z","published":"2024-10-31T11:21:21Z","title":"FRoundation: Are Foundation Models Ready for Face Recognition?","summary":" Foundation models are predominantly trained in an unsupervised or\nself-supervised manner on highly diverse and large-scale datasets, making them\nbroadly applicable to various downstream tasks. In this work, we investigate\nfor the first time whether such models are suitable for the specific domain of\nface recognition. We further propose and demonstrate the adaptation of these\nmodels for face recognition across different levels of data availability.\nExtensive experiments are conducted on multiple foundation models and datasets\nof varying scales for training and fine-tuning, with evaluation on a wide range\nof benchmarks. Our results indicate that, despite their versatility,\npre-trained foundation models underperform in face recognition compared to\nsimilar architectures trained specifically for this task. However, fine-tuning\nfoundation models yields promising results, often surpassing models trained\nfrom scratch when training data is limited. Even with access to large-scale\nface recognition training datasets, fine-tuned foundation models perform\ncomparably to models trained from scratch, but with lower training\ncomputational costs and without relying on the assumption of extensive data\navailability. 
Our analysis also explores bias in face recognition, with\nslightly higher bias observed in some settings when using foundation models.\n","authors":["Tahar Chettaoui","Naser Damer","Fadi Boutros"],"pdf_url":"https://arxiv.org/pdf/2410.23831v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10188v5","updated":"2024-11-01T10:57:37Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models,\nespecially for long video understanding. We introduce LongVILA, a full-stack\nsolution for long-context visual-language models by co-designing the algorithm\nand system. For model training, we upgrade existing VLMs to support long video\nunderstanding by incorporating two additional stages, i.e., long context\nextension and long video supervised fine-tuning. However, training on long\nvideo is computationally and memory intensive. We introduce the long-context\nMulti-Modal Sequence Parallelism (MM-SP) system that efficiently parallelizes\nlong video training and inference, enabling 2M context length training on 256\nGPUs without any gradient checkpointing. LongVILA efficiently extends the\nnumber of video frames of VILA from 8 to 2048, improving the long video\ncaptioning score from 2.00 to 3.26 (out of 5), achieving 99.8% accuracy in\n6,000-frame (more than 1 million tokens) video needle-in-a-haystack.\nLongVILA-7B demonstrates strong accuracy on the VideoMME benchmark, i.e., 61.8%\nwith subtitle. Besides, MM-SP is 2.1x - 5.7x faster than ring style sequence\nparallelism and 1.1x - 1.4x faster than Megatron with a hybrid context and\ntensor parallelism. Moreover, it seamlessly integrates with Hugging Face\nTransformers.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v5.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2410.20359v2","updated":"2024-11-01T09:33:29Z","published":"2024-10-27T07:25:11Z","title":"Conditional GAN for Enhancing Diffusion Models in Efficient and\n Authentic Global Gesture Generation from Audios","summary":" Audio-driven simultaneous gesture generation is vital for human-computer\ncommunication, AI games, and film production. While previous research has shown\npromise, there are still limitations. Methods based on VAEs are accompanied by\nissues of local jitter and global instability, whereas methods based on\ndiffusion models are hampered by low generation efficiency. This is because the\ndenoising process of DDPM in the latter relies on the assumption that the noise\nadded at each step is sampled from a unimodal distribution, and the noise\nvalues are small. DDIM borrows the idea from the Euler method for solving\ndifferential equations, disrupts the Markov chain process, and increases the\nnoise step size to reduce the number of denoising steps, thereby accelerating\ngeneration. However, simply increasing the step size during the step-by-step\ndenoising process causes the results to gradually deviate from the original\ndata distribution, leading to a significant drop in the quality of the\ngenerated actions and the emergence of unnatural artifacts. 
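The trade-off discussed above (fewer, larger denoising steps drifting away from the data distribution) can be seen in a bare-bones deterministic DDIM-style sampler; the schedule and the zero-noise placeholder model are assumptions for illustration.

```python
import numpy as np

def ddim_sample(x, eps_model, alpha_bars, num_steps):
    """Deterministic DDIM-style sampling: shrinking num_steps widens each
    jump over the same noise schedule, trading fidelity for speed."""
    ts = np.linspace(len(alpha_bars) - 1, 0, num_steps).astype(int)
    for t, t_prev in zip(ts[:-1], ts[1:]):
        a_t, a_prev = alpha_bars[t], alpha_bars[t_prev]
        eps = eps_model(x, t)                                  # predicted noise
        x0 = (x - np.sqrt(1 - a_t) * eps) / np.sqrt(a_t)       # predicted clean sample
        x = np.sqrt(a_prev) * x0 + np.sqrt(1 - a_prev) * eps   # deterministic jump
    return x

T = 1000
alpha_bars = np.cumprod(1.0 - np.linspace(1e-4, 0.02, T))
x = np.random.default_rng(3).standard_normal(16)
fast = ddim_sample(x, lambda x, t: np.zeros_like(x), alpha_bars, num_steps=10)
```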
In this paper, we\nbreak the assumptions of DDPM and achieve breakthrough progress in denoising\nspeed and fidelity. Specifically, we introduce a conditional GAN to capture\naudio control signals and implicitly match the multimodal denoising\ndistribution between the diffusion and denoising steps within the same sampling\nstep, aiming to sample larger noise values and apply fewer denoising steps for\nhigh-speed generation.\n","authors":["Yongkang Cheng","Mingjiang Liang","Shaoli Huang","Gaoge Han","Jifeng Ning","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2410.20359v2.pdf","comment":"Accepted by WACV 2025 (Round 1)"},{"id":"http://arxiv.org/abs/2410.20358v2","updated":"2024-11-01T09:20:53Z","published":"2024-10-27T07:19:39Z","title":"RopeTP: Global Human Motion Recovery via Integrating Robust Pose\n Estimation with Diffusion Trajectory Prior","summary":" We present RopeTP, a novel framework that combines Robust pose estimation\nwith a diffusion Trajectory Prior to reconstruct global human motion from\nvideos. At the heart of RopeTP is a hierarchical attention mechanism that\nsignificantly improves context awareness, which is essential for accurately\ninferring the posture of occluded body parts. This is achieved by exploiting\nthe relationships with visible anatomical structures, enhancing the accuracy of\nlocal pose estimations. The improved robustness of these local estimations\nallows for the reconstruction of precise and stable global trajectories.\nAdditionally, RopeTP incorporates a diffusion trajectory model that predicts\nrealistic human motion from local pose sequences. This model ensures that the\ngenerated trajectories are not only consistent with observed local actions but\nalso unfold naturally over time, thereby improving the realism and stability of\n3D human motion reconstruction. Extensive experimental validation shows that\nRopeTP surpasses current methods on two benchmark datasets, particularly\nexcelling in scenarios with occlusions. It also outperforms methods that rely\non SLAM for initial camera estimates and extensive optimization, delivering\nmore accurate and realistic trajectories.\n","authors":["Mingjiang Liang","Yongkang Cheng","Hualin Liang","Shaoli Huang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2410.20358v2.pdf","comment":"Accepted by WACV 2025 (Round 1)"},{"id":"http://arxiv.org/abs/2402.13629v3","updated":"2024-11-01T08:56:48Z","published":"2024-02-21T09:06:04Z","title":"Adversarial Purification and Fine-tuning for Robust UDC Image\n Restoration","summary":" This study delves into the enhancement of Under-Display Camera (UDC) image\nrestoration models, focusing on their robustness against adversarial attacks.\nDespite its innovative approach to seamless display integration, UDC technology\nfaces unique image degradation challenges exacerbated by the susceptibility to\nadversarial perturbations. Our research initially conducts an in-depth\nrobustness evaluation of deep-learning-based UDC image restoration models by\nemploying several white-box and black-box attacking methods. This evaluation is\npivotal in understanding the vulnerabilities of current UDC image restoration\ntechniques. Following the assessment, we introduce a defense framework\nintegrating adversarial purification with subsequent fine-tuning processes.\nFirst, our approach employs diffusion-based adversarial purification,\neffectively neutralizing adversarial perturbations. 
Then, we apply the\nfine-tuning methodologies to refine the image restoration models further,\nensuring that the quality and fidelity of the restored images are maintained.\nThe effectiveness of our proposed approach is validated through extensive\nexperiments, showing marked improvements in resilience against typical\nadversarial attacks.\n","authors":["Zhenbo Song","Zhenyuan Zhang","Kaihao Zhang","Zhaoxin Fan","Jianfeng Lu"],"pdf_url":"https://arxiv.org/pdf/2402.13629v3.pdf","comment":"Failure to meet expectations"},{"id":"http://arxiv.org/abs/2410.23629v2","updated":"2024-11-01T08:38:21Z","published":"2024-10-31T04:42:43Z","title":"Posture-Informed Muscular Force Learning for Robust Hand Pressure\n Estimation","summary":" We present PiMForce, a novel framework that enhances hand pressure estimation\nby leveraging 3D hand posture information to augment forearm surface\nelectromyography (sEMG) signals. Our approach utilizes detailed spatial\ninformation from 3D hand poses in conjunction with dynamic muscle activity from\nsEMG to enable accurate and robust whole-hand pressure measurements under\ndiverse hand-object interactions. We also developed a multimodal data\ncollection system that combines a pressure glove, an sEMG armband, and a\nmarkerless finger-tracking module. We created a comprehensive dataset from 21\nparticipants, capturing synchronized data of hand posture, sEMG signals, and\nexerted hand pressure across various hand postures and hand-object interaction\nscenarios using our collection system. Our framework enables precise hand\npressure estimation in complex and natural interaction scenarios. Our approach\nsubstantially mitigates the limitations of traditional sEMG-based or\nvision-based methods by integrating 3D hand posture information with sEMG\nsignals. Video demos, data, and code are available online.\n","authors":["Kyungjin Seo","Junghoon Seo","Hanseok Jeong","Sangpil Kim","Sang Ho Yoon"],"pdf_url":"https://arxiv.org/pdf/2410.23629v2.pdf","comment":"Accepted to NeurIPS 2024. Project Page Link:\n https://pimforce.hcitech.org/"},{"id":"http://arxiv.org/abs/2408.07832v4","updated":"2024-11-01T07:41:04Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. 
Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets. The code is\navailable (https://github.com/batmanlab/Ladder).\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Shyam Visweswaran","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23806v2","updated":"2024-11-01T07:25:38Z","published":"2024-10-31T10:46:11Z","title":"Human Action Recognition (HAR) Using Skeleton-based Spatial Temporal\n Relative Transformer Network: ST-RTR","summary":" Human Action Recognition (HAR) is an interesting research area in\nhuman-computer interaction used to monitor the activities of elderly and\ndisabled individuals affected by physical and mental health. In the recent era,\nskeleton-based HAR has received much attention because skeleton data has shown\nthat it can handle changes in striking, body size, camera views, and complex\nbackgrounds. One key characteristic of ST-GCN is automatically learning spatial\nand temporal patterns from skeleton sequences. It has some limitations, as this\nmethod only works for short-range correlation due to its limited receptive\nfield. Consequently, understanding human action requires long-range\ninterconnection. To address this issue, we developed a spatial-temporal\nrelative transformer ST-RTR model. The ST-RTR includes joint and relay nodes,\nwhich allow efficient communication and data transmission within the network.\nThese nodes help to break the inherent spatial and temporal skeleton\ntopologies, which enables the model to understand long-range human action\nbetter. Furthermore, we combine ST-RTR with a fusion model for further\nperformance improvements. To assess the performance of the ST-RTR method, we\nconducted experiments on three skeleton-based HAR benchmarks: NTU RGB+D 60, NTU\nRGB+D 120, and UAV-Human. It boosted CS and CV by 2.11 % and 1.45% on NTU RGB+D\n60, 1.25% and 1.05% on NTU RGB+D 120. On UAV-Human datasets, accuracy improved\nby 2.54%. The experimental outcomes explain that the proposed ST-RTR model\nsignificantly improves action recognition associated with the standard ST-GCN\nmethod.\n","authors":["Faisal Mehmood","Enqing Chen","Touqeer Abbas","Samah M. Alzanin"],"pdf_url":"https://arxiv.org/pdf/2410.23806v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03507v6","updated":"2024-11-01T07:04:10Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. 
DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects. Our code will be available at\nhttps://github.com/hoiliu-0801/DQ-DETR.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v6.pdf","comment":"Accepted by ECCV 2024. Our code will be available at\n https://github.com/hoiliu-0801/DQ-DETR"},{"id":"http://arxiv.org/abs/2405.17673v2","updated":"2024-11-01T06:22:30Z","published":"2024-05-27T21:50:16Z","title":"Fast Samplers for Inverse Problems in Iterative Refinement Models","summary":" Constructing fast samplers for unconditional diffusion and flow-matching\nmodels has received much attention recently; however, existing methods for\nsolving inverse problems, such as super-resolution, inpainting, or deblurring,\nstill require hundreds to thousands of iterative steps to obtain high-quality\nresults. We propose a plug-and-play framework for constructing efficient\nsamplers for inverse problems, requiring only pre-trained diffusion or\nflow-matching models. We present Conditional Conjugate Integrators, which\nleverage the specific form of the inverse problem to project the respective\nconditional diffusion/flow dynamics into a more amenable space for sampling.\nOur method complements popular posterior approximation methods for solving\ninverse problems using diffusion/flow models. We evaluate the proposed method's\nperformance on various linear image restoration tasks across multiple datasets,\nemploying diffusion and flow-matching models. Notably, on challenging inverse\nproblems like 4x super-resolution on the ImageNet dataset, our method can\ngenerate high-quality samples in as few as 5 conditional sampling steps and\noutperforms competing baselines requiring 20-1000 steps. Our code will be\npublicly available at https://github.com/mandt-lab/c-pigdm\n","authors":["Kushagra Pandey","Ruihan Yang","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2405.17673v2.pdf","comment":"43 pages, NeurIPS'24 Camera Ready"},{"id":"http://arxiv.org/abs/2405.15677v3","updated":"2024-11-01T06:19:24Z","published":"2024-05-24T16:17:35Z","title":"SMART: Scalable Multi-agent Real-time Motion Generation via Next-token\n Prediction","summary":" Data-driven autonomous driving motion generation tasks are frequently\nimpacted by the limitations of dataset size and the domain gap between\ndatasets, which precludes their extensive application in real-world scenarios.\nTo address this issue, we introduce SMART, a novel autonomous driving motion\ngeneration paradigm that models vectorized map and agent trajectory data into\ndiscrete sequence tokens. These tokens are then processed through a\ndecoder-only transformer architecture to train for the next token prediction\ntask across spatial-temporal series. This GPT-style method allows the model to\nlearn the motion distribution in real driving scenarios. SMART achieves\nstate-of-the-art performance across most of the metrics on the generative Sim\nAgents challenge, ranking 1st on the leaderboards of Waymo Open Motion Dataset\n(WOMD), demonstrating remarkable inference speed. 
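To make the GPT-style formulation above concrete, here is a toy sketch of snapping continuous motion deltas to a small discrete motion vocabulary and setting up next-token targets; the vocabulary and values are invented for illustration and are not SMART's actual tokenizer.

```python
import numpy as np

# Illustrative motion vocabulary over (dx, dy, dheading) per-step deltas.
VOCAB = np.array([[0.0, 0.0, 0.0],     # stop
                  [1.0, 0.0, 0.0],     # go straight
                  [1.0, 0.2, 0.1],     # gentle left
                  [1.0, -0.2, -0.1]])  # gentle right

def tokenize(trajectory):
    """trajectory: (T, 3) continuous motion deltas -> (T,) nearest token ids."""
    d = np.linalg.norm(trajectory[:, None, :] - VOCAB[None, :, :], axis=-1)
    return d.argmin(axis=1)

traj = np.array([[1.0, 0.15, 0.08], [1.0, 0.21, 0.1], [0.0, 0.0, 0.0]])
tokens = tokenize(traj)                # token ids, e.g. array([2, 2, 0])
# Training target: predict tokens[1:] from tokens[:-1], exactly as in
# next-token language modeling with a decoder-only transformer.
inputs, targets = tokens[:-1], tokens[1:]
```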
Moreover, SMART serves as\na generative model in the autonomous driving motion domain, exhibiting\nzero-shot generalization capabilities: Using only the NuPlan dataset for\ntraining and WOMD for validation, SMART achieved a competitive score of 0.72 on\nthe Sim Agents challenge. Lastly, we have collected over 1 billion motion\ntokens from multiple datasets, validating the model's scalability. These\nresults suggest that SMART initially exhibits two important properties:\nscalability and zero-shot generalization, and preliminarily meets the needs of\nlarge-scale real-time simulation applications. We have released all the code to\npromote the exploration of models for motion generation in the autonomous\ndriving field. The source code is available at\nhttps://github.com/rainmaker22/SMART.\n","authors":["Wei Wu","Xiaoxin Feng","Ziyan Gao","Yuheng Kan"],"pdf_url":"https://arxiv.org/pdf/2405.15677v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2401.08140v3","updated":"2024-11-01T06:12:07Z","published":"2024-01-16T06:19:18Z","title":"ProvNeRF: Modeling per Point Provenance in NeRFs as a Stochastic Field","summary":" Neural radiance fields (NeRFs) have gained popularity with multiple works\nshowing promising results across various applications. However, to the best of\nour knowledge, existing works do not explicitly model the distribution of\ntraining camera poses, or consequently the triangulation quality, a key factor\naffecting reconstruction quality dating back to classical vision literature. We\nclose this gap with ProvNeRF, an approach that models the \\textbf{provenance}\nfor each point -- i.e., the locations where it is likely visible -- of NeRFs as\na stochastic field. We achieve this by extending implicit maximum likelihood\nestimation (IMLE) to functional space with an optimizable objective. We show\nthat modeling per-point provenance during the NeRF optimization enriches the\nmodel with information on triangulation leading to improvements in novel view\nsynthesis and uncertainty estimation under the challenging sparse,\nunconstrained view setting against competitive baselines.\n","authors":["Kiyohiro Nakayama","Mikaela Angelina Uy","Yang You","Ke Li","Leonidas J. Guibas"],"pdf_url":"https://arxiv.org/pdf/2401.08140v3.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2310.01636v4","updated":"2024-11-01T05:29:34Z","published":"2023-10-02T21:02:23Z","title":"Adaptive Visual Scene Understanding: Incremental Scene Graph Generation","summary":" Scene graph generation (SGG) analyzes images to extract meaningful\ninformation about objects and their relationships. In the dynamic visual world,\nit is crucial for AI systems to continuously detect new objects and establish\ntheir relationships with existing ones. Recently, numerous studies have focused\non continual learning within the domains of object detection and image\nrecognition. However, a limited amount of research focuses on a more\nchallenging continual learning problem in SGG. This increased difficulty arises\nfrom the intricate interactions and dynamic relationships among objects, and\ntheir associated contexts. Thus, in continual learning, SGG models are often\nrequired to expand, modify, retain, and reason over scene graphs within the\nprocess of adaptive visual scene understanding. 
To systematically explore Continual\nScene Graph Generation (CSEGG), we present a comprehensive benchmark comprising\nthree learning regimes: relationship incremental, scene incremental, and\nrelationship generalization. Moreover, we introduce a ``Replays via Analysis by\nSynthesis\" method named RAS. This approach leverages the scene graphs,\ndecomposes and re-composes them to represent different scenes, and replays the\nsynthesized scenes based on these compositional scene graphs. The replayed\nsynthesized scenes act as a means to practice and refine proficiency in SGG in\nknown and unknown environments. Our experimental results not only highlight the\nchallenges of directly combining existing continual learning methods with SGG\nbackbones but also demonstrate the effectiveness of our proposed approach,\nenhancing CSEGG efficiency while simultaneously preserving privacy and memory\nusage. All data and source code are publicly available online.\n","authors":["Naitik Khandelwal","Xiao Liu","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.01636v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03918v2","updated":"2024-11-01T05:23:35Z","published":"2024-10-04T20:45:33Z","title":"STONE: A Submodular Optimization Framework for Active 3D Object\n Detection","summary":" 3D object detection is fundamentally important for various emerging\napplications, including autonomous driving and robotics. A key requirement for\ntraining an accurate 3D object detector is the availability of a large amount\nof LiDAR-based point cloud data. Unfortunately, labeling point cloud data is\nextremely challenging, as accurate 3D bounding boxes and semantic labels are\nrequired for each potential object. This paper proposes a unified active 3D\nobject detection framework, for greatly reducing the labeling cost of training\n3D object detectors. Our framework is based on a novel formulation of\nsubmodular optimization, specifically tailored to the problem of active 3D\nobject detection. In particular, we address two fundamental challenges\nassociated with active 3D object detection: data imbalance and the need to\ncover the distribution of the data, including LiDAR-based point cloud data of\nvarying difficulty levels. Extensive experiments demonstrate that our method\nachieves state-of-the-art performance with high computational efficiency\ncompared to existing active learning methods. The code is available at\nhttps://github.com/RuiyuM/STONE.\n","authors":["Ruiyu Mao","Sarthak Kumar Maharana","Rishabh K Iyer","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2410.03918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00986v2","updated":"2024-11-01T05:03:19Z","published":"2024-04-01T08:18:38Z","title":"Make Continual Learning Stronger via C-Flat","summary":" Model generalization ability upon incrementally acquiring dynamically\nupdating knowledge from sequentially arriving tasks is crucial to tackle the\nsensitivity-stability dilemma in Continual Learning (CL). Weight loss landscape\nsharpness minimization seeking for flat minima lying in neighborhoods with\nuniform low loss or smooth gradient is proven to be a strong training regime\nimproving model generalization compared with loss minimization based optimizer\nlike SGD. Yet only a few works have discussed this training regime for CL,\nproving that dedicated designed zeroth-order sharpness optimizer can improve CL\nperformance. In this work, we propose a Continual Flatness (C-Flat) method\nfeaturing a flatter loss landscape tailored for CL. 
C-Flat could be easily\ncalled with only one line of code and is plug-and-play to any CL methods. A\ngeneral framework of C-Flat applied to all CL categories and a thorough\ncomparison with loss minima optimizer and flat minima based CL approaches is\npresented in this paper, showing that our method can boost CL performance in\nalmost all cases. Code is available at https://github.com/WanNaa/C-Flat.\n","authors":["Ang Bian","Wei Li","Hangjie Yuan","Chengrong Yu","Mang Wang","Zixiang Zhao","Aojun Lu","Pengliang Ji","Tao Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12470v2","updated":"2024-11-01T04:59:31Z","published":"2024-09-19T05:17:44Z","title":"HSIGene: A Foundation Model For Hyperspectral Image Generation","summary":" Hyperspectral image (HSI) plays a vital role in various fields such as\nagriculture and environmental monitoring. However, due to the expensive\nacquisition cost, the number of hyperspectral images is limited, degenerating\nthe performance of downstream tasks. Although some recent studies have\nattempted to employ diffusion models to synthesize HSIs, they still struggle\nwith the scarcity of HSIs, affecting the reliability and diversity of the\ngenerated images. Some studies propose to incorporate multi-modal data to\nenhance spatial diversity, but the spectral fidelity cannot be ensured. In\naddition, existing HSI synthesis models are typically uncontrollable or only\nsupport single-condition control, limiting their ability to generate accurate\nand reliable HSIs. To alleviate these issues, we propose HSIGene, a novel HSI\ngeneration foundation model which is based on latent diffusion and supports\nmulti-condition control, allowing for more precise and reliable HSI generation.\nTo enhance the spatial diversity of the training data while preserving spectral\nfidelity, we propose a new data augmentation method based on spatial\nsuper-resolution, in which HSIs are upscaled first, and thus abundant training\npatches could be obtained by cropping the high-resolution HSIs. In addition, to\nimprove the perceptual quality of the augmented data, we introduce a novel\ntwo-stage HSI super-resolution framework, which first applies RGB bands\nsuper-resolution and then utilizes our proposed Rectangular Guided Attention\nNetwork (RGAN) for guided HSI super-resolution. Experiments demonstrate that\nthe proposed model is capable of generating a vast quantity of realistic HSIs\nfor downstream tasks such as denoising and super-resolution. The code and\nmodels are available at https://github.com/LiPang/HSIGene.\n","authors":["Li Pang","Xiangyong Cao","Datao Tang","Shuang Xu","Xueru Bai","Feng Zhou","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2409.12470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20474v2","updated":"2024-11-01T04:33:52Z","published":"2024-10-27T15:30:45Z","title":"GrounDiT: Grounding Diffusion Transformers via Noisy Patch\n Transplantation","summary":" We introduce GrounDiT, a novel training-free spatial grounding technique for\ntext-to-image generation using Diffusion Transformers (DiT). Spatial grounding\nwith bounding boxes has gained attention for its simplicity and versatility,\nallowing for enhanced user control in image generation. 
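As a sketch of the flat-minima training regime that C-Flat (described above) builds on, the following SAM-style two-step update perturbs weights toward a nearby high-loss point and descends using the gradient evaluated there; this is a generic illustration of sharpness-aware optimization, not C-Flat's actual objective.

```python
import numpy as np

def flat_minima_step(w, grad_fn, lr=0.1, rho=0.05):
    """One sharpness-aware update: seek neighborhoods with uniform low loss
    by descending on the gradient at a worst-case nearby point."""
    g = grad_fn(w)
    g_norm = np.linalg.norm(g) + 1e-12
    w_adv = w + rho * g / g_norm   # ascend to the worst nearby point
    g_flat = grad_fn(w_adv)        # gradient at the perturbed weights
    return w - lr * g_flat         # flatness-aware descent step

loss_grad = lambda w: 2 * w        # gradient of a toy quadratic loss
w = np.array([3.0, -2.0])
for _ in range(50):
    w = flat_minima_step(w, loss_grad)
```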
However, prior\ntraining-free approaches often rely on updating the noisy image during the\nreverse diffusion process via backpropagation from custom loss functions, which\nfrequently struggle to provide precise control over individual bounding boxes.\nIn this work, we leverage the flexibility of the Transformer architecture,\ndemonstrating that DiT can generate noisy patches corresponding to each\nbounding box, fully encoding the target object and allowing for fine-grained\ncontrol over each region. Our approach builds on an intriguing property of DiT,\nwhich we refer to as semantic sharing. Due to semantic sharing, when a smaller\npatch is jointly denoised alongside a generatable-size image, the two become\nsemantic clones. Each patch is denoised in its own branch of the generation\nprocess and then transplanted into the corresponding region of the original\nnoisy image at each timestep, resulting in robust spatial grounding for each\nbounding box. In our experiments on the HRS and DrawBench benchmarks, we\nachieve state-of-the-art performance compared to previous training-free\napproaches.\n","authors":["Phillip Y. Lee","Taehoon Yoon","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2410.20474v2.pdf","comment":"Accepted to NeurIPS 2024. Project Page:\n https://groundit-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2410.23775v2","updated":"2024-11-01T03:15:02Z","published":"2024-10-31T09:45:00Z","title":"In-Context LoRA for Diffusion Transformers","summary":" Recent research arXiv:2410.15027 has explored the use of diffusion\ntransformers (DiTs) for task-agnostic image generation by simply concatenating\nattention tokens across images. However, despite substantial computational\nresources, the fidelity of the generated images remains suboptimal. In this\nstudy, we reevaluate and streamline this framework by hypothesizing that\ntext-to-image DiTs inherently possess in-context generation capabilities,\nrequiring only minimal tuning to activate them. Through diverse task\nexperiments, we qualitatively demonstrate that existing text-to-image DiTs can\neffectively perform in-context generation without any tuning. Building on this\ninsight, we propose a remarkably simple pipeline to leverage the in-context\nabilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint\ncaptioning of multiple images, and (3) apply task-specific LoRA tuning using\nsmall datasets (e.g., $20\\sim 100$ samples) instead of full-parameter tuning\nwith large datasets. We name our models In-Context LoRA (IC-LoRA). This\napproach requires no modifications to the original DiT models, only changes to\nthe training data. Remarkably, our pipeline generates high-fidelity image sets\nthat better adhere to prompts. While task-specific in terms of tuning data, our\nframework remains task-agnostic in architecture and pipeline, offering a\npowerful tool for the community and providing valuable insights for further\nresearch on product-level task-agnostic generation systems. We release our\ncode, data, and models at https://github.com/ali-vilab/In-Context-LoRA\n","authors":["Lianghua Huang","Wei Wang","Zhi-Fan Wu","Yupeng Shi","Huanzhang Dou","Chen Liang","Yutong Feng","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.23775v2.pdf","comment":"Tech report. 
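The GrounDiT transplantation idea above can be sketched as follows: each bounding-box patch is denoised in its own branch and copied into its region of the full noisy image at every timestep. The `denoise` placeholder stands in for one DiT reverse step; shapes and boxes are illustrative.

```python
import numpy as np

def transplant_step(image, patches, boxes, denoise, t):
    """One timestep: denoise the global image and each per-box patch in its
    own branch, then transplant each patch into its bounding-box region."""
    image = denoise(image, t)                # global branch
    for patch, (y0, x0, y1, x1) in zip(patches, boxes):
        patch[:] = denoise(patch, t)         # per-box branch
        image[y0:y1, x0:x1] = patch          # transplant into the region
    return image

rng = np.random.default_rng(4)
img = rng.standard_normal((32, 32))
boxes = [(4, 4, 12, 12)]
patches = [rng.standard_normal((8, 8))]
identity = lambda x, t: x                    # placeholder denoiser
img = transplant_step(img, patches, boxes, identity, t=999)
```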
Project page:\n https://ali-vilab.github.io/In-Context-LoRA-Page/"},{"id":"http://arxiv.org/abs/2409.17508v2","updated":"2024-11-01T02:38:53Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization in MLLMs, recent advances primarily focus on improving\nthe LLM components, while neglecting the connector that bridges the gap between\nmodalities. In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector in MLLMs. Extensive ablation\nexperiments validate the effectiveness of introducing CMoE under any\nconfiguration, with up to an average 8% performance gains. We further provide\ninterpretation analysis of the tug-of-war problem from the perspective of\ngradient optimization and parameter statistics. Compared to previous\nstate-of-the-art medical MLLMs, Uni-Med achieves competitive or superior\nevaluation metrics on diverse tasks. Code and resources are available at\nhttps://github.com/tsinghua-msiip/Uni-Med.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09553v4","updated":"2024-11-01T02:25:50Z","published":"2024-06-28T08:21:49Z","title":"DPEC: Dual-Path Error Compensation Method for Enhanced Low-Light Image\n Clarity","summary":" For the task of low-light image enhancement, deep learning-based algorithms\nhave demonstrated superiority and effectiveness compared to traditional\nmethods. However, these methods, primarily based on Retinex theory, tend to\noverlook the noise and color distortions in input images, leading to\nsignificant noise amplification and local color distortions in enhanced\nresults. To address these issues, we propose the Dual-Path Error Compensation\n(DPEC) method, designed to improve image quality under low-light conditions by\npreserving local texture details while restoring global image brightness\nwithout amplifying noise. DPEC incorporates precise pixel-level error\nestimation to capture subtle differences and an independent denoising mechanism\nto prevent noise amplification. We introduce the HIS-Retinex loss to guide\nDPEC's training, ensuring the brightness distribution of enhanced images\nclosely aligns with real-world conditions. 
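A minimal sketch of a connector mixture-of-experts in the spirit of Uni-Med's CMoE described above: a router softly mixes several projection experts that map visual features into the LLM embedding space. The softmax router and dimensions are assumptions, not the paper's parameterization.

```python
import numpy as np

def cmoe_connector(v, experts, router_w):
    """v: (d_vis,) visual feature; experts: list of (d_llm, d_vis) matrices.
    The router produces per-instance weights over projection experts."""
    logits = router_w @ v
    gates = np.exp(logits - logits.max())
    gates /= gates.sum()                               # softmax routing weights
    return sum(g * (E @ v) for g, E in zip(gates, experts))

rng = np.random.default_rng(5)
d_vis, d_llm, n_exp = 32, 64, 4
experts = [rng.standard_normal((d_llm, d_vis)) * 0.1 for _ in range(n_exp)]
router_w = rng.standard_normal((n_exp, d_vis)) * 0.1
token = cmoe_connector(rng.standard_normal(d_vis), experts, router_w)
```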
To balance computational speed and\nresource efficiency while training DPEC for a comprehensive understanding of\nthe global context, we integrated the VMamba architecture into its backbone.\nComprehensive quantitative and qualitative experimental results demonstrate\nthat our algorithm significantly outperforms state-of-the-art methods in\nlow-light image enhancement. The code is publicly available online at\nhttps://github.com/wangshuang233/DPEC.\n","authors":["Shuang Wang","Qianwen Lu","Boxing Peng","Yihe Nie","Qingchuan Tao"],"pdf_url":"https://arxiv.org/pdf/2407.09553v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14135v2","updated":"2024-11-01T02:20:06Z","published":"2024-08-26T09:32:16Z","title":"Foodfusion: A Novel Approach for Food Image Composition via Diffusion\n Models","summary":" Food image composition requires the use of existing dish images and\nbackground images to synthesize a natural new image, while diffusion models\nhave made significant advancements in image generation, enabling the\nconstruction of end-to-end architectures that yield promising results. However,\nexisting diffusion models face challenges in processing and fusing information\nfrom multiple images and lack access to high-quality publicly available\ndatasets, which prevents the application of diffusion models in food image\ncomposition. In this paper, we introduce a large-scale, high-quality food image\ncomposite dataset, FC22k, which comprises 22,000 foreground, background, and\nground truth ternary image pairs. Additionally, we propose a novel food image\ncomposition method, Foodfusion, which leverages the capabilities of the\npre-trained diffusion models and incorporates a Fusion Module for processing\nand integrating foreground and background information. This fused information\naligns the foreground features with the background structure by merging the\nglobal structural information at the cross-attention layer of the denoising\nUNet. To further enhance the content and structure of the background, we also\nintegrate a Content-Structure Control Module. Extensive experiments demonstrate\nthe effectiveness and scalability of our proposed method.\n","authors":["Chaohua Shi","Xuan Wang","Si Shi","Xule Wang","Mingrui Zhu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2408.14135v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2406.18451v3","updated":"2024-11-01T02:13:59Z","published":"2024-06-26T16:00:35Z","title":"Detecting Brittle Decisions for Free: Leveraging Margin Consistency in\n Deep Robust Classifiers","summary":" Despite extensive research on adversarial training strategies to improve\nrobustness, the decisions of even the most robust deep learning models can\nstill be quite sensitive to imperceptible perturbations, creating serious risks\nwhen deploying them for high-stakes real-world applications. While detecting\nsuch cases may be critical, evaluating a model's vulnerability at a\nper-instance level using adversarial attacks is computationally too intensive\nand unsuitable for real-time deployment scenarios. The input space margin is\nthe exact score to detect non-robust samples and is intractable for deep neural\nnetworks. This paper introduces the concept of margin consistency -- a property\nthat links the input space margins and the logit margins in robust models --\nfor efficient detection of vulnerable samples. 
First, we establish that margin\nconsistency is a necessary and sufficient condition to use a model's logit\nmargin as a score for identifying non-robust samples. Next, through\ncomprehensive empirical analysis of various robustly trained models on CIFAR10\nand CIFAR100 datasets, we show that they indicate high margin consistency with\na strong correlation between their input space margins and the logit margins.\nThen, we show that we can effectively and confidently use the logit margin to\ndetect brittle decisions with such models. Finally, we address cases where the\nmodel is not sufficiently margin-consistent by learning a pseudo-margin from\nthe feature representation. Our findings highlight the potential of leveraging\ndeep representations to assess adversarial vulnerability in deployment\nscenarios efficiently.\n","authors":["Jonas Ngnawé","Sabyasachi Sahoo","Yann Pequignot","Frédéric Precioso","Christian Gagné"],"pdf_url":"https://arxiv.org/pdf/2406.18451v3.pdf","comment":"10 pages, 6 figures, 2 tables. Version Update: Neurips Camera Ready"},{"id":"http://arxiv.org/abs/2410.20595v2","updated":"2024-11-01T01:27:10Z","published":"2024-10-27T21:02:37Z","title":"A Framework for Real-Time Volcano-Seismic Event Recognition Based on\n Multi-Station Seismograms and Semantic Segmentation Models","summary":" In volcano monitoring, effective recognition of seismic events is essential\nfor understanding volcanic activity and raising timely warning alerts.\nTraditional methods rely on manual analysis, which can be subjective and\nlabor-intensive. Furthermore, current automatic approaches often tackle\ndetection and classification separately, mostly rely on single station\ninformation and generally require tailored preprocessing and representations to\nperform predictions. These limitations often hinder their application to\nreal-time monitoring and utilization across different volcano conditions. This\nstudy introduces a novel approach that utilizes Semantic Segmentation models to\nautomate seismic event recognition by applying a straight forward\ntransformation of multi-channel 1D signals into 2D representations, enabling\ntheir use as images. Our framework employs a data-driven, end-to-end design\nthat integrates multi-station seismic data with minimal preprocessing,\nperforming both detection and classification simultaneously for five seismic\nevent classes. We evaluated four state-of-the-art segmentation models (UNet,\nUNet++, DeepLabV3+ and SwinUNet) on approximately 25.000 seismic events\nrecorded at four different Chilean volcanoes: Nevados del Chill\\'an Volcanic\nComplex, Laguna del Maule, Villarrica and Puyehue-Cord\\'on Caulle. Among these\nmodels, the UNet architecture was identified as the most effective model,\nachieving mean F1 and Intersection over Union (IoU) scores of up to 0.91 and\n0.88, respectively, and demonstrating superior noise robustness and model\nflexibility to unseen volcano datasets.\n","authors":["Camilo Espinosa-Curilem","Millaray Curilem","Daniel Basualto"],"pdf_url":"https://arxiv.org/pdf/2410.20595v2.pdf","comment":"10 pages, 9 figures. This is a pre-print, it is currently under\n review for publication"},{"id":"http://arxiv.org/abs/2310.05341v5","updated":"2024-11-01T00:37:44Z","published":"2023-10-09T01:59:49Z","title":"From Question to Exploration: Test-Time Adaptation in Semantic\n Segmentation?","summary":" Test-time adaptation (TTA) aims to adapt a model, initially trained on\ntraining data, to test data with potential distribution shifts. 
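Given margin consistency as established above, flagging brittle decisions reduces to thresholding the logit margin. A minimal sketch follows; the threshold value is an illustrative choice, not the paper's calibration.

```python
import numpy as np

def logit_margin(logits):
    """logits: (N, K). Margin = top-1 logit minus best competing logit."""
    part = np.partition(logits, -2, axis=1)
    return part[:, -1] - part[:, -2]

def flag_brittle(logits, tau=1.0):
    return logit_margin(logits) < tau    # True => likely non-robust sample

logits = np.array([[5.0, 1.0, 0.0],     # confident, large margin
                   [2.1, 2.0, 0.0]])    # near the boundary, small margin
print(flag_brittle(logits))             # [False  True]
```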
Most existing\nTTA methods focus on classification problems. The pronounced success of\nclassification might lead numerous newcomers and engineers to assume that\nclassic TTA techniques can be directly applied to the more challenging task of\nsemantic segmentation. However, this belief is still an open question. In this\npaper, we investigate the applicability of existing classic TTA strategies in\nsemantic segmentation. Our comprehensive results have led to three key\nobservations. First, the classic normalization updating strategy only brings\nslight performance improvement, and in some cases, it might even adversely\naffect the results. Even with the application of advanced distribution\nestimation techniques like batch renormalization, the problem remains\nunresolved. Second, although the teacher-student scheme does enhance the\ntraining stability for segmentation TTA in the presence of noisy pseudo-labels\nand temporal correlation, it cannot directly result in performance improvement\ncompared to the original model without TTA under complex data distribution.\nThird, segmentation TTA suffers a severe long-tailed class-imbalance problem,\nwhich is substantially more complex than that in TTA for classification. This\nlong-tailed challenge negatively affects segmentation TTA performance, even\nwhen the accuracy of pseudo-labels is high. Besides those observations, we find\nthat visual prompt tuning (VisPT) is promising in segmentation TTA and propose\na novel method named TTAP. The outstanding performance of TTAP has also been\nverified. We hope the community can give more attention to this challenging,\nyet important, segmentation TTA task in the future. The source code is\navailable at: \\textit{https://github.com/ycarobot/TTAP}\n","authors":["Chang'an Yi","Haotian Chen","Yifan Zhang","Yonghui Xu","Yan Zhou","Lizhen Cui"],"pdf_url":"https://arxiv.org/pdf/2310.05341v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09371v2","updated":"2024-11-01T00:22:26Z","published":"2024-06-13T17:51:00Z","title":"LRM-Zero: Training Large Reconstruction Models with Synthesized Data","summary":" We present LRM-Zero, a Large Reconstruction Model (LRM) trained entirely on\nsynthesized 3D data, achieving high-quality sparse-view 3D reconstruction. The\ncore of LRM-Zero is our procedural 3D dataset, Zeroverse, which is\nautomatically synthesized from simple primitive shapes with random texturing\nand augmentations (e.g., height fields, boolean differences, and wireframes).\nUnlike previous 3D datasets (e.g., Objaverse) which are often captured or\ncrafted by humans to approximate real 3D data, Zeroverse completely ignores\nrealistic global semantics but is rich in complex geometric and texture details\nthat are locally similar to or even more intricate than real objects. We\ndemonstrate that our LRM-Zero, trained with our fully synthesized Zeroverse,\ncan achieve high visual quality in the reconstruction of real-world objects,\ncompetitive with models trained on Objaverse. We also analyze several critical\ndesign choices of Zeroverse that contribute to LRM-Zero's capability and\ntraining stability. Our work demonstrates that 3D reconstruction, one of the\ncore tasks in 3D vision, can potentially be addressed without the semantics of\nreal-world objects. 
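A toy sketch of Zeroverse-style procedural synthesis as described above: scenes are random primitives with random placement and texturing, with no real-world semantics involved. The primitive set and parameter ranges are invented for illustration.

```python
import numpy as np

rng = np.random.default_rng(6)

def random_primitive():
    """One randomly parameterized primitive; a renderer would turn a list
    of these into sparse-view image/geometry training pairs."""
    return {
        "kind": rng.choice(["sphere", "box", "cylinder"]),
        "position": rng.uniform(-1.0, 1.0, size=3),
        "scale": rng.uniform(0.1, 0.5, size=3),
        "rotation": rng.uniform(0.0, 2 * np.pi, size=3),
        "texture_seed": int(rng.integers(0, 1_000_000)),  # random texturing
    }

def random_scene(n_min=3, n_max=9):
    return [random_primitive() for _ in range(rng.integers(n_min, n_max + 1))]

scene = random_scene()   # purely procedural: no human-crafted semantics
```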
The Zeroverse's procedural synthesis code and interactive\nvisualization are available at: https://desaixie.github.io/lrm-zero/.\n","authors":["Desai Xie","Sai Bi","Zhixin Shu","Kai Zhang","Zexiang Xu","Yi Zhou","Sören Pirk","Arie Kaufman","Xin Sun","Hao Tan"],"pdf_url":"https://arxiv.org/pdf/2406.09371v2.pdf","comment":"23 pages, 8 figures. Our code and interactive visualization are\n available at: https://desaixie.github.io/lrm-zero/. v2: NeurIPS 2024 Camera\n Ready version"},{"id":"http://arxiv.org/abs/2406.17763v2","updated":"2024-11-01T00:08:54Z","published":"2024-06-25T17:48:24Z","title":"DiffusionPDE: Generative PDE-Solving Under Partial Observation","summary":" We introduce a general framework for solving partial differential equations\n(PDEs) using generative diffusion models. In particular, we focus on the\nscenarios where we do not have the full knowledge of the scene necessary to\napply classical solvers. Most existing forward or inverse PDE approaches\nperform poorly when the observations on the data or the underlying coefficients\nare incomplete, which is a common assumption for real-world measurements. In\nthis work, we propose DiffusionPDE that can simultaneously fill in the missing\ninformation and solve a PDE by modeling the joint distribution of the solution\nand coefficient spaces. We show that the learned generative priors lead to a\nversatile framework for accurately solving a wide range of PDEs under partial\nobservation, significantly outperforming the state-of-the-art methods for both\nforward and inverse directions.\n","authors":["Jiahe Huang","Guandao Yang","Zichen Wang","Jeong Joon Park"],"pdf_url":"https://arxiv.org/pdf/2406.17763v2.pdf","comment":"NeurIPS 2024. Project page:\n https://jhhuangchloe.github.io/Diffusion-PDE/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.10691v2","updated":"2024-11-01T14:08:31Z","published":"2024-07-15T13:04:09Z","title":"$\\texttt{MixGR}$: Enhancing Retriever Generalization for Scientific\n Domain through Complementary Granularity","summary":" Recent studies show the growing significance of document retrieval in the\ngeneration of LLMs, i.e., RAG, within the scientific domain by bridging their\nknowledge gap. However, dense retrievers often struggle with domain-specific\nretrieval and complex query-document relationships, particularly when query\nsegments correspond to various parts of a document. To alleviate such prevalent\nchallenges, this paper introduces $\\texttt{MixGR}$, which improves dense\nretrievers' awareness of query-document matching across various levels of\ngranularity in queries and documents using a zero-shot approach.\n$\\texttt{MixGR}$ fuses various metrics based on these granularities to a united\nscore that reflects a comprehensive query-document similarity. Our experiments\ndemonstrate that $\\texttt{MixGR}$ outperforms previous document retrieval by\n24.7%, 9.8%, and 6.9% on nDCG@5 with unsupervised, supervised, and LLM-based\nretrievers, respectively, averaged on queries containing multiple subqueries\nfrom five scientific retrieval datasets. Moreover, the efficacy of two\ndownstream scientific question-answering tasks highlights the advantage of\n$\\texttt{MixGR}$ to boost the application of LLMs in the scientific domain. 
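A hedged sketch of the granularity fusion behind $\texttt{MixGR}$ as summarized above: compute query-document similarities across (sub)query and document-section embeddings, then fuse them into one united score. The mean fusion below is a simple stand-in for the paper's combination rule; the dictionary keys are illustrative.

```python
import numpy as np

def cos(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

def mixed_granularity_score(query_vecs, doc_vecs):
    """query_vecs/doc_vecs: dicts of granularity -> embedding, e.g.
    {'full': ..., 'subquery_1': ...} and {'full': ..., 'section_2': ...}."""
    scores = [cos(q, d) for q in query_vecs.values() for d in doc_vecs.values()]
    return sum(scores) / len(scores)   # united query-document score

rng = np.random.default_rng(7)
q = {"full": rng.standard_normal(8), "subquery_1": rng.standard_normal(8)}
d = {"full": rng.standard_normal(8), "section_2": rng.standard_normal(8)}
print(mixed_granularity_score(q, d))
```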
The\ncode and experimental datasets are available.\n","authors":["Fengyu Cai","Xinran Zhao","Tong Chen","Sihao Chen","Hongming Zhang","Iryna Gurevych","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2407.10691v2.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2408.10159v3","updated":"2024-11-01T03:47:59Z","published":"2024-08-19T17:09:32Z","title":"Customizing Language Models with Instance-wise LoRA for Sequential\n Recommendation","summary":" Sequential recommendation systems predict the next interaction item based on\nusers' past interactions, aligning recommendations with individual preferences.\nLeveraging the strengths of Large Language Models (LLMs) in knowledge\ncomprehension and reasoning, recent approaches seek to apply LLMs to\nsequential recommendation. A common paradigm is converting user behavior\nsequences into instruction data, and fine-tuning the LLM with\nparameter-efficient fine-tuning (PEFT) methods like Low-Rank Adaptation (LoRA).\nHowever, the uniform application of LoRA across diverse user behaviors is\ninsufficient to capture individual variability, resulting in negative transfer\nbetween disparate sequences. To address these challenges, we propose\nInstance-wise LoRA (iLoRA). We innovatively treat the sequential recommendation\ntask as a form of multi-task learning, integrating LoRA with the Mixture of\nExperts (MoE) framework. This approach encourages different experts to capture\nvarious aspects of user behavior. Additionally, we introduce a sequence\nrepresentation guided gate function that generates customized expert\nparticipation weights for each user sequence, which allows dynamic parameter\nadjustment for instance-wise recommendations. In sequential recommendation,\niLoRA achieves an average relative improvement of 11.4\% over basic LoRA in the\nhit ratio metric, with less than a 1\% relative increase in trainable\nparameters. Extensive experiments on three benchmark datasets demonstrate the\neffectiveness of iLoRA, highlighting its superior performance compared to\nexisting methods in mitigating negative transfer and improving recommendation\naccuracy. Our data and code are available at\nhttps://github.com/AkaliKong/iLoRA.\n","authors":["Xiaoyu Kong","Jiancan Wu","An Zhang","Leheng Sheng","Hui Lin","Xiang Wang","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2408.10159v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20646v2","updated":"2024-11-01T03:12:44Z","published":"2024-05-31T07:24:42Z","title":"LLM-ESR: Large Language Models Enhancement for Long-tailed Sequential\n Recommendation","summary":" Sequential recommender systems (SRS) aim to predict users' subsequent choices\nbased on their historical interactions and have found applications in diverse\nfields such as e-commerce and social media. However, in real-world systems,\nmost users interact with only a handful of items, while the majority of items\nare seldom consumed. These two issues, known as the long-tail user and\nlong-tail item challenges, often pose difficulties for existing SRS. These\nchallenges can adversely affect user experience and seller benefits, making\nthem crucial to address. Though a few works have addressed the challenges, they\nstill struggle with the seesaw or noisy issues due to the intrinsic scarcity of\ninteractions. The advancements in large language models (LLMs) present a\npromising solution to these problems from a semantic perspective. 
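The gating mechanism described in the iLoRA abstract can be sketched in a few lines of PyTorch: a user-sequence representation is mapped to softmax weights over several LoRA experts, whose low-rank updates are mixed per instance and added to a frozen linear layer. Dimensions and the single-linear gate are assumptions for illustration.

```python
import torch
import torch.nn as nn

class InstanceWiseLoRALinear(nn.Module):
    def __init__(self, d_in: int, d_out: int, n_experts: int = 4, rank: int = 8):
        super().__init__()
        self.base = nn.Linear(d_in, d_out)       # stands in for a frozen layer
        self.base.weight.requires_grad_(False)
        self.A = nn.Parameter(torch.randn(n_experts, rank, d_in) * 0.01)
        self.B = nn.Parameter(torch.zeros(n_experts, d_out, rank))
        self.gate = nn.Linear(d_in, n_experts)   # sequence repr -> expert weights

    def forward(self, x: torch.Tensor, seq_repr: torch.Tensor) -> torch.Tensor:
        # seq_repr: (batch, d_in) summary of one user's behavior sequence
        w = torch.softmax(self.gate(seq_repr), dim=-1)          # (batch, E)
        delta = torch.einsum("erd,eor->eod", self.A, self.B)    # (E, d_out, d_in)
        W = self.base.weight + torch.einsum("be,eod->bod", w, delta)
        return torch.einsum("bd,bod->bo", x, W) + self.base.bias
```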
As one of the\npioneers in this field, we propose the Large Language Models Enhancement\nframework for Sequential Recommendation (LLM-ESR). This framework utilizes\nsemantic embeddings derived from LLMs to enhance SRS without adding extra\ninference load from LLMs. To address the long-tail item challenge, we design a\ndual-view modeling framework that combines semantics from LLMs and\ncollaborative signals from conventional SRS. For the long-tail user challenge,\nwe propose a retrieval augmented self-distillation method to enhance user\npreference representation using more informative interactions from similar\nusers. To verify the effectiveness and versatility of our proposed enhancement\nframework, we conduct extensive experiments on three real-world datasets using\nthree popular SRS models. The results show that our method surpasses existing\nbaselines consistently, and benefits long-tail users and items especially. The\nimplementation code is available at\nhttps://github.com/Applied-Machine-Learning-Lab/LLM-ESR.\n","authors":["Qidong Liu","Xian Wu","Yejing Wang","Zijian Zhang","Feng Tian","Yefeng Zheng","Xiangyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.20646v2.pdf","comment":"accepted by NeurIPS'24 (Spotlight)"},{"id":"http://arxiv.org/abs/2402.15235v3","updated":"2024-11-01T02:00:49Z","published":"2024-02-23T09:57:20Z","title":"MACRec: a Multi-Agent Collaboration Framework for Recommendation","summary":" LLM-based agents have gained considerable attention for their decision-making\nskills and ability to handle complex tasks. Recognizing the current gap in\nleveraging agent capabilities for multi-agent collaboration in recommendation\nsystems, we introduce MACRec, a novel framework designed to enhance\nrecommendation systems through multi-agent collaboration. Unlike existing work\non using agents for user/item simulation, we aim to deploy multiple agents to\ntackle recommendation tasks directly. In our framework, recommendation tasks\nare addressed through the collaborative efforts of various specialized agents,\nincluding Manager, User/Item Analyst, Reflector, Searcher, and Task\nInterpreter, with different workflows. Furthermore, we provide application\nexamples of how developers can easily use MACRec on various recommendation\ntasks, including rating prediction, sequential recommendation, conversational\nrecommendation, and explanation generation of recommendation results. The\nframework and demonstration video are publicly available at\nhttps://github.com/wzf2000/MACRec.\n","authors":["Zhefan Wang","Yuanqing Yu","Wendi Zheng","Weizhi Ma","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.15235v3.pdf","comment":"Accepted by SIGIR2024"},{"id":"http://arxiv.org/abs/2410.23683v2","updated":"2024-11-01T01:21:04Z","published":"2024-10-31T07:19:22Z","title":"Unveiling User Satisfaction and Creator Productivity Trade-Offs in\n Recommendation Platforms","summary":" On User-Generated Content (UGC) platforms, recommendation algorithms\nsignificantly impact creators' motivation to produce content as they compete\nfor algorithmically allocated user traffic. This phenomenon subtly shapes the\nvolume and diversity of the content pool, which is crucial for the platform's\nsustainability. In this work, we demonstrate, both theoretically and\nempirically, that a purely relevance-driven policy with low exploration\nstrength boosts short-term user satisfaction but undermines the long-term\nrichness of the content pool. 
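A rough sketch of the dual-view modeling described in the LLM-ESR abstract: frozen semantic embeddings precomputed offline from an LLM are fused with trainable collaborative embeddings, so no LLM call is needed at inference time. The concat-and-project fusion below is an illustrative assumption, not the paper's exact design.

```python
import torch
import torch.nn as nn

class DualViewItemEmbedding(nn.Module):
    def __init__(self, llm_item_emb: torch.Tensor, d_model: int):
        super().__init__()
        n_items, d_sem = llm_item_emb.shape
        # semantic view: precomputed LLM embeddings, kept frozen
        self.semantic = nn.Embedding.from_pretrained(llm_item_emb, freeze=True)
        # collaborative view: learned from interaction data as usual
        self.collab = nn.Embedding(n_items, d_model)
        self.proj = nn.Linear(d_sem + d_model, d_model)

    def forward(self, item_ids: torch.Tensor) -> torch.Tensor:
        both = torch.cat([self.semantic(item_ids), self.collab(item_ids)], -1)
        return self.proj(both)   # fused item representation for the SRS
```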
In contrast, a more aggressive exploration policy\nmay slightly compromise user satisfaction but promote higher content creation\nvolume. Our findings reveal a fundamental trade-off between immediate user\nsatisfaction and overall content production on UGC platforms. Building on this\nfinding, we propose an efficient optimization method to identify the optimal\nexploration strength, balancing user and creator engagement. Our model can\nserve as a pre-deployment audit tool for recommendation algorithms on UGC\nplatforms, helping to align their immediate objectives with sustainable,\nlong-term goals.\n","authors":["Fan Yao","Yiming Liao","Jingzhou Liu","Shaoliang Nie","Qifan Wang","Haifeng Xu","Hongning Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23683v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.18414v2","updated":"2024-11-01T17:47:03Z","published":"2024-07-25T22:12:47Z","title":"Adversarially Robust Decision Transformer","summary":" Decision Transformer (DT), as one of the representative Reinforcement\nLearning via Supervised Learning (RvS) methods, has achieved strong performance\nin offline learning tasks by leveraging the powerful Transformer architecture\nfor sequential decision-making. However, in adversarial environments, these\nmethods can be non-robust, since the return is dependent on the strategies of\nboth the decision-maker and adversary. Training a probabilistic model\nconditioned on observed return to predict action can fail to generalize, as the\ntrajectories that achieve a return in the dataset might have done so due to a\nsuboptimal behavior adversary. To address this, we propose a worst-case-aware\nRvS algorithm, the Adversarially Robust Decision Transformer (ARDT), which\nlearns and conditions the policy on in-sample minimax returns-to-go. ARDT\naligns the target return with the worst-case return learned through minimax\nexpectile regression, thereby enhancing robustness against powerful test-time\nadversaries. In experiments conducted on sequential games with full data\ncoverage, ARDT can generate a maximin (Nash Equilibrium) strategy, the solution\nwith the largest adversarial robustness. In large-scale sequential games and\ncontinuous adversarial RL environments with partial data coverage, ARDT\ndemonstrates significantly superior robustness to powerful test-time\nadversaries and attains higher worst-case returns compared to contemporary DT\nmethods.\n","authors":["Xiaohang Tang","Afonso Marques","Parameswaran Kamalaruban","Ilija Bogunovic"],"pdf_url":"https://arxiv.org/pdf/2407.18414v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.01306v2","updated":"2024-11-01T17:40:26Z","published":"2024-09-02T14:56:22Z","title":"Highly Accurate Real-space Electron Densities with Neural Networks","summary":" Variational ab-initio methods in quantum chemistry stand out among other\nmethods in providing direct access to the wave function. This allows in\nprinciple straightforward extraction of any other observable of interest,\nbesides the energy, but in practice this extraction is often technically\ndifficult and computationally impractical. Here, we consider the electron\ndensity as a central observable in quantum chemistry and introduce a novel\nmethod to obtain accurate densities from real-space many-electron wave\nfunctions by representing the density with a neural network that captures known\nasymptotic properties and is trained from the wave function by score matching\nand noise-contrastive estimation. 
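Expectile regression, the tool ARDT uses to learn in-sample minimax returns-to-go, is easy to state in code: an asymmetric squared loss whose low expectile level pulls the estimate toward worst-case returns. The tau value below is illustrative, not the paper's tuned setting.

```python
import torch

def expectile_loss(pred: torch.Tensor, target: torch.Tensor,
                   tau: float = 0.1) -> torch.Tensor:
    """Asymmetric squared loss; a small tau fits a low expectile, i.e. a
    pessimistic (worst-case-leaning) estimate of the return-to-go."""
    diff = target - pred
    weight = torch.where(diff > 0, tau, 1.0 - tau)
    return (weight * diff.pow(2)).mean()

# schematic use: fit V on observed returns-to-go with expectile_loss, then
# condition the decision transformer on V's pessimistic estimates.
```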
We use variational quantum Monte Carlo with\ndeep-learning ans\\\"atze (deep QMC) to obtain highly accurate wave functions\nfree of basis set errors, and from them, using our novel method,\ncorrespondingly accurate electron densities, which we demonstrate by\ncalculating dipole moments, nuclear forces, contact densities, and other\ndensity-based properties.\n","authors":["Lixue Cheng","P. Bernát Szabó","Zeno Schätzle","Derk P. Kooi","Jonas Köhler","Klaas J. H. Giesbertz","Frank Noé","Jan Hermann","Paola Gori-Giorgi","Adam Foster"],"pdf_url":"https://arxiv.org/pdf/2409.01306v2.pdf","comment":"12 pages, 9 figures in the main text"},{"id":"http://arxiv.org/abs/2406.12909v4","updated":"2024-11-01T17:09:52Z","published":"2024-06-12T21:21:42Z","title":"Scalable Training of Trustworthy and Energy-Efficient Predictive Graph\n Foundation Models for Atomistic Materials Modeling: A Case Study with\n HydraGNN","summary":" We present our work on developing and training scalable, trustworthy, and\nenergy-efficient predictive graph foundation models (GFMs) using HydraGNN, a\nmulti-headed graph convolutional neural network architecture. HydraGNN expands\nthe boundaries of graph neural network (GNN) computations in both training\nscale and data diversity. It abstracts over message passing algorithms,\nallowing both reproduction of and comparison across algorithmic innovations\nthat define nearest-neighbor convolution in GNNs. This work discusses a series\nof optimizations that have allowed scaling up the GFMs training to tens of\nthousands of GPUs on datasets consisting of hundreds of millions of graphs. Our\nGFMs use multi-task learning (MTL) to simultaneously learn graph-level and\nnode-level properties of atomistic structures, such as energy and atomic\nforces. Using over 154 million atomistic structures for training, we illustrate\nthe performance of our approach along with the lessons learned on two\nstate-of-the-art United States Department of Energy (US-DOE) supercomputers,\nnamely the Perlmutter petascale system at the National Energy Research\nScientific Computing Center and the Frontier exascale system at Oak Ridge\nLeadership Computing Facility. The HydraGNN architecture enables the GFM to\nachieve near-linear strong scaling performance using more than 2,000 GPUs on\nPerlmutter and 16,000 GPUs on Frontier.\n","authors":["Massimiliano Lupo Pasini","Jong Youl Choi","Kshitij Mehta","Pei Zhang","David Rogers","Jonghyun Bae","Khaled Z. Ibrahim","Ashwin M. Aji","Karl W. Schulz","Jorda Polo","Prasanna Balaprakash"],"pdf_url":"https://arxiv.org/pdf/2406.12909v4.pdf","comment":"51 pages, 32 figures"},{"id":"http://arxiv.org/abs/2406.08401v3","updated":"2024-11-01T17:04:09Z","published":"2024-06-12T16:50:12Z","title":"Nyström Kernel Stein Discrepancy","summary":" Kernel methods underpin many of the most successful approaches in data\nscience and statistics, and they allow representing probability measures as\nelements of a reproducing kernel Hilbert space without loss of information.\nRecently, the kernel Stein discrepancy (KSD), which combines Stein's method\nwith the flexibility of kernel techniques, gained considerable attention.\nThrough the Stein operator, KSD allows the construction of powerful\ngoodness-of-fit tests where it is sufficient to know the target distribution up\nto a multiplicative constant. However, the typical U- and V-statistic-based KSD\nestimators suffer from a quadratic runtime complexity, which hinders their\napplication in large-scale settings. 
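The multi-task structure attributed to HydraGNN above, one shared trunk feeding separate graph-level and node-level heads for energy and atomic forces, can be sketched as follows; the linear layers are placeholders standing in for message passing, not the actual HydraGNN architecture.

```python
import torch
import torch.nn as nn

class MultiHeadAtomisticNet(nn.Module):
    def __init__(self, d_node: int, d_hidden: int = 128):
        super().__init__()
        # placeholder trunk; real GFMs use message passing over atom graphs
        self.trunk = nn.Sequential(nn.Linear(d_node, d_hidden), nn.SiLU(),
                                   nn.Linear(d_hidden, d_hidden))
        self.energy_head = nn.Linear(d_hidden, 1)  # graph-level property
        self.force_head = nn.Linear(d_hidden, 3)   # node-level property

    def forward(self, node_feats: torch.Tensor):
        h = self.trunk(node_feats)                 # (n_atoms, d_hidden)
        energy = self.energy_head(h).sum(dim=0)    # pool atoms -> total energy
        forces = self.force_head(h)                # (n_atoms, 3)
        return energy, forces

# MTL objective: loss = mse(energy, E_ref) + lam * mse(forces, F_ref)
```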
In this work, we propose a Nystr\\"om-based\nKSD acceleration -- with runtime $\mathcal O\left(mn+m^3\right)$ for $n$\nsamples and $m\ll n$ Nystr\\"om points --, show its $\sqrt{n}$-consistency with\na classical sub-Gaussian assumption, and demonstrate its applicability for\ngoodness-of-fit testing on a suite of benchmarks.\n","authors":["Florian Kalinke","Zoltan Szabo","Bharath K. Sriperumbudur"],"pdf_url":"https://arxiv.org/pdf/2406.08401v3.pdf","comment":"Broader applicability of main result, consistency of quadratic time\n estimator"},{"id":"http://arxiv.org/abs/2410.19931v2","updated":"2024-11-01T16:54:46Z","published":"2024-10-25T19:07:29Z","title":"Provable optimal transport with transformers: The essence of depth and\n prompt engineering","summary":" Can we establish provable performance guarantees for transformers?\nEstablishing such theoretical guarantees is a milestone in developing\ntrustworthy generative AI. In this paper, we take a step toward addressing this\nquestion by focusing on optimal transport, a fundamental problem at the\nintersection of combinatorial and continuous optimization. Leveraging the\ncomputational power of attention layers, we prove that a transformer with fixed\nparameters can effectively solve the optimal transport problem in Wasserstein-2\nwith entropic regularization for an arbitrary number of points. Consequently,\nthe transformer can sort lists of arbitrary sizes up to an approximation\nfactor. Our results rely on an engineered prompt that enables the transformer\nto implement gradient descent with adaptive stepsizes on the dual optimal\ntransport. Combining the convergence analysis of gradient descent with Sinkhorn\ndynamics, we establish an explicit approximation bound for optimal transport\nwith transformers, which improves as depth increases. Our findings provide\nnovel insights into the essence of prompt engineering and depth for solving\noptimal transport. In particular, prompt engineering boosts the algorithmic\nexpressivity of transformers, allowing them to implement an optimization method.\nWith increasing depth, transformers can simulate several iterations of gradient\ndescent.\n","authors":["Hadi Daneshmand"],"pdf_url":"https://arxiv.org/pdf/2410.19931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22367v2","updated":"2024-11-01T16:53:58Z","published":"2024-10-28T20:45:52Z","title":"MAMMAL -- Molecular Aligned Multi-Modal Architecture and Language","summary":" Drug discovery typically consists of multiple steps, including identifying a\ntarget protein key to a disease's etiology, validating that interacting with\nthis target could prevent symptoms or cure the disease, discovering a small\nmolecule or biologic therapeutic to interact with it, and optimizing the\ncandidate molecule through a complex landscape of required properties. Drug\ndiscovery related tasks often involve prediction and generation while\nconsidering multiple entities that potentially interact, which poses a\nchallenge for typical AI models. For this purpose we present MAMMAL - Molecular\nAligned Multi-Modal Architecture and Language - a method that we applied to\ncreate a versatile multi-task multi-align foundation model that learns from\nlarge-scale biological datasets (2 billion samples) across diverse modalities,\nincluding proteins, small molecules, and genes. 
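For reference alongside the transformer construction described above, this is the classical Sinkhorn iteration that solves the same entropically regularized optimal transport problem directly; a baseline sketch, not the paper's transformer implementation.

```python
import numpy as np

def sinkhorn(mu, nu, C, eps=0.05, n_iters=200):
    """mu, nu: source/target weights (each summing to 1); C: (n, m) costs."""
    K = np.exp(-C / eps)                  # Gibbs kernel
    u = np.ones_like(mu)
    for _ in range(n_iters):              # alternating dual updates
        v = nu / (K.T @ u)
        u = mu / (K @ v)
    return u[:, None] * K * v[None, :]    # entropic transport plan

n = 5
x, y = np.random.rand(n, 2), np.random.rand(n, 2)
C = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)   # squared distances (W2)
P = sinkhorn(np.full(n, 1 / n), np.full(n, 1 / n), C)
```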
We introduce a prompt syntax\nthat supports a wide range of classification, regression, and generation tasks.\nIt allows combining different modalities and entity types as inputs and/or\noutputs. Our model handles combinations of tokens and scalars and enables the\ngeneration of small molecules and proteins, property prediction, and\ntranscriptomic lab test predictions. We evaluated the model on 11 diverse\ndownstream tasks spanning different steps within a typical drug discovery\npipeline, where it reaches new SOTA in 9 tasks and is comparable to SOTA in 2\ntasks. This performance is achieved while using a unified architecture serving\nall tasks, in contrast to the original SOTA performance achieved using tailored\narchitectures.\n The model code and pretrained weights are publicly available at\nhttps://github.com/BiomedSciAI/biomed-multi-alignment and\nhttps://huggingface.co/ibm/biomed.omics.bl.sm.ma-ted-458m.\n","authors":["Yoel Shoshan","Moshiko Raboh","Michal Ozery-Flato","Vadim Ratner","Alex Golts","Jeffrey K. Weber","Ella Barkan","Simona Rabinovici-Cohen","Sagi Polaczek","Ido Amos","Ben Shapira","Liam Hazan","Matan Ninio","Sivan Ravid","Michael M. Danziger","Joseph A. Morrone","Parthasarathy Suryanarayanan","Michal Rosen-Zvi","Efrat Hexter"],"pdf_url":"https://arxiv.org/pdf/2410.22367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19153v2","updated":"2024-11-01T16:47:59Z","published":"2024-05-29T14:59:49Z","title":"A Study of Plasticity Loss in On-Policy Deep Reinforcement Learning","summary":" Continual learning with deep neural networks presents challenges distinct\nfrom both the fixed-dataset and convex continual learning regimes. One such\nchallenge is plasticity loss, wherein a neural network trained in an online\nfashion displays a degraded ability to fit new tasks. This problem has been\nextensively studied in both supervised learning and off-policy reinforcement\nlearning (RL), where a number of remedies have been proposed. Still, plasticity\nloss has received less attention in the on-policy deep RL setting. Here we\nperform an extensive set of experiments examining plasticity loss and a variety\nof mitigation methods in on-policy deep RL. We demonstrate that plasticity loss\nis pervasive under domain shift in this regime, and that a number of methods\ndeveloped to resolve it in other settings fail, sometimes even performing worse\nthan applying no intervention at all. In contrast, we find that a class of\n``regenerative'' methods are able to consistently mitigate plasticity loss in a\nvariety of contexts, including in gridworld tasks and more challenging\nenvironments like Montezuma's Revenge and ProcGen.\n","authors":["Arthur Juliani","Jordan T. Ash"],"pdf_url":"https://arxiv.org/pdf/2405.19153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12032v2","updated":"2024-11-01T16:46:49Z","published":"2023-09-21T12:53:45Z","title":"Human-in-the-Loop Causal Discovery under Latent Confounding using\n Ancestral GFlowNets","summary":" Structure learning is the crux of causal inference. Notably, causal discovery\n(CD) algorithms are brittle when data is scarce, possibly inferring imprecise\ncausal relations that contradict expert knowledge -- especially when\nconsidering latent confounders. To aggravate the issue, most CD methods do not\nprovide uncertainty estimates, making it hard for users to interpret results\nand improve the inference process. 
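One concrete example of the "regenerative" family that the plasticity-loss study above finds effective is shrink-and-perturb-style partial re-initialization, which periodically scales weights toward zero and injects fresh noise; whether this exact variant is the paper's best-performing method is not claimed here.

```python
import torch

@torch.no_grad()
def shrink_and_perturb(model: torch.nn.Module,
                       shrink: float = 0.8, noise_std: float = 0.01) -> None:
    """Partially re-initialize the network to restore plasticity."""
    for p in model.parameters():
        p.mul_(shrink)                           # shrink toward zero
        p.add_(noise_std * torch.randn_like(p))  # re-inject randomness

# schematic use in on-policy RL: every K updates or at each task switch,
# call shrink_and_perturb(policy_net) and continue training.
```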
Surprisingly, while CD is a human-centered\naffair, no prior work has focused on building methods that both 1) output\nuncertainty estimates that can be verified by experts and 2) interact with\nthose experts to iteratively refine CD. To solve these issues, we start by\nproposing to sample (causal) ancestral graphs proportionally to a belief\ndistribution based on a score function, such as the Bayesian information\ncriterion (BIC), using generative flow networks. Then, we leverage the\ndiversity in candidate graphs and introduce an optimal experimental design to\niteratively probe the expert about the relations among variables, effectively\nreducing the uncertainty of our belief over ancestral graphs. Finally, we\nupdate our samples to incorporate human feedback via importance sampling.\nImportantly, our method does not require causal sufficiency (i.e., unobserved\nconfounders may exist). Experiments with synthetic observational data show that\nour method can accurately sample from distributions over ancestral graphs and\nthat we can greatly improve inference quality with human aid.\n","authors":["Tiago da Silva","Eliezer Silva","António Góis","Dominik Heider","Samuel Kaski","Diego Mesquita","Adèle Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2309.12032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15589v3","updated":"2024-11-01T16:39:36Z","published":"2024-05-24T14:20:09Z","title":"Efficient Adversarial Training in LLMs with Continuous Attacks","summary":" Large language models (LLMs) are vulnerable to adversarial attacks that can\nbypass their safety guardrails. In many domains, adversarial training has\nproven to be one of the most promising methods to reliably improve robustness\nagainst such attacks. Yet, in the context of LLMs, current methods for\nadversarial training are hindered by the high computational costs required to\nperform discrete adversarial attacks at each training iteration. We address\nthis problem by instead calculating adversarial attacks in the continuous\nembedding space of the LLM, which is orders of magnitude more efficient. We\npropose a fast adversarial training algorithm (C-AdvUL) composed of two losses:\nthe first makes the model robust on continuous embedding attacks computed on an\nadversarial behaviour dataset; the second ensures the usefulness of the final\nmodel by fine-tuning on utility data. Moreover, we introduce C-AdvIPO, an\nadversarial variant of IPO that does not require utility data for adversarially\nrobust alignment. Our empirical evaluation on five models from different\nfamilies (Gemma, Phi3, Mistral, Zephyr, Llama2) and at different scales (2B,\n3.8B, 7B) shows that both algorithms substantially enhance LLM robustness\nagainst discrete attacks (GCG, AutoDAN, PAIR), while maintaining utility. Our\nresults demonstrate that robustness to continuous perturbations can extrapolate\nto discrete threat models. 
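The continuous attack at the heart of C-AdvUL can be sketched as projected gradient ascent on a perturbation added directly to the LLM's input embeddings; `model` is assumed to be an HF-style causal LM accepting `inputs_embeds`, and the epsilon/step settings below are illustrative, not the paper's.

```python
import torch

def continuous_embedding_attack(model, input_embeds, labels,
                                eps=0.05, steps=10, lr=1e-2):
    """PGD in embedding space: maximize the loss on the target labels while
    keeping the perturbation inside an L-infinity ball of radius eps."""
    delta = torch.zeros_like(input_embeds, requires_grad=True)
    for _ in range(steps):
        loss = model(inputs_embeds=input_embeds + delta, labels=labels).loss
        loss.backward()
        with torch.no_grad():
            delta += lr * delta.grad.sign()   # ascent step
            delta.clamp_(-eps, eps)           # project back into the ball
            delta.grad.zero_()
        model.zero_grad(set_to_none=True)
    return (input_embeds + delta).detach()

# adversarial training step: minimize the model loss on the attacked
# embeddings plus a standard loss on utility data to preserve helpfulness.
```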
Thereby, we present a path toward scalable\nadversarial training algorithms for robustly aligning LLMs.\n","authors":["Sophie Xhonneux","Alessandro Sordoni","Stephan Günnemann","Gauthier Gidel","Leo Schwinn"],"pdf_url":"https://arxiv.org/pdf/2405.15589v3.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.16121v2","updated":"2024-11-01T16:30:00Z","published":"2024-06-23T14:24:14Z","title":"Diffusion Spectral Representation for Reinforcement Learning","summary":" Diffusion-based models have achieved notable empirical successes in\nreinforcement learning (RL) due to their expressiveness in modeling complex\ndistributions. Despite existing methods being promising, the key challenge of\nextending existing methods for broader real-world applications lies in the\ncomputational cost at inference time, i.e., sampling from a diffusion model is\nconsiderably slow as it often requires tens to hundreds of iterations to\ngenerate even one sample. To circumvent this issue, we propose to leverage the\nflexibility of diffusion models for RL from a representation learning\nperspective. In particular, by exploiting the connection between diffusion\nmodels and energy-based models, we develop Diffusion Spectral Representation\n(Diff-SR), a coherent algorithm framework that enables extracting sufficient\nrepresentations for value functions in Markov decision processes (MDP) and\npartially observable Markov decision processes (POMDP). We further demonstrate\nhow Diff-SR facilitates efficient policy optimization and practical algorithms\nwhile explicitly bypassing the difficulty and inference cost of sampling from\nthe diffusion model. Finally, we provide comprehensive empirical studies to\nverify the benefits of Diff-SR in delivering robust and advantageous\nperformance across various benchmarks with both fully and partially observable\nsettings.\n","authors":["Dmitry Shribak","Chen-Xiao Gao","Yitong Li","Chenjun Xiao","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2406.16121v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.02234v2","updated":"2024-11-01T16:22:33Z","published":"2024-06-04T11:56:19Z","title":"On the Limitations of Fractal Dimension as a Measure of Generalization","summary":" Bounding and predicting the generalization gap of overparameterized neural\nnetworks remains a central open problem in theoretical machine learning. There\nis a recent and growing body of literature that proposes the framework of\nfractals to model optimization trajectories of neural networks, motivating\ngeneralization bounds and measures based on the fractal dimension of the\ntrajectory. Notably, the persistent homology dimension has been proposed to\ncorrelate with the generalization gap. This paper performs an empirical\nevaluation of these persistent homology-based generalization measures, with an\nin-depth statistical analysis. Our study reveals confounding effects in the\nobserved correlation between generalization and topological measures due to the\nvariation of hyperparameters. We also observe that fractal dimension fails to\npredict generalization of models trained from poor initializations. We lastly\nreveal the intriguing manifestation of model-wise double descent in these\ntopological generalization measures. Our work forms a basis for a deeper\ninvestigation of the causal relationships between fractal geometry, topological\ndata analysis, and neural network optimization.\n","authors":["Charlie B. Tan","Inés García-Redondo","Qiquan Wang","Michael M. 
Bronstein","Anthea Monod"],"pdf_url":"https://arxiv.org/pdf/2406.02234v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24198v2","updated":"2024-11-01T16:06:10Z","published":"2024-10-31T17:55:13Z","title":"SelfCodeAlign: Self-Alignment for Code Generation","summary":" Instruction tuning is a supervised fine-tuning approach that significantly\nimproves the ability of large language models (LLMs) to follow human\ninstructions. We propose SelfCodeAlign, the first fully transparent and\npermissive pipeline for self-aligning code LLMs without extensive human\nannotations or distillation. SelfCodeAlign employs the same base model for\ninference throughout the data generation process. It first extracts diverse\ncoding concepts from high-quality seed snippets to generate new tasks. It then\nsamples multiple responses per task, pairs each with test cases, and validates\nthem in a sandbox environment. Finally, passing examples are selected for\ninstruction tuning. In our primary experiments, we use SelfCodeAlign with\nCodeQwen1.5-7B to generate a dataset of 74k instruction-response pairs.\nFinetuning on this dataset leads to a model that achieves a 67.1 pass@1 on\nHumanEval+, surpassing CodeLlama-70B-Instruct despite being ten times smaller.\nAcross all benchmarks, this finetuned model consistently outperforms the\noriginal version trained with OctoPack, the previous state-of-the-art method\nfor instruction tuning without human annotations or distillation. Additionally,\nwe show that SelfCodeAlign is effective across LLMs of various sizes, from 3B\nto 33B, and that the base models can benefit more from alignment with their own\ndata distribution. We further validate each component's effectiveness in our\npipeline, showing that SelfCodeAlign outperforms both direct distillation from\nGPT-4o and leading GPT-3.5-based distillation methods, such as OSS-Instruct and\nEvol-Instruct. SelfCodeAlign has also led to the creation of\nStarCoder2-Instruct, the first fully transparent, permissively licensed, and\nself-aligned code LLM that achieves state-of-the-art coding performance.\n","authors":["Yuxiang Wei","Federico Cassano","Jiawei Liu","Yifeng Ding","Naman Jain","Zachary Mueller","Harm de Vries","Leandro von Werra","Arjun Guha","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.24198v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2404.05678v3","updated":"2024-11-01T16:03:23Z","published":"2024-04-08T16:57:44Z","title":"Flexible Fairness-Aware Learning via Inverse Conditional Permutation","summary":" Equalized odds, as a popular notion of algorithmic fairness, aims to ensure\nthat sensitive variables, such as race and gender, do not unfairly influence\nthe algorithm's prediction when conditioning on the true outcome. Despite rapid\nadvancements, current research primarily focuses on equalized odds violations\ncaused by a single sensitive attribute, leaving the challenge of simultaneously\naccounting for multiple attributes largely unaddressed. We bridge this gap by\nintroducing an in-processing fairness-aware learning approach, FairICP, which\nintegrates adversarial learning with a novel inverse conditional permutation\nscheme. FairICP offers a theoretically justified, flexible, and efficient\nscheme to promote equalized odds under fairness conditions described by complex\nand multidimensional sensitive attributes. 
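The execution-based filtering step in the SelfCodeAlign pipeline above (sample responses, pair each with test cases, keep only passing examples) reduces to something like the sketch below; a real sandbox needs far stronger isolation than a subprocess with a timeout.

```python
import subprocess
import sys
import tempfile

def passes_tests(solution_code: str, test_code: str, timeout: int = 10) -> bool:
    """Run a candidate response against its generated test cases in a
    separate process; only passing (instruction, response) pairs are kept
    for instruction tuning."""
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(solution_code + "\n\n" + test_code)  # tests assert behavior
        path = f.name
    try:
        result = subprocess.run([sys.executable, path],
                                capture_output=True, timeout=timeout)
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
```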
The efficacy and adaptability of our\nmethod are demonstrated through both simulation studies and empirical analyses\nof real-world datasets.\n","authors":["Yuheng Lai","Leying Guan"],"pdf_url":"https://arxiv.org/pdf/2404.05678v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17835v2","updated":"2024-11-01T15:55:34Z","published":"2024-01-31T13:52:11Z","title":"Simplifying Latent Dynamics with Softly State-Invariant World Models","summary":" To solve control problems via model-based reasoning or planning, an agent\nneeds to know how its actions affect the state of the world. The actions an\nagent has at its disposal often change the state of the environment in\nsystematic ways. However, existing techniques for world modelling do not\nguarantee that the effect of actions are represented in such systematic ways.\nWe introduce the Parsimonious Latent Space Model (PLSM), a world model that\nregularizes the latent dynamics to make the effect of the agent's actions more\npredictable. Our approach minimizes the mutual information between latent\nstates and the change that an action produces in the agent's latent state, in\nturn minimizing the dependence the state has on the dynamics. This makes the\nworld model softly state-invariant. We combine PLSM with different model\nclasses used for i) future latent state prediction, ii) planning, and iii)\nmodel-free reinforcement learning. We find that our regularization improves\naccuracy, generalization, and performance in downstream tasks, highlighting the\nimportance of systematic treatment of actions in world models.\n","authors":["Tankred Saanum","Peter Dayan","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2401.17835v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01684v2","updated":"2024-11-01T15:46:35Z","published":"2023-10-02T22:42:52Z","title":"Designing User-Centric Behavioral Interventions to Prevent Dysglycemia\n with Novel Counterfactual Explanations","summary":" Monitoring unexpected health events and taking actionable measures to avert\nthem beforehand is central to maintaining health and preventing disease.\nTherefore, a tool capable of predicting adverse health events and offering\nusers actionable feedback about how to make changes in their diet, exercise,\nand medication to prevent abnormal health events could have significant\nsocietal impacts. Counterfactual explanations can provide insights into why a\nmodel made a particular prediction by generating hypothetical instances that\nare similar to the original input but lead to a different prediction outcome.\nTherefore, counterfactuals can be viewed as a means to design AI-driven health\ninterventions to not only predict but also prevent adverse health outcomes such\nas blood glucose spikes, diabetes, and heart disease. In this paper, we design\n\\textit{\\textbf{ExAct}}, a novel model-agnostic framework for generating\ncounterfactual explanations for chronic disease prevention and management.\nLeveraging insights from adversarial learning, ExAct characterizes the decision\nboundary for high-dimensional data and performs a grid search to generate\nactionable interventions. ExAct is unique in integrating prior knowledge about\nuser preferences of feasible explanations into the process of counterfactual\ngeneration. ExAct is evaluated extensively using four real-world datasets and\nexternal simulators. 
With $82.8\\%$ average validity in the simulation-aided\nvalidation, ExAct surpasses the state-of-the-art techniques for generating\ncounterfactual explanations by at least $10\\%$. Besides, counterfactuals from\nExAct exhibit at least $6.6\\%$ improved proximity compared to previous\nresearch.\n","authors":["Asiful Arefeen","Hassan Ghasemzadeh"],"pdf_url":"https://arxiv.org/pdf/2310.01684v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03901v2","updated":"2024-11-01T15:19:18Z","published":"2024-10-04T20:08:24Z","title":"Improving Node Representation by Boosting Target-Aware Contrastive Loss","summary":" Graphs model complex relationships between entities, with nodes and edges\ncapturing intricate connections. Node representation learning involves\ntransforming nodes into low-dimensional embeddings. These embeddings are\ntypically used as features for downstream tasks. Therefore, their quality has a\nsignificant impact on task performance. Existing approaches for node\nrepresentation learning span (semi-)supervised, unsupervised, and\nself-supervised paradigms. In graph domains, (semi-)supervised learning often\nonly optimizes models based on class labels, neglecting other abundant graph\nsignals, which limits generalization. While self-supervised or unsupervised\nlearning produces representations that better capture underlying graph signals,\nthe usefulness of these captured signals for downstream target tasks can vary.\nTo bridge this gap, we introduce Target-Aware Contrastive Learning\n(Target-aware CL) which aims to enhance target task performance by maximizing\nthe mutual information between the target task and node representations with a\nself-supervised learning process. This is achieved through a sampling function,\nXGBoost Sampler (XGSampler), to sample proper positive examples for the\nproposed Target-Aware Contrastive Loss (XTCL). By minimizing XTCL, Target-aware\nCL increases the mutual information between the target task and node\nrepresentations, such that model generalization is improved. Additionally,\nXGSampler enhances the interpretability of each signal by showing the weights\nfor sampling the proper positive examples. We show experimentally that XTCL\nsignificantly improves the performance on two target tasks: node classification\nand link prediction tasks, compared to state-of-the-art models.\n","authors":["Ying-Chun Lin","Jennifer Neville"],"pdf_url":"https://arxiv.org/pdf/2410.03901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15699v2","updated":"2024-11-01T15:13:19Z","published":"2024-05-24T16:43:26Z","title":"Dimension-free deterministic equivalents for random feature regression","summary":" In this work we investigate the generalization performance of random feature\nridge regression (RFRR). Our main contribution is a general deterministic\nequivalent for the test error of RFRR. Specifically, under a certain\nconcentration property, we show that the test error is well approximated by a\nclosed-form expression that only depends on the feature map eigenvalues.\nNotably, our approximation guarantee is non-asymptotic, multiplicative, and\nindependent of the feature map dimension -- allowing for infinite-dimensional\nfeatures. We expect this deterministic equivalent to hold broadly beyond our\ntheoretical analysis, and we empirically validate its predictions on various\nreal and synthetic datasets. As an application, we derive sharp excess error\nrates under standard power-law assumptions of the spectrum and target decay. 
In\nparticular, we provide a tight result for the smallest number of features\nachieving the optimal minimax error rate.\n","authors":["Leonardo Defilippis","Bruno Loureiro","Theodor Misiakiewicz"],"pdf_url":"https://arxiv.org/pdf/2405.15699v2.pdf","comment":"NeurIPS 2024 camera-ready version"},{"id":"http://arxiv.org/abs/2410.22283v2","updated":"2024-11-01T15:00:44Z","published":"2024-10-29T17:36:10Z","title":"Leveraging Recurrent Neural Networks for Predicting Motor Movements from\n Primate Motor Cortex Neural Recordings","summary":" This paper presents an efficient deep learning solution for decoding motor\nmovements from neural recordings in non-human primates. An Autoencoder Gated\nRecurrent Unit (AEGRU) model was adopted as the model architecture for this\ntask. The autoencoder is only used during the training stage to achieve better\ngeneralization. Together with the preprocessing techniques, our model achieved a\n0.71 $R^2$ score, surpassing the baseline models in Neurobench and ranking\nfirst for $R^2$ in the IEEE BioCAS 2024 Grand Challenge on Neural Decoding.\nModel pruning is also applied, leading to a reduction of 41.4% in the\nmultiply-accumulate (MAC) operations with little change in the $R^2$ score\ncompared to the unpruned model.\n","authors":["Yuanxi Wang","Zuowen Wang","Shih-Chii Liu"],"pdf_url":"https://arxiv.org/pdf/2410.22283v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14940v3","updated":"2024-11-01T14:49:44Z","published":"2024-10-19T02:07:33Z","title":"Nova: A Practical and Advanced Alignment","summary":" We introduce Nova, a suite of practical alignment techniques employed in a\nseries of empirically validated high-performing models. This represents the\nfirst comprehensive account of alignment methodologies, offering valuable\ninsights for advancing AI research. We investigate the critical components that\nenhance model performance during the alignment process, including optimization\nmethods, data strategies, capability enhancements, and evaluation processes.\nThe process spans three key stages: Prompt Augmentation System (PAS), Supervised\nFine-Tuning (SFT), and Preference Alignment. The problems encountered, the\nsolutions applied, and the improvements made are thoroughly recorded.\n Through comparisons across well-established benchmarks, we highlight the\ntechnological advancements enabled by Nova Alignment. Importantly,\nQwen2-Nova-72B and Llama3-PBM-Nova-70B are instruct versions of the Qwen2-72B\nand Llama-3-70B base models, optimized through Nova. The Nova models show\nsignificant core improvements, with user experience gains of 17% to 28%, and\nexcel on specialized benchmarks. In open-source benchmark evaluations, both\nQwen2-Nova-72B and Llama3-PBM-Nova-70B consistently outperform their respective\nofficial instruct versions across nearly all datasets. This report aims to\nclarify the key technologies behind the alignment process, fostering a deeper\nunderstanding within the community. 
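A schematic of the AEGRU design described above: the autoencoder branch reconstructs the neural input only during training, while the GRU encoder plus regression readout are what run (and get pruned) at inference. Sizes are placeholders, not the challenge-winning configuration.

```python
import torch
import torch.nn as nn

class AEGRU(nn.Module):
    def __init__(self, n_channels: int, d_hidden: int = 64, d_out: int = 2):
        super().__init__()
        self.encoder = nn.GRU(n_channels, d_hidden, batch_first=True)
        self.decoder = nn.Linear(d_hidden, n_channels)  # training-only branch
        self.readout = nn.Linear(d_hidden, d_out)       # movement prediction

    def forward(self, x: torch.Tensor, training: bool = True):
        h, _ = self.encoder(x)          # x: (batch, time, n_channels)
        y = self.readout(h)             # decoded movement per time step
        if training:
            return y, self.decoder(h)   # also reconstruct the neural input
        return y

# loss = mse(y, target) + beta * mse(recon, x); at test time only the GRU
# and readout run, which is where pruning MAC operations pays off.
```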
Llama3-PBM-Nova-70B model is available at\nhttps://huggingface.co/PKU-Baichuan-MLSystemLab/Llama3-PBM-Nova-70B.\n","authors":["Mingan Lin","Fan Yang","Yanjun Shen","Haoze Sun","Tianpeng Li","Tao Zhang","Chenzheng Zhu","Tao Zhang","Miao Zheng","Xu Li","Yijie Zhou","Mingyang Chen","Yanzhao Qin","Youquan Li","Hao Liang","Fei Li","Yadong Li","Mang Wang","Guosheng Dong","Kun Fang","Jianhua Xu","Bin Cui","Wentao Zhang","Zenan Zhou","Weipeng Chen"],"pdf_url":"https://arxiv.org/pdf/2410.14940v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07955v2","updated":"2024-11-01T14:45:44Z","published":"2023-12-13T08:01:15Z","title":"Erasing Self-Supervised Learning Backdoor by Cluster Activation Masking","summary":" Self-Supervised Learning (SSL) is an effective paradigm for learning\nrepresentations from unlabeled data, such as text, images, and videos. However,\nresearchers have recently found that SSL is vulnerable to backdoor attacks. The\nattacker can embed hidden SSL backdoors via a few poisoned examples in the\ntraining dataset and maliciously manipulate the behavior of downstream models.\nTo defend against SSL backdoor attacks, a feasible route is to detect and\nremove the poisonous samples in the training set. However, existing SSL\nbackdoor defense methods fail to detect the poisonous samples precisely. In\nthis paper, we propose PoisonCAM, a novel method that erases the SSL backdoor\nby cluster activation masking. After obtaining the threat model trained\non the poisoned dataset, our method can precisely detect poisonous samples\nbased on the assumption that masking the backdoor trigger can effectively\nchange the activation of a downstream clustering model. In experiments, our\nPoisonCAM achieves 96\% accuracy for backdoor trigger detection compared to 3\%\nfor the state-of-the-art method on poisoned ImageNet-100. Moreover, our proposed\nPoisonCAM significantly improves the performance of the trained SSL model under\nbackdoor attacks compared to the state-of-the-art method. Our code, data, and\ntrained models will be open-sourced once this paper is accepted.\n","authors":["Shengsheng Qian","Dizhan Xue","Yifei Wang","Shengjie Zhang","Huaiwen Zhang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02552v2","updated":"2024-11-01T14:44:44Z","published":"2024-02-04T15:54:37Z","title":"Neur2BiLO: Neural Bilevel Optimization","summary":" Bilevel optimization deals with nested problems in which a leader takes the\nfirst decision to minimize their objective function while accounting for a\nfollower's best-response reaction. Constrained bilevel problems with integer\nvariables are particularly notorious for their hardness. While exact solvers\nhave been proposed for mixed-integer linear bilevel optimization, they tend to\nscale poorly with problem size and are hard to generalize to the non-linear\ncase. On the other hand, problem-specific algorithms (exact and heuristic) are\nlimited in scope. 
Under a data-driven setting in which similar instances of a\nbilevel problem are solved routinely, our proposed framework, Neur2BiLO, embeds\na neural network approximation of the leader's or follower's value function,\ntrained via supervised regression, into an easy-to-solve mixed-integer program.\nNeur2BiLO serves as a heuristic that produces high-quality solutions extremely\nfast for four applications with linear and non-linear objectives and pure and\nmixed-integer variables.\n","authors":["Justin Dumouchelle","Esther Julien","Jannis Kurtz","Elias B. Khalil"],"pdf_url":"https://arxiv.org/pdf/2402.02552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00132v2","updated":"2024-11-01T14:36:49Z","published":"2024-05-31T18:47:30Z","title":"QuanTA: Efficient High-Rank Fine-Tuning of LLMs with Quantum-Informed\n Tensor Adaptation","summary":" We propose Quantum-informed Tensor Adaptation (QuanTA), a novel,\neasy-to-implement, fine-tuning method with no inference overhead for\nlarge-scale pre-trained language models. By leveraging quantum-inspired methods\nderived from quantum circuit structures, QuanTA enables efficient high-rank\nfine-tuning, surpassing the limitations of Low-Rank Adaptation (LoRA)--low-rank\napproximation may fail for complicated downstream tasks. Our approach is\ntheoretically supported by the universality theorem and the rank representation\ntheorem to achieve efficient high-rank adaptations. Experiments demonstrate\nthat QuanTA significantly enhances commonsense reasoning, arithmetic reasoning,\nand scalability compared to traditional methods. Furthermore, QuanTA shows\nsuperior performance with fewer trainable parameters compared to other\napproaches and can be designed to integrate with existing fine-tuning\nalgorithms for further improvement, providing a scalable and efficient solution\nfor fine-tuning large language models and advancing state-of-the-art in natural\nlanguage processing.\n","authors":["Zhuo Chen","Rumen Dangovski","Charlotte Loh","Owen Dugan","Di Luo","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2406.00132v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17423v3","updated":"2024-11-01T14:32:12Z","published":"2024-02-27T11:32:14Z","title":"Reinforced In-Context Black-Box Optimization","summary":" Black-Box Optimization (BBO) has found successful applications in many fields\nof science and engineering. Recently, there has been a growing interest in\nmeta-learning particular components of BBO algorithms to speed up optimization\nand get rid of tedious hand-crafted heuristics. As an extension, learning the\nentire algorithm from data requires the least labor from experts and can\nprovide the most flexibility. In this paper, we propose RIBBO, a method to\nreinforce-learn a BBO algorithm from offline data in an end-to-end fashion.\nRIBBO employs expressive sequence models to learn the optimization histories\nproduced by multiple behavior algorithms and tasks, leveraging the in-context\nlearning ability of large models to extract task information and make decisions\naccordingly. Central to our method is to augment the optimization histories\nwith \\textit{regret-to-go} tokens, which are designed to represent the\nperformance of an algorithm based on cumulative regret over the future part of\nthe histories. 
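The regret-to-go augmentation that RIBBO applies to optimization histories can be computed as below for a minimization problem; the exact normalization and tokenization in the paper may differ from this sketch.

```python
def regret_to_go(history_values, f_optimum):
    """history_values: objective values y_1..y_T produced by one behavior
    algorithm; f_optimum: the problem's optimal value (known offline)."""
    regrets = [y - f_optimum for y in history_values]   # per-step regret
    rtg, running = [], sum(regrets)
    for r in regrets:
        rtg.append(running)   # regret accumulated over the future part
        running -= r
    return rtg

# e.g. values [3.0, 1.0, 0.5] with optimum 0.0 -> tokens [4.5, 1.5, 0.5];
# at inference the sequence model is conditioned on a small desired
# regret-to-go so it generates query points matching that performance.
```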
The integration of regret-to-go tokens enables RIBBO to\nautomatically generate sequences of query points that satisfy the user-desired\nregret, which is verified by its universally good empirical performance on\ndiverse problems, including BBO benchmark functions, hyper-parameter\noptimization and robot control problems.\n","authors":["Lei Song","Chenxiao Gao","Ke Xue","Chenyang Wu","Dong Li","Jianye Hao","Zongzhang Zhang","Chao Qian"],"pdf_url":"https://arxiv.org/pdf/2402.17423v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18947v3","updated":"2024-11-01T13:53:44Z","published":"2024-04-27T07:22:28Z","title":"Multimodal Fusion on Low-quality Data: A Comprehensive Survey","summary":" Multimodal fusion focuses on integrating information from multiple modalities\nwith the goal of more accurate prediction, which has achieved remarkable\nprogress in a wide range of scenarios, including autonomous driving and medical\ndiagnosis. However, the reliability of multimodal fusion remains largely\nunexplored especially under low-quality data settings. This paper surveys the\ncommon challenges and recent advances of multimodal fusion in the wild and\npresents them in a comprehensive taxonomy. From a data-centric view, we\nidentify four main challenges that are faced by multimodal fusion on\nlow-quality data, namely (1) noisy multimodal data that are contaminated with\nheterogeneous noises, (2) incomplete multimodal data that some modalities are\nmissing, (3) imbalanced multimodal data that the qualities or properties of\ndifferent modalities are significantly different and (4) quality-varying\nmultimodal data that the quality of each modality dynamically changes with\nrespect to different samples. This new taxonomy will enable researchers to\nunderstand the state of the field and identify several potential directions. We\nalso provide discussion for the open problems in this field together with\ninteresting future research directions.\n","authors":["Qingyang Zhang","Yake Wei","Zongbo Han","Huazhu Fu","Xi Peng","Cheng Deng","Qinghua Hu","Cai Xu","Jie Wen","Di Hu","Changqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.18947v3.pdf","comment":"Feel free to comment on our manuscript: qingyangzhang@tju$.$edu$.$cn"},{"id":"http://arxiv.org/abs/2402.13622v2","updated":"2024-11-01T13:33:58Z","published":"2024-02-21T08:50:33Z","title":"Analysis of Bootstrap and Subsampling in High-dimensional Regularized\n Regression","summary":" We investigate popular resampling methods for estimating the uncertainty of\nstatistical models, such as subsampling, bootstrap and the jackknife, and their\nperformance in high-dimensional supervised regression tasks. We provide a tight\nasymptotic description of the biases and variances estimated by these methods\nin the context of generalized linear models, such as ridge and logistic\nregression, taking the limit where the number of samples $n$ and dimension $d$\nof the covariates grow at a comparable fixed rate $\\alpha\\!=\\! n/d$. 
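As a concrete instance of the resampling procedures whose high-dimensional behavior the regression paper above characterizes, here is a bootstrap error bar for ridge regression at a fixed ratio alpha = n/d; the paper's point is that such bars can be badly miscalibrated in this regime.

```python
import numpy as np

def bootstrap_ridge_std(X, y, lam=1.0, n_boot=200, seed=0):
    """Per-coordinate bootstrap standard errors for ridge coefficients."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    coefs = []
    for _ in range(n_boot):
        idx = rng.integers(0, n, size=n)        # resample rows w/ replacement
        Xb, yb = X[idx], y[idx]
        coefs.append(np.linalg.solve(Xb.T @ Xb + lam * np.eye(d), Xb.T @ yb))
    return np.std(np.array(coefs), axis=0)

n, d = 200, 100                                 # alpha = n/d = 2
rng = np.random.default_rng(1)
X = rng.standard_normal((n, d))
y = X @ rng.standard_normal(d) / np.sqrt(d) + 0.1 * rng.standard_normal(n)
print(bootstrap_ridge_std(X, y)[:5])            # error bars, first 5 coords
```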
Our\nfindings are three-fold: i) resampling methods are fraught with problems in\nhigh dimensions and exhibit the double-descent-like behavior typical of these\nsituations; ii) only when $\alpha$ is large enough do they provide consistent\nand reliable error estimations (we give convergence rates); iii) in the\nover-parametrized regime $\alpha\!<\!1$ relevant to modern machine learning\npractice, their predictions are not consistent, even with optimal\nregularization.\n","authors":["Lucas Clarté","Adrien Vandenbroucque","Guillaume Dalle","Bruno Loureiro","Florent Krzakala","Lenka Zdeborová"],"pdf_url":"https://arxiv.org/pdf/2402.13622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19047v3","updated":"2024-11-01T13:28:59Z","published":"2024-02-29T11:20:16Z","title":"Theoretical Foundations of Deep Selective State-Space Models","summary":" Structured state-space models (SSMs) such as S4, stemming from the seminal\nwork of Gu et al., are gaining popularity as effective approaches for modeling\nsequential data. Deep SSMs demonstrate outstanding performance across a diverse\nset of domains, at a reduced training and inference cost compared to\nattention-based transformers. Recent developments show that if the linear\nrecurrence powering SSMs allows for multiplicative interactions between inputs\nand hidden states (e.g. GateLoop, Mamba, GLA), then the resulting architecture\ncan surpass attention-powered foundation models trained on text in both\naccuracy and efficiency, at scales of billions of parameters. In this paper, we give\ntheoretical grounding to this recent finding using tools from Rough Path\nTheory: we show that when random linear recurrences are equipped with simple\ninput-controlled transitions (selectivity mechanism), then the hidden state is\nprovably a low-dimensional projection of a powerful mathematical object called\nthe signature of the input -- capturing non-linear interactions between tokens\nat distinct timescales. Our theory not only motivates the success of modern\nselective state-space models such as Mamba but also provides a solid framework\nto understand the expressive power of future SSM variants.\n","authors":["Nicola Muca Cirone","Antonio Orvieto","Benjamin Walker","Cristopher Salvi","Terry Lyons"],"pdf_url":"https://arxiv.org/pdf/2402.19047v3.pdf","comment":"NeurIPS Version w/ minor edits"},{"id":"http://arxiv.org/abs/2210.08650v3","updated":"2024-11-01T13:02:25Z","published":"2022-10-16T22:28:36Z","title":"Accelerating Transfer Learning with Near-Data Computation on Cloud\n Object Stores","summary":" Storage disaggregation underlies today's cloud and is naturally complemented\nby pushing down some computation to storage, thus mitigating the potential\nnetwork bottleneck between the storage and compute tiers. We show how ML\ntraining benefits from storage pushdowns by focusing on transfer learning (TL),\nthe widespread technique that democratizes ML by reusing existing knowledge on\nrelated tasks. We propose HAPI, a new TL processing system centered around two\ncomplementary techniques that address challenges introduced by disaggregation.\nFirst, applications must carefully balance execution across tiers for\nperformance. HAPI judiciously splits the TL computation during the feature\nextraction phase yielding pushdowns that not only improve network time but also\nimprove total TL training time by overlapping the execution of consecutive\ntraining iterations across tiers. 
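The "input-controlled transition" (selectivity mechanism) that the rough-path analysis above studies amounts to a gated linear recurrence in which the transition depends on the current token; the loop below is a didactic sketch of that multiplicative interaction, not an optimized Mamba-style scan.

```python
import torch

def selective_scan(x: torch.Tensor, W_a: torch.Tensor, W_b: torch.Tensor):
    """x: (T, d) input tokens; W_a, W_b: (d, d) maps producing per-token
    transition gates and write-in values."""
    T, d = x.shape
    h = torch.zeros(d)
    states = []
    for t in range(T):
        a = torch.sigmoid(x[t] @ W_a)   # input-dependent transition (gate)
        b = x[t] @ W_b                  # input-dependent update
        h = a * h + (1 - a) * b         # multiplicative interaction with x_t
        states.append(h)
    return torch.stack(states)          # hidden state per token

x = torch.randn(6, 4)
out = selective_scan(x, torch.randn(4, 4), torch.randn(4, 4))
```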
Second, operators want resource efficiency\nfrom the storage-side computational resources. HAPI employs storage-side batch\nsize adaptation allowing increased storage-side pushdown concurrency without\naffecting training accuracy. HAPI yields up to 2.5x training speed-up while\nchoosing in 86.8% of cases the best performing split point or one that is at\nmost 5% off from the best.\n","authors":["Diana Petrescu","Arsany Guirguis","Do Le Quoc","Javier Picorel","Rachid Guerraoui","Florin Dinu"],"pdf_url":"https://arxiv.org/pdf/2210.08650v3.pdf","comment":"To appear in the proceedings of SoCC '24"},{"id":"http://arxiv.org/abs/2311.12056v3","updated":"2024-11-01T12:54:28Z","published":"2023-11-18T13:55:05Z","title":"Kuro Siwo: 33 billion $m^2$ under the water. A global multi-temporal\n satellite dataset for rapid flood mapping","summary":" Global floods, exacerbated by climate change, pose severe threats to human\nlife, infrastructure, and the environment. Recent catastrophic events in\nPakistan and New Zealand underscore the urgent need for precise flood mapping\nto guide restoration efforts, understand vulnerabilities, and prepare for\nfuture occurrences. While Synthetic Aperture Radar (SAR) remote sensing offers\nday-and-night, all-weather imaging capabilities, its application in deep\nlearning for flood segmentation is limited by the lack of large annotated\ndatasets. To address this, we introduce Kuro Siwo, a manually annotated\nmulti-temporal dataset, spanning 43 flood events globally. Our dataset maps\nmore than 338 billion $m^2$ of land, with 33 billion designated as either\nflooded areas or permanent water bodies. Kuro Siwo includes a highly processed\nproduct optimized for flood mapping based on SAR Ground Range Detected, and a\nprimal SAR Single Look Complex product with minimal preprocessing, designed to\npromote research on the exploitation of both the phase and amplitude\ninformation and to offer maximum flexibility for downstream task preprocessing.\nTo leverage advances in large scale self-supervised pretraining methods for\nremote sensing data, we augment Kuro Siwo with a large unlabeled set of SAR\nsamples. Finally, we provide an extensive benchmark, namely BlackBench,\noffering strong baselines for a diverse set of flood events from Europe,\nAmerica, Africa, Asia and Australia.\n","authors":["Nikolaos Ioannis Bountos","Maria Sdraka","Angelos Zavras","Ilektra Karasante","Andreas Karavias","Themistocles Herekakis","Angeliki Thanasou","Dimitrios Michail","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2311.12056v3.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2409.15246v3","updated":"2024-11-01T12:49:19Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems are crucial for cartography, disaster\nsurveillance, and resource administration. Nonetheless, they encounter\nconsiderable obstacles in the processing and transmission of extensive data,\nespecially in specialized domains such as precision agriculture and real-time\ndisaster response. Earth observation satellites, outfitted with remote sensing\ntechnology, gather data from onboard sensors and IoT-enabled terrestrial\nobjects, delivering important information remotely. 
Domain-adapted Large\nLanguage Models (LLMs) provide a solution by enabling the integration of raw\nand processed EO data. Through domain adaptation, LLMs improve the assimilation\nand analysis of many data sources, tackling the intricacies of specialized\ndatasets in agriculture and disaster response. This data synthesis, directed by\nLLMs, enhances the precision and pertinence of conveyed information. This study\nprovides a thorough examination of using semantic inference and deep learning\nfor sophisticated EO systems. It presents an innovative architecture for\nsemantic communication in EO satellite networks, designed to improve data\ntransmission efficiency using semantic processing methodologies. Recent\nadvancements in onboard processing technologies enable dependable, adaptable,\nand energy-efficient data management in orbit. These improvements guarantee\nreliable performance in adverse space circumstances using radiation-hardened\nand reconfigurable technology. Collectively, these advancements enable\nnext-generation satellite missions with improved processing capabilities,\ncrucial for operational flexibility and real-time decision-making in 6G\nsatellite communication.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v3.pdf","comment":"17 pages, 7 figures, Journal"},{"id":"http://arxiv.org/abs/2405.16405v2","updated":"2024-11-01T12:15:36Z","published":"2024-05-26T02:12:02Z","title":"Intruding with Words: Towards Understanding Graph Injection Attacks at\n the Text Level","summary":" Graph Neural Networks (GNNs) excel across various applications but remain\nvulnerable to adversarial attacks, particularly Graph Injection Attacks (GIAs),\nwhich inject malicious nodes into the original graph and pose realistic\nthreats. Text-attributed graphs (TAGs), where nodes are associated with textual\nfeatures, are crucial due to their prevalence in real-world applications and\nare commonly used to evaluate these vulnerabilities. However, existing research\nonly focuses on embedding-level GIAs, which inject node embeddings rather than\nactual textual content, limiting their applicability and simplifying detection.\nIn this paper, we pioneer the exploration of GIAs at the text level, presenting\nthree novel attack designs that inject textual content into the graph. Through\ntheoretical and empirical analysis, we demonstrate that text interpretability,\na factor previously overlooked at the embedding level, plays a crucial role in\nattack strength. Among the designs we investigate, the Word-frequency-based\nText-level GIA (WTGIA) is particularly notable for its balance between\nperformance and interpretability. Despite the success of WTGIA, we discover\nthat defenders can easily enhance their defenses with customized text embedding\nmethods or large language model (LLM)--based predictors. 
These insights\nunderscore the necessity for further research into the potential and practical\nsignificance of text-level GIAs.\n","authors":["Runlin Lei","Yuwei Hu","Yuchen Ren","Zhewei Wei"],"pdf_url":"https://arxiv.org/pdf/2405.16405v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2402.06963v3","updated":"2024-11-01T11:46:22Z","published":"2024-02-10T14:36:31Z","title":"Tree Ensembles for Contextual Bandits","summary":" We propose a new framework for contextual multi-armed bandits based on tree\nensembles. Our framework adapts two widely used bandit methods, Upper\nConfidence Bound and Thompson Sampling, for both standard and combinatorial\nsettings. As part of this framework, we propose a novel method of estimating\nthe uncertainty in tree ensemble predictions. We further demonstrate the\neffectiveness of our framework via several experimental studies, employing\nXGBoost and random forests, two popular tree ensemble methods. Compared to\nstate-of-the-art methods based on decision trees and neural networks, our\nmethods exhibit superior performance in terms of both regret minimization and\ncomputational runtime, when applied to benchmark datasets and the real-world\napplication of navigation over road networks.\n","authors":["Hannes Nilsson","Rikard Johansson","Niklas Åkerblom","Morteza Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2402.06963v3.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2410.06726v2","updated":"2024-11-01T10:28:17Z","published":"2024-10-09T09:50:06Z","title":"Bounds and Sensitivity Analysis of the Causal Effect Under\n Outcome-Independent MNAR Confounding","summary":" We report assumption-free bounds for any contrast between the probabilities\nof the potential outcome under exposure and non-exposure when the confounders\nare missing not at random. We assume that the missingness mechanism is\noutcome-independent. We also report a sensitivity analysis method to complement\nour bounds.\n","authors":["Jose M. Peña"],"pdf_url":"https://arxiv.org/pdf/2410.06726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05300v4","updated":"2024-11-01T10:19:01Z","published":"2024-03-08T13:29:46Z","title":"Unity by Diversity: Improved Representation Learning in Multimodal VAEs","summary":" Variational Autoencoders for multimodal data hold promise for many tasks in\ndata analysis, such as representation learning, conditional generation, and\nimputation. Current architectures either share the encoder output, decoder\ninput, or both across modalities to learn a shared representation. Such\narchitectures impose hard constraints on the model. In this work, we show that\na better latent representation can be obtained by replacing these hard\nconstraints with a soft constraint. We propose a new mixture-of-experts prior,\nsoftly guiding each modality's latent representation towards a shared aggregate\nposterior. This approach results in a superior latent representation and allows\neach encoding to preserve information better from its uncompressed original\nfeatures. In extensive experiments on multiple benchmark datasets and two\nchallenging real-world datasets, we show improved learned latent\nrepresentations and imputation of missing data modalities compared to existing\nmethods.\n","authors":["Thomas M. Sutter","Yang Meng","Andrea Agostini","Daphné Chopard","Norbert Fortin","Julia E. 
Vogt","Bahbak Shahbaba","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2403.05300v4.pdf","comment":"Accepted at Neurips 2024"},{"id":"http://arxiv.org/abs/2405.14669v2","updated":"2024-11-01T09:56:53Z","published":"2024-05-23T15:06:02Z","title":"Efficiency for Free: Ideal Data Are Transportable Representations","summary":" Data, the seminal opportunity and challenge in modern machine learning,\ncurrently constrains the scalability of representation learning and impedes the\npace of model evolution. In this work, we investigate the efficiency properties\nof data from both optimization and generalization perspectives. Our theoretical\nand empirical analysis reveals an unexpected finding: for a given task,\nutilizing a publicly available, task- and architecture-agnostic model (referred\nto as the `prior model' in this paper) can effectively produce efficient data.\nBuilding on this insight, we propose the Representation Learning Accelerator\n(\\algopt), which promotes the formation and utilization of efficient data,\nthereby accelerating representation learning. Utilizing a ResNet-18 pre-trained\non CIFAR-10 as a prior model to inform ResNet-50 training on ImageNet-1K\nreduces computational costs by 50% while maintaining the same accuracy as the\nmodel trained with the original BYOL, which requires 100% cost. Our code is\navailable at: \\url{https://github.com/LINs-lab/ReLA}.\n","authors":["Peng Sun","Yi Jiang","Tao Lin"],"pdf_url":"https://arxiv.org/pdf/2405.14669v2.pdf","comment":"Code: https://github.com/LINs-lab/ReLA"},{"id":"http://arxiv.org/abs/2405.20778v2","updated":"2024-11-01T09:53:53Z","published":"2024-05-28T06:10:12Z","title":"Improved Generation of Adversarial Examples Against Safety-aligned LLMs","summary":" Adversarial prompts generated using gradient-based methods exhibit\noutstanding performance in performing automatic jailbreak attacks against\nsafety-aligned LLMs. Nevertheless, due to the discrete nature of texts, the\ninput gradient of LLMs struggles to precisely reflect the magnitude of loss\nchange that results from token replacements in the prompt, leading to limited\nattack success rates against safety-aligned LLMs, even in the white-box\nsetting. In this paper, we explore a new perspective on this problem,\nsuggesting that it can be alleviated by leveraging innovations inspired in\ntransfer-based attacks that were originally proposed for attacking black-box\nimage classification models. For the first time, we appropriate the ideologies\nof effective methods among these transfer-based attacks, i.e., Skip Gradient\nMethod and Intermediate Level Attack, into gradient-based adversarial prompt\ngeneration and achieve significant performance gains without introducing\nobvious computational cost. Meanwhile, by discussing mechanisms behind the\ngains, new insights are drawn, and proper combinations of these methods are\nalso developed. Our empirical results show that 87% of the query-specific\nadversarial suffixes generated by the developed combination can induce\nLlama-2-7B-Chat to produce the output that exactly matches the target string on\nAdvBench. This match rate is 33% higher than that of a very strong baseline\nknown as GCG, demonstrating advanced discrete optimization for adversarial\nprompt generation against LLMs. 
In addition, without introducing obvious cost,\nthe combination achieves >30% absolute increase in attack success rates\ncompared with GCG when generating both query-specific (38% -> 68%) and\nuniversal adversarial prompts (26.68% -> 60.32%) for attacking the\nLlama-2-7B-Chat model on AdvBench. Code at:\nhttps://github.com/qizhangli/Gradient-based-Jailbreak-Attacks.\n","authors":["Qizhang Li","Yiwen Guo","Wangmeng Zuo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23495v2","updated":"2024-11-01T09:49:24Z","published":"2024-10-30T22:57:54Z","title":"DASH: Warm-Starting Neural Network Training in Stationary Settings\n without Loss of Plasticity","summary":" Warm-starting neural network training by initializing networks with\npreviously learned weights is appealing, as practical neural networks are often\ndeployed under a continuous influx of new data. However, it often leads to loss\nof plasticity, where the network loses its ability to learn new information,\nresulting in worse generalization than training from scratch. This occurs even\nunder stationary data distributions, and its underlying mechanism is poorly\nunderstood. We develop a framework emulating real-world neural network training\nand identify noise memorization as the primary cause of plasticity loss when\nwarm-starting on stationary data. Motivated by this, we propose Direction-Aware\nSHrinking (DASH), a method aiming to mitigate plasticity loss by selectively\nforgetting memorized noise while preserving learned features. We validate our\napproach on vision tasks, demonstrating improvements in test accuracy and\ntraining efficiency.\n","authors":["Baekrok Shin","Junsoo Oh","Hanseul Cho","Chulhee Yun"],"pdf_url":"https://arxiv.org/pdf/2410.23495v2.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.19192v2","updated":"2024-11-01T09:45:29Z","published":"2024-10-24T22:50:21Z","title":"TEAM: Topological Evolution-aware Framework for Traffic\n Forecasting--Extended Version","summary":" Due to the global trend towards urbanization, people increasingly move to and\nlive in cities that then continue to grow. Traffic forecasting plays an\nimportant role in the intelligent transportation systems of cities as well as\nin spatio-temporal data mining. State-of-the-art forecasting is achieved by\ndeep-learning approaches due to their ability to contend with complex\nspatio-temporal dynamics. However, existing methods assume the input is\nfixed-topology road networks and static traffic time series. These assumptions\nfail to align with urbanization, where time series are collected continuously\nand road networks evolve over time. In such settings, deep-learning models\nrequire frequent re-initialization and re-training, imposing high computational\ncosts. To enable much more efficient training without jeopardizing model\naccuracy, we propose the Topological Evolution-aware Framework (TEAM) for\ntraffic forecasting that incorporates convolution and attention. This\ncombination of mechanisms enables better adaptation to newly collected time\nseries, while being able to maintain learned knowledge from old time series.\nTEAM features a continual learning module based on the Wasserstein metric that\nacts as a buffer that can identify the most stable and the most changing\nnetwork nodes. Then, only data related to stable nodes is employed for\nre-training when consolidating a model. 
Further, only data of new nodes and\ntheir adjacent nodes as well as data pertaining to changing nodes are used to\nre-train the model. Empirical studies with two real-world traffic datasets\noffer evidence that TEAM is capable of much lower re-training costs than\nexisting methods are, without jeopardizing forecasting accuracy.\n","authors":["Duc Kieu","Tung Kieu","Peng Han","Bin Yang","Christian S. Jensen","Bac Le"],"pdf_url":"https://arxiv.org/pdf/2410.19192v2.pdf","comment":"16 pages. An extended version of \"TEAM: Topological Evolution-aware\n Framework for Traffic Forecasting\" accepted at PVLDB 2025"},{"id":"http://arxiv.org/abs/2410.24070v2","updated":"2024-11-01T09:41:09Z","published":"2024-10-31T16:07:21Z","title":"Dynamical similarity analysis uniquely captures how computations develop\n in RNNs","summary":" Methods for analyzing representations in neural systems are increasingly\npopular tools in neuroscience and mechanistic interpretability. Measures\ncomparing neural activations across conditions, architectures, and species give\nscalable ways to understand information transformation within different neural\nnetworks. However, recent findings show that some metrics respond to spurious\nsignals, leading to misleading results. Establishing benchmark test cases is\nthus essential for identifying the most reliable metric and potential\nimprovements. We propose that compositional learning in recurrent neural\nnetworks (RNNs) can provide a test case for dynamical representation alignment\nmetrics. Implementing this case allows us to evaluate if metrics can identify\nrepresentations that develop throughout learning and determine if\nrepresentations identified by metrics reflect the network's actual\ncomputations. Building both attractor and RNN based test cases, we show that\nthe recently proposed Dynamical Similarity Analysis (DSA) is more noise robust\nand reliably identifies behaviorally relevant representations compared to prior\nmetrics (Procrustes, CKA). We also demonstrate how such test cases can extend\nbeyond metric evaluation to study new architectures. Specifically, testing DSA\nin modern (Mamba) state space models suggests that these models, unlike RNNs,\nmay not require changes in recurrent dynamics due to their expressive hidden\nstates. Overall, we develop test cases that showcase how DSA's enhanced ability\nto detect dynamical motifs makes it highly effective for identifying ongoing\ncomputations in RNNs and revealing how networks learn tasks.\n","authors":["Quentin Guilhot","Michał Wójcik","Jascha Achterberg","Rui Ponte Costa"],"pdf_url":"https://arxiv.org/pdf/2410.24070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14716v3","updated":"2024-11-01T09:38:59Z","published":"2024-10-11T13:17:19Z","title":"A Systematic Survey on Large Language Models for Algorithm Design","summary":" Algorithm Design (AD) is crucial for effective problem-solving across various\ndomains. The advent of Large Language Models (LLMs) has notably enhanced the\nautomation and innovation within this field, offering new perspectives and\npromising solutions. Over the past three years, the integration of LLMs into AD\n(LLM4AD) has seen substantial progress, with applications spanning\noptimization, machine learning, mathematical reasoning, and scientific\ndiscovery. Given the rapid advancements and expanding scope of this field, a\nsystematic review is both timely and necessary. This paper provides a\nsystematic review of LLM4AD. 
First, we offer an overview and summary of\nexisting studies. Then, we introduce a taxonomy and review the literature\nacross four dimensions: the roles of LLMs, search methods, prompt methods, and\napplication domains with a discussion of potential and achievements of LLMs in\nAD. Finally, we identify current challenges and highlight several promising\ndirections for future research.\n","authors":["Fei Liu","Yiming Yao","Ping Guo","Zhiyuan Yang","Zhe Zhao","Xi Lin","Xialiang Tong","Mingxuan Yuan","Zhichao Lu","Zhenkun Wang","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.14716v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16208v4","updated":"2024-11-01T09:07:41Z","published":"2023-06-28T13:43:46Z","title":"Continuous-time q-learning for mean-field control problems","summary":" This paper studies the q-learning, recently coined as the continuous time\ncounterpart of Q-learning by Jia and Zhou (2023), for continuous time\nMckean-Vlasov control problems in the setting of entropy-regularized\nreinforcement learning. In contrast to the single agent's control problem in\nJia and Zhou (2023), the mean-field interaction of agents renders the\ndefinition of the q-function more subtle, for which we reveal that two distinct\nq-functions naturally arise: (i) the integrated q-function (denoted by $q$) as\nthe first-order approximation of the integrated Q-function introduced in Gu,\nGuo, Wei and Xu (2023), which can be learnt by a weak martingale condition\ninvolving test policies; and (ii) the essential q-function (denoted by $q_e$)\nthat is employed in the policy improvement iterations. We show that two\nq-functions are related via an integral representation under all test policies.\nBased on the weak martingale condition and our proposed searching method of\ntest policies, some model-free learning algorithms are devised. In two\nexamples, one in LQ control framework and one beyond LQ control framework, we\ncan obtain the exact parameterization of the optimal value function and\nq-functions and illustrate our algorithms with simulation experiments.\n","authors":["Xiaoli Wei","Xiang Yu"],"pdf_url":"https://arxiv.org/pdf/2306.16208v4.pdf","comment":"Keywords: Continuous-time reinforcement learning, integrated\n q-function, mean-field control, weak martingale characterization, test\n policies"},{"id":"http://arxiv.org/abs/2404.05019v2","updated":"2024-11-01T08:55:43Z","published":"2024-04-07T17:17:23Z","title":"Shortcut-connected Expert Parallelism for Accelerating\n Mixture-of-Experts","summary":" Expert parallelism has been introduced as a strategy to distribute the\ncomputational workload of sparsely-gated mixture-of-experts (MoE) models across\nmultiple computing devices, facilitating the execution of these increasingly\nlarge-scale models. However, the All-to-All communication intrinsic to expert\nparallelism constitutes a significant overhead, diminishing the MoE models'\nefficiency. Current optimization approaches offer some relief, yet they are\nconstrained by the sequential interdependence of communication and computation\noperations. To address this limitation, we present a novel shortcut-connected\nMoE (ScMoE) architecture with an overlapping parallel strategy, which\neffectively decouples communication from its conventional sequence, allowing\nfor a substantial overlap of 70% to 100% with computation. 
When compared with\nthe prevalent top-2 MoE architecture, ScMoE demonstrates training speed\nimprovements of 30% and 11%, and inference improvements of 40% and 15%, in our\ndistributed environments with PCIe and NVLink hardware, respectively, where\ncommunication constitutes 60% and 15% of the total MoE time consumption.\nBuilding on the ScMoE architecture, we further implement an expert offloading\nstrategy to facilitate memory-limited inference, optimizing latency through the\noverlap of expert migration. Additionally, extensive experiments and\ntheoretical analyses indicate that ScMoE not only achieves comparable but in\nsome instances surpasses the model quality of existing approaches.\n","authors":["Weilin Cai","Juyong Jiang","Le Qin","Junwei Cui","Sunghun Kim","Jiayi Huang"],"pdf_url":"https://arxiv.org/pdf/2404.05019v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02657v2","updated":"2024-11-01T08:52:18Z","published":"2024-06-04T17:45:26Z","title":"Block Transformer: Global-to-Local Language Modeling for Fast Inference","summary":" We introduce the Block Transformer which adopts hierarchical global-to-local\nmodeling to autoregressive transformers to mitigate the inference bottlenecks\nassociated with self-attention. Self-attention requires the key-value (KV)\ncache of all previous sequences to be retrieved from memory at every decoding\nstep to retrieve context information, leading to two primary bottlenecks during\nbatch inference. First, there is a significant delay in obtaining the first\ntoken, as the information of the entire prompt must first be processed to\nprefill the KV cache. Second, computation of subsequent tokens is bottlenecked\nby the high memory I/O demand of fetching the entire KV cache, which grows\nlinearly with sequence length, incurring quadratic memory reads overall. We\ndesign the Block Transformer to strategically mitigate these costs, by\nincorporating coarsity and locality into an integrated global-to-local\narchitecture. At the lower layers, we aggregate tokens into fixed size blocks\nto apply attention across the entire sequence at coarse-grained detail, to\ncapture the global context while minimizing KV cache overhead. At upper layers,\nwe apply attention within each block to decode individual tokens, to model\nfine-grained details with a lightweight local KV cache. We pretrain vanilla and\nBlock Transformers from scratch and demonstrate that Block Transformers reach\n10--20x inference throughput compared to vanilla transformers with equivalent\nperplexity and zero-shot task performance. Code is available at\nhttps://github.com/itsnamgyu/block-transformer.\n","authors":["Namgyu Ho","Sangmin Bae","Taehyeon Kim","Hyunjik Jo","Yireun Kim","Tal Schuster","Adam Fisch","James Thorne","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2406.02657v2.pdf","comment":"37 pages, 24 figures, 7 tables"},{"id":"http://arxiv.org/abs/2407.16131v2","updated":"2024-11-01T08:25:56Z","published":"2024-07-23T02:31:06Z","title":"CrysToGraph: A Comprehensive Predictive Model for Crystal Materials\n Properties and the Benchmark","summary":" The ionic bonding across the lattice and ordered microscopic structures endow\ncrystals with unique symmetry and determine their macroscopic properties.\nUnconventional crystals, in particular, exhibit non-traditional lattice\nstructures or possess exotic physical properties, making them intriguing\nsubjects for investigation. 
Therefore, to accurately predict the physical and\nchemical properties of crystals, it is crucial to consider long-range orders.\nWhile GNNs excel at capturing the local environment of atoms in crystals, they\noften face challenges in effectively capturing longer-ranged interactions due\nto their limited depth. In this paper, we propose CrysToGraph\n($\\textbf{Crys}$tals with $\\textbf{T}$ransformers $\\textbf{o}$n\n$\\textbf{Graph}$s), a novel transformer-based geometric graph network designed\nspecifically for unconventional crystalline systems, and UnconvBench, a\ncomprehensive benchmark to evaluate models' predictive performance on\nunconventional crystal materials such as defective crystals, low-dimensional\ncrystals, and MOFs. CrysToGraph effectively captures short-range interactions\nwith transformer-based graph convolution blocks as well as long-range\ninteractions with graph-wise transformer blocks. CrysToGraph proves its\neffectiveness in modelling unconventional crystal materials in multiple tasks,\nand moreover, it outperforms most existing methods, achieving new\nstate-of-the-art results on the benchmarks of both unconventional crystals and\ntraditional crystals.\n","authors":["Hongyi Wang","Ji Sun","Jinzhe Liang","Li Zhai","Zitian Tang","Zijian Li","Wei Zhai","Xusheng Wang","Weihao Gao","Sheng Gong"],"pdf_url":"https://arxiv.org/pdf/2407.16131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04234v5","updated":"2024-11-01T08:16:52Z","published":"2023-12-07T11:40:32Z","title":"Graph Convolutions Enrich the Self-Attention in Transformers!","summary":" Transformers, renowned for their self-attention mechanism, have achieved\nstate-of-the-art performance across various tasks in natural language\nprocessing, computer vision, time-series modeling, etc. However, one of the\nchallenges with deep Transformer models is the oversmoothing problem, where\nrepresentations across layers converge to indistinguishable values, leading to\nsignificant performance degradation. We interpret the original self-attention\nas a simple graph filter and redesign it from a graph signal processing (GSP)\nperspective. We propose a graph-filter-based self-attention (GFSA) to learn a\ngeneral yet effective one, whose complexity, however, is slightly larger than\nthat of the original self-attention mechanism. We demonstrate that GFSA\nimproves the performance of Transformers in various fields, including computer\nvision, natural language processing, graph-level tasks, speech recognition, and\ncode classification.\n","authors":["Jeongwhan Choi","Hyowon Wi","Jayoung Kim","Yehjin Shin","Kookjin Lee","Nathaniel Trask","Noseong Park"],"pdf_url":"https://arxiv.org/pdf/2312.04234v5.pdf","comment":"Accepted to NeurIPS 2024. Jeongwhan Choi and Hyowon Wi are co-first\n authors with equal contributions"},{"id":"http://arxiv.org/abs/2410.23994v2","updated":"2024-11-01T07:55:34Z","published":"2024-10-31T14:52:01Z","title":"Breaking Determinism: Fuzzy Modeling of Sequential Recommendation Using\n Discrete State Space Diffusion Model","summary":" Sequential recommendation (SR) aims to predict items that users may be\ninterested in based on their historical behavior sequences. We revisit SR from\na novel information-theoretic perspective and find that conventional sequential\nmodeling methods fail to adequately capture the randomness and unpredictability\nof user behavior. 
Inspired by fuzzy information processing theory, this paper\nintroduces the DDSR model, which uses fuzzy sets of interaction sequences to\novercome the limitations and better capture the evolution of users' real\ninterests. Formally, DDSR is based on diffusion transition processes in discrete state\nspaces, unlike common diffusion models such as DDPM that operate in\ncontinuous domains; it is thus better suited for discrete data, using structured\ntransitions instead of arbitrary noise introduction to avoid information loss.\nAdditionally, to address the inefficiency of matrix transformations due to the\nvast discrete space, we use semantic labels derived from quantization or RQ-VAE\nto replace item IDs, enhancing efficiency and improving cold start issues.\nTesting on three public benchmark datasets shows that DDSR outperforms existing\nstate-of-the-art methods in various settings, demonstrating its potential and\neffectiveness in handling SR tasks.\n","authors":["Wenjia Xie","Hao Wang","Luankang Zhang","Rui Zhou","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2410.23994v2.pdf","comment":"NeurIPS'2024, 10 pages"},{"id":"http://arxiv.org/abs/2306.06190v3","updated":"2024-11-01T07:53:10Z","published":"2023-06-09T18:42:19Z","title":"$FastDoc$: Domain-Specific Fast Continual Pre-training Technique using\n Document-Level Metadata and Taxonomy","summary":" In this paper, we propose $FastDoc$ (Fast Continual Pre-training Technique\nusing Document Level Metadata and Taxonomy), a novel, compute-efficient\nframework that utilizes Document metadata and Domain-Specific Taxonomy as\nsupervision signals to continually pre-train a transformer encoder on a\ndomain-specific corpus. The main innovation is that during domain-specific\npretraining, an open-domain encoder is continually pre-trained using\nsentence-level embeddings as inputs (to accommodate long documents), however,\nfine-tuning is done with token-level embeddings as inputs to this encoder. We\nperform such domain-specific pre-training on three different domains, namely\ncustomer support, scientific, and legal domains, and compare performance on 6\ndifferent downstream tasks and 9 different datasets. The novel use of\ndocument-level supervision along with sentence-level embedding input for\npre-training reduces pre-training compute by around $1,000$, $4,500$, and $500$\ntimes compared to MLM and/or NSP in Customer Support, Scientific, and Legal\nDomains, respectively. The reduced training time does not lead to a\ndeterioration in performance. In fact, we show that $FastDoc$ either outperforms\nor performs on par with several competitive transformer-based baselines in\nterms of character-level F1 scores and other automated metrics in the Customer\nSupport, Scientific, and Legal Domains. Moreover, reduced training aids in\nmitigating the risk of catastrophic forgetting. 
Thus, unlike baselines,\n$FastDoc$ shows a negligible drop in performance on open domain.\n","authors":["Abhilash Nandy","Manav Nitin Kapadnis","Sohan Patnaik","Yash Parag Butala","Pawan Goyal","Niloy Ganguly"],"pdf_url":"https://arxiv.org/pdf/2306.06190v3.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR), 36\n pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.13752v3","updated":"2024-11-01T07:51:36Z","published":"2024-04-21T19:24:15Z","title":"Adversarial Representation Engineering: A General Model Editing\n Framework for Large Language Models","summary":" Since the rapid development of Large Language Models (LLMs) has achieved\nremarkable success, understanding and rectifying their internal complex\nmechanisms has become an urgent issue. Recent research has attempted to\ninterpret their behaviors through the lens of inner representation. However,\ndeveloping practical and efficient methods for applying these representations\nfor general and flexible model editing remains challenging. In this work, we\nexplore how to leverage insights from representation engineering to guide the\nediting of LLMs by deploying a representation sensor as an editing oracle. We\nfirst identify the importance of a robust and reliable sensor during editing,\nthen propose an Adversarial Representation Engineering (ARE) framework to\nprovide a unified and interpretable approach for conceptual model editing\nwithout compromising baseline performance. Experiments on multiple tasks\ndemonstrate the effectiveness of ARE in various model editing scenarios. Our\ncode and data are available at\nhttps://github.com/Zhang-Yihao/Adversarial-Representation-Engineering.\n","authors":["Yihao Zhang","Zeming Wei","Jun Sun","Meng Sun"],"pdf_url":"https://arxiv.org/pdf/2404.13752v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.19400v4","updated":"2024-11-01T07:20:10Z","published":"2024-10-25T09:01:37Z","title":"Offline Reinforcement Learning with OOD State Correction and OOD Action\n Suppression","summary":" In offline reinforcement learning (RL), addressing the out-of-distribution\n(OOD) action issue has been a focus, but we argue that there exists an OOD\nstate issue that also impairs performance yet has been underexplored. Such an\nissue describes the scenario when the agent encounters states out of the\noffline dataset during the test phase, leading to uncontrolled behavior and\nperformance degradation. To this end, we propose SCAS, a simple yet effective\napproach that unifies OOD state correction and OOD action suppression in\noffline RL. Technically, SCAS achieves value-aware OOD state correction,\ncapable of correcting the agent from OOD states to high-value in-distribution\nstates. Theoretical and empirical results show that SCAS also exhibits the\neffect of suppressing OOD actions. 
On standard offline RL benchmarks, SCAS\nachieves excellent performance without additional hyperparameter tuning.\nMoreover, benefiting from its OOD state correction feature, SCAS demonstrates\nenhanced robustness against environmental perturbations.\n","authors":["Yixiu Mao","Qi Wang","Chen Chen","Yun Qu","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2410.19400v4.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2404.01475v2","updated":"2024-11-01T07:05:33Z","published":"2024-04-01T20:56:25Z","title":"Are large language models superhuman chemists?","summary":" Large language models (LLMs) have gained widespread interest due to their\nability to process human language and perform tasks on which they have not been\nexplicitly trained.\n However, we possess only a limited systematic understanding of the chemical\ncapabilities of LLMs, which would be required to improve models and mitigate\npotential harm. Here, we introduce \"ChemBench,\" an automated framework for\nevaluating the chemical knowledge and reasoning abilities of state-of-the-art\nLLMs against the expertise of chemists.\n We curated more than 2,700 question-answer pairs, evaluated leading open- and\nclosed-source LLMs, and found that the best models outperformed the best human\nchemists in our study on average. However, the models struggle with some basic\ntasks and provide overconfident predictions.\n These findings reveal LLMs' impressive chemical capabilities while\nemphasizing the need for further research to improve their safety and\nusefulness. They also suggest adapting chemistry education and show the value\nof benchmarking frameworks for evaluating LLMs in specific domains.\n","authors":["Adrian Mirza","Nawaf Alampara","Sreekanth Kunchapu","Martiño Ríos-García","Benedict Emoekabu","Aswanth Krishnan","Tanya Gupta","Mara Schilling-Wilhelmi","Macjonathan Okereke","Anagha Aneesh","Amir Mohammad Elahi","Mehrdad Asgari","Juliane Eberhardt","Hani M. Elbeheiry","María Victoria Gil","Maximilian Greiner","Caroline T. Holick","Christina Glaubitz","Tim Hoffmann","Abdelrahman Ibrahim","Lea C. Klepsch","Yannik Köster","Fabian Alexander Kreth","Jakob Meyer","Santiago Miret","Jan Matthias Peschel","Michael Ringleb","Nicole Roesner","Johanna Schreiber","Ulrich S. Schubert","Leanne M. Stafast","Dinga Wonanke","Michael Pieler","Philippe Schwaller","Kevin Maik Jablonka"],"pdf_url":"https://arxiv.org/pdf/2404.01475v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15665v2","updated":"2024-11-01T06:57:43Z","published":"2024-10-21T06:09:30Z","title":"Long Term Memory: The Foundation of AI Self-Evolution","summary":" Large language models (LLMs) like GPTs, trained on vast datasets, have\ndemonstrated impressive capabilities in language understanding, reasoning, and\nplanning, achieving human-level performance in various tasks. Most studies\nfocus on enhancing these models by training on ever-larger datasets to build\nmore powerful foundation models. While training stronger models is important,\nenabling models to evolve during inference is equally crucial, a process we\nrefer to as AI self-evolution. Unlike large-scale training, self-evolution may\nrely on limited data or interactions. Inspired by the columnar organization of\nthe human cerebral cortex, we hypothesize that AI models could develop\ncognitive abilities and build internal representations through iterative\ninteractions with their environment. To achieve this, models need long-term\nmemory (LTM) to store and manage processed interaction data. 
LTM supports\nself-evolution by representing diverse experiences across environments and\nagents. In this report, we explore AI self-evolution and its potential to\nenhance models during inference. We examine LTM's role in lifelong learning,\nallowing models to evolve based on accumulated interactions. We outline the\nstructure of LTM and the systems needed for effective data retention and\nrepresentation. We also classify approaches for building personalized models\nwith LTM data and show how these models achieve self-evolution through\ninteraction. Using LTM, our multi-agent framework OMNE achieved first place on\nthe GAIA benchmark, demonstrating LTM's potential for AI self-evolution.\nFinally, we present a roadmap for future research, emphasizing the importance\nof LTM for advancing AI technology and its practical applications.\n","authors":["Xun Jiang","Feng Li","Han Zhao","Jiaying Wang","Jun Shao","Shihao Xu","Shu Zhang","Weiling Chen","Xavier Tang","Yize Chen","Mengyue Wu","Weizhi Ma","Mengdi Wang","Tianqiao Chen"],"pdf_url":"https://arxiv.org/pdf/2410.15665v2.pdf","comment":"56 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.12553v3","updated":"2024-11-01T06:45:41Z","published":"2024-03-19T08:56:20Z","title":"Pretraining Codomain Attention Neural Operators for Solving Multiphysics\n PDEs","summary":" Existing neural operator architectures face challenges when solving\nmultiphysics problems with coupled partial differential equations (PDEs) due to\ncomplex geometries, interactions between physical variables, and the limited\namounts of high-resolution training data. To address these issues, we propose\nCodomain Attention Neural Operator (CoDA-NO), which tokenizes functions along\nthe codomain or channel space, enabling self-supervised learning or pretraining\nof multiple PDE systems. Specifically, we extend positional encoding,\nself-attention, and normalization layers to function spaces. CoDA-NO can learn\nrepresentations of different PDE systems with a single model. We evaluate\nCoDA-NO's potential as a backbone for learning multiphysics PDEs over multiple\nsystems by considering few-shot learning settings. On complex downstream tasks\nwith limited data, such as fluid flow simulations, fluid-structure\ninteractions, and Rayleigh-B\\'enard convection, we found CoDA-NO to outperform\nexisting methods by over 36%.\n","authors":["Md Ashiqur Rahman","Robert Joseph George","Mogab Elleithy","Daniel Leibovici","Zongyi Li","Boris Bonev","Colin White","Julius Berner","Raymond A. Yeh","Jean Kossaifi","Kamyar Azizzadenesheli","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2403.12553v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09345v5","updated":"2024-11-01T06:30:11Z","published":"2024-02-14T17:49:07Z","title":"InfoRM: Mitigating Reward Hacking in RLHF via Information-Theoretic\n Reward Modeling","summary":" Despite the success of reinforcement learning from human feedback (RLHF) in\naligning language models with human values, reward hacking, also termed reward\noveroptimization, remains a critical challenge. This issue primarily arises\nfrom reward misgeneralization, where reward models (RMs) compute reward using\nspurious features that are irrelevant to human preferences. In this work, we\ntackle this problem from an information-theoretic perspective and propose a\nframework for reward modeling, namely InfoRM, by introducing a variational\ninformation bottleneck objective to filter out irrelevant information. 
Notably,\nwe further identify a correlation between overoptimization and outliers in the\nIB latent space of InfoRM, establishing it as a promising tool for detecting\nreward overoptimization. Inspired by this finding, we propose the Cluster\nSeparation Index (CSI), which quantifies deviations in the IB latent space, as\nan indicator of reward overoptimization to facilitate the development of online\nmitigation strategies. Extensive experiments on a wide range of settings and RM\nscales (70M, 440M, 1.4B, and 7B) demonstrate the effectiveness of InfoRM.\nFurther analyses reveal that InfoRM's overoptimization detection mechanism is\nnot only effective but also robust across a broad range of datasets, signifying\na notable advancement in the field of RLHF. The code will be released upon\nacceptance.\n","authors":["Yuchun Miao","Sen Zhang","Liang Ding","Rong Bao","Lefei Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.09345v5.pdf","comment":"The paper has been accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.17673v2","updated":"2024-11-01T06:22:30Z","published":"2024-05-27T21:50:16Z","title":"Fast Samplers for Inverse Problems in Iterative Refinement Models","summary":" Constructing fast samplers for unconditional diffusion and flow-matching\nmodels has received much attention recently; however, existing methods for\nsolving inverse problems, such as super-resolution, inpainting, or deblurring,\nstill require hundreds to thousands of iterative steps to obtain high-quality\nresults. We propose a plug-and-play framework for constructing efficient\nsamplers for inverse problems, requiring only pre-trained diffusion or\nflow-matching models. We present Conditional Conjugate Integrators, which\nleverage the specific form of the inverse problem to project the respective\nconditional diffusion/flow dynamics into a more amenable space for sampling.\nOur method complements popular posterior approximation methods for solving\ninverse problems using diffusion/flow models. We evaluate the proposed method's\nperformance on various linear image restoration tasks across multiple datasets,\nemploying diffusion and flow-matching models. Notably, on challenging inverse\nproblems like 4x super-resolution on the ImageNet dataset, our method can\ngenerate high-quality samples in as few as 5 conditional sampling steps and\noutperforms competing baselines requiring 20-1000 steps. Our code will be\npublicly available at https://github.com/mandt-lab/c-pigdm\n","authors":["Kushagra Pandey","Ruihan Yang","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2405.17673v2.pdf","comment":"43 pages, NeurIPS'24 Camera Ready"},{"id":"http://arxiv.org/abs/2410.01548v2","updated":"2024-11-01T06:12:33Z","published":"2024-10-02T13:37:54Z","title":"In-Context Transfer Learning: Demonstration Synthesis by Transferring\n Similar Tasks","summary":" In-context learning (ICL) is an effective approach to help large language\nmodels (LLMs) adapt to various tasks by providing demonstrations of the target\ntask. Considering the high cost of labeling demonstrations, many methods\npropose synthesizing demonstrations from scratch using LLMs. However, the\nquality of the demonstrations synthesized from scratch is limited by the\ncapabilities and knowledge of LLMs. To address this, inspired by transfer\nlearning, we propose In-Context Transfer Learning (ICTL), which synthesizes\ntarget task demonstrations by transferring labeled demonstrations from similar\nsource tasks. 
ICTL consists of two steps: source sampling and target transfer.\nFirst, we define an optimization objective, which minimizes transfer error to\nsample source demonstrations similar to the target task. Then, we employ LLMs\nto transfer the sampled source demonstrations to the target task, matching the\ndefinition and format of the target task. Experiments on Super-NI show that\nICTL outperforms synthesis from scratch by 2.0% on average, demonstrating the\neffectiveness of our method.\n","authors":["Dingzirui Wang","Xuanliang Zhang","Qiguang Chen","Longxu Dou","Xiao Xu","Rongyu Cao","Yingwei Ma","Qingfu Zhu","Wanxiang Che","Binhua Li","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2410.01548v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11802v3","updated":"2024-11-01T05:14:40Z","published":"2024-10-15T17:23:49Z","title":"FoundTS: Comprehensive and Unified Benchmarking of Foundation Models for\n Time Series Forecasting","summary":" Time Series Forecasting (TSF) is a key functionality in numerous fields,\nincluding finance, weather services, and energy management. While TSF\nmethods are emerging these days, many of them require domain-specific data\ncollection and model training and struggle with poor generalization performance\non new domains. Foundation models aim to overcome this limitation. Pre-trained\non large-scale language or time series data, they exhibit promising inference\ncapabilities on new or unseen data. This has spurred a surge in new TSF\nfoundation models. We propose a new benchmark, FoundTS, to enable thorough and\nfair evaluation and comparison of such models. FoundTS covers a variety of TSF\nfoundation models, including those based on large language models and those\npretrained on time series. Next, FoundTS supports different forecasting\nstrategies, including zero-shot, few-shot, and full-shot, thereby facilitating\nmore thorough evaluations. Finally, FoundTS offers a pipeline that standardizes\nevaluation processes such as dataset splitting, loading, normalization, and\nfew-shot sampling, thereby facilitating fair evaluations. Building on this, we\nreport on an extensive evaluation of TSF foundation models on a broad range of\ndatasets from diverse domains and with different statistical characteristics.\nSpecifically, we identify pros and cons and inherent limitations of existing\nfoundation models, and we identify directions for future model design. We make\nour code and datasets available at\nhttps://anonymous.4open.science/r/FoundTS-C2B0.\n","authors":["Zhe Li","Xiangfei Qiu","Peng Chen","Yihang Wang","Hanyin Cheng","Yang Shu","Jilin Hu","Chenjuan Guo","Aoying Zhou","Qingsong Wen","Christian S. Jensen","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2410.11802v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00986v2","updated":"2024-11-01T05:03:19Z","published":"2024-04-01T08:18:38Z","title":"Make Continual Learning Stronger via C-Flat","summary":" Model generalization ability upon incrementally acquiring dynamically\nupdating knowledge from sequentially arriving tasks is crucial to tackle the\nsensitivity-stability dilemma in Continual Learning (CL). Weight loss landscape\nsharpness minimization, which seeks flat minima lying in neighborhoods with\nuniformly low loss or smooth gradients, is proven to be a strong training regime\nthat improves model generalization compared with loss-minimization-based\noptimizers like SGD. 
Yet only a few works have discussed this training regime for CL,\nshowing that a dedicated zeroth-order sharpness optimizer can improve CL\nperformance. In this work, we propose a Continual Flatness (C-Flat) method\nfeaturing a flatter loss landscape tailored for CL. C-Flat could be easily\ncalled with only one line of code and is plug-and-play with any CL method. A\ngeneral framework of C-Flat applied to all CL categories and a thorough\ncomparison with loss-minima optimizers and flat-minima-based CL approaches is\npresented in this paper, showing that our method can boost CL performance in\nalmost all cases. Code is available at https://github.com/WanNaa/C-Flat.\n","authors":["Ang Bian","Wei Li","Hangjie Yuan","Chengrong Yu","Mang Wang","Zixiang Zhao","Aojun Lu","Pengliang Ji","Tao Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05234v3","updated":"2024-11-01T04:34:07Z","published":"2024-02-07T20:14:22Z","title":"QGFN: Controllable Greediness with Action Values","summary":" Generative Flow Networks (GFlowNets; GFNs) are a family of energy-based\ngenerative methods for combinatorial objects, capable of generating diverse and\nhigh-utility samples. However, consistently biasing GFNs towards producing\nhigh-utility samples is non-trivial. In this work, we leverage connections\nbetween GFNs and reinforcement learning (RL) and propose to combine the GFN\npolicy with an action-value estimate, $Q$, to create greedier sampling policies\nwhich can be controlled by a mixing parameter. We show that several variants of\nthe proposed method, QGFN, are able to improve on the number of high-reward\nsamples generated in a variety of tasks without sacrificing diversity.\n","authors":["Elaine Lau","Stephen Zhewen Lu","Ling Pan","Doina Precup","Emmanuel Bengio"],"pdf_url":"https://arxiv.org/pdf/2402.05234v3.pdf","comment":"Accepted by 38th Conference on Neural Information Processing Systems\n (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.23938v2","updated":"2024-11-01T04:28:59Z","published":"2024-10-31T13:52:59Z","title":"Learning Macroscopic Dynamics from Partial Microscopic Observations","summary":" Macroscopic observables of a system are of keen interest in real applications\nsuch as the design of novel materials. Current methods rely on microscopic\ntrajectory simulations, where the forces on all microscopic coordinates need to\nbe computed or measured. However, this can be computationally prohibitive for\nrealistic systems. In this paper, we propose a method to learn macroscopic\ndynamics requiring only force computations on a subset of the microscopic\ncoordinates. Our method relies on a sparsity assumption: the force on each\nmicroscopic coordinate relies only on a small number of other coordinates. The\nmain idea of our approach is to map the training procedure on the macroscopic\ncoordinates back to the microscopic coordinates, on which partial force\ncomputations can be used as stochastic estimation to update model parameters.\nWe provide a theoretical justification of this under suitable conditions. 
We\ndemonstrate the accuracy, force computation efficiency, and robustness of our\nmethod on learning macroscopic closure models from a variety of microscopic\nsystems, including those modeled by partial differential equations or molecular\ndynamics simulations.\n","authors":["Mengyi Chen","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2410.23938v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19207v2","updated":"2024-11-01T04:14:52Z","published":"2024-10-24T23:36:39Z","title":"Equitable Federated Learning with Activation Clustering","summary":" Federated learning is a prominent distributed learning paradigm that\nincorporates collaboration among diverse clients, promotes data locality, and\nthus ensures privacy. These clients have their own technological, cultural, and\nother biases in the process of data generation. However, the present standard\noften ignores this bias/heterogeneity, perpetuating bias against certain groups\nrather than mitigating it. In response to this concern, we propose an equitable\nclustering-based framework where the clients are categorized/clustered based on\nhow similar they are to each other. We propose a unique way to construct the\nsimilarity matrix that uses activation vectors. Furthermore, we propose a\nclient weighing mechanism to ensure that each cluster receives equal importance\nand establish $O(1/\\sqrt{K})$ rate of convergence to reach an\n$\\epsilon-$stationary solution. We assess the effectiveness of our proposed\nstrategy against common baselines, demonstrating its efficacy in terms of\nreducing the bias existing amongst various client clusters and consequently\nameliorating algorithmic bias against specific groups.\n","authors":["Antesh Upadhyay","Abolfazl Hashemi"],"pdf_url":"https://arxiv.org/pdf/2410.19207v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2310.13220v2","updated":"2024-11-01T04:04:44Z","published":"2023-10-20T01:55:34Z","title":"Towards Understanding How Transformers Learn In-context Through a\n Representation Learning Lens","summary":" Pre-trained large language models based on Transformers have demonstrated\nremarkable in-context learning (ICL) abilities. With just a few demonstration\nexamples, the models can implement new tasks without any parameter updates.\nHowever, it is still an open question to understand the mechanism of ICL. In\nthis paper, we attempt to explore the ICL process in Transformers through a\nlens of representation learning. Initially, leveraging kernel methods, we\nfigure out a dual model for one softmax attention layer. The ICL inference\nprocess of the attention layer aligns with the training procedure of its dual\nmodel, generating token representation predictions that are equivalent to the\ndual model's test outputs. We delve into the training process of this dual\nmodel from a representation learning standpoint and further derive a\ngeneralization error bound related to the quantity of demonstration tokens.\nSubsequently, we extend our theoretical conclusions to more complicated\nscenarios, including one Transformer layer and multiple attention layers.\nFurthermore, drawing inspiration from existing representation learning methods\nespecially contrastive learning, we propose potential modifications for the\nattention layer. 
Finally, experiments are designed to support our findings.\n","authors":["Ruifeng Ren","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2310.13220v2.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2405.00957v2","updated":"2024-11-01T03:51:18Z","published":"2024-05-02T02:38:32Z","title":"IntraMix: Intra-Class Mixup Generation for Accurate Labels and Neighbors","summary":" Graph Neural Networks (GNNs) have shown great performance in various tasks,\nwith the core idea of learning from data labels and aggregating messages within\nthe neighborhood of nodes. However, the common challenges in graphs are\ntwofold: insufficient accurate (high-quality) labels and limited neighbors for\nnodes, resulting in weak GNNs. Existing graph augmentation methods typically\naddress only one of these challenges, often adding training costs or relying on\noversimplified or knowledge-intensive strategies, limiting their\ngeneralization. To simultaneously address both challenges faced by graphs in a\ngeneralized way, we propose an elegant method called IntraMix. Considering the\nincompatibility of vanilla Mixup with the complex topology of graphs, IntraMix\ninnovatively employs Mixup among inaccurately labeled data of the same class,\ngenerating high-quality labeled data at minimal cost. Additionally, it finds\ndata with high confidence of being clustered into the same group as the\ngenerated data to serve as their neighbors, thereby enriching the neighborhoods\nof graphs. IntraMix efficiently tackles both issues faced by graphs and\nchallenges the prior notion of the limited effectiveness of Mixup in node\nclassification. IntraMix is a theoretically grounded plug-and-play method that\ncan be readily applied to all GNNs. Extensive experiments demonstrate the\neffectiveness of IntraMix across various GNNs and datasets. Our code is\navailable at: https://github.com/Zhengsh123/IntraMix.\n","authors":["Shenghe Zheng","Hongzhi Wang","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2405.00957v2.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2402.01763v3","updated":"2024-11-01T03:49:59Z","published":"2024-01-30T23:35:28Z","title":"When Large Language Models Meet Vector Databases: A Survey","summary":" This survey explores the synergistic potential of Large Language Models\n(LLMs) and Vector Databases (VecDBs), a burgeoning but rapidly evolving\nresearch area. With the proliferation of LLMs comes a host of challenges,\nincluding hallucinations, outdated knowledge, prohibitive commercial\napplication costs, and memory issues. VecDBs emerge as a compelling solution to\nthese issues by offering an efficient means to store, retrieve, and manage the\nhigh-dimensional vector representations intrinsic to LLM operations. Through\nthis nuanced review, we delineate the foundational principles of LLMs and\nVecDBs and critically analyze their integration's impact on enhancing LLM\nfunctionalities. 
This discourse extends into a discussion on the speculative\nfuture developments in this domain, aiming to catalyze further research into\noptimizing the confluence of LLMs and VecDBs for advanced data handling and\nknowledge extraction capabilities.\n","authors":["Zhi Jing","Yongye Su","Yikun Han"],"pdf_url":"https://arxiv.org/pdf/2402.01763v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23858v2","updated":"2024-11-01T03:49:57Z","published":"2024-10-31T12:07:52Z","title":"Neural Network Matrix Product Operator: A Multi-Dimensionally Integrable\n Machine Learning Potential","summary":" A neural network-based machine learning potential energy surface (PES)\nexpressed in a matrix product operator (NN-MPO) is proposed. The MPO form\nenables efficient evaluation of high-dimensional integrals that arise in\nsolving the time-dependent and time-independent Schr\\\"odinger equation and\neffectively overcomes the so-called curse of dimensionality. This starkly\ncontrasts with other neural network-based machine learning PES methods, such as\nmulti-layer perceptrons (MLPs), where evaluating high-dimensional integrals is\nnot straightforward due to the fully connected topology in their backbone\narchitecture. Nevertheless, the NN-MPO retains the high representational\ncapacity of neural networks. NN-MPO can achieve spectroscopic accuracy with a\ntest mean absolute error (MAE) of 3.03 cm$^{-1}$ for a fully coupled\nsix-dimensional ab initio PES, using only 625 training points distributed\nacross a 0 to 17,000 cm$^{-1}$ energy range. Our Python implementation is\navailable at https://github.com/KenHino/Pompon.\n","authors":["Kentaro Hino","Yuki Kurashige"],"pdf_url":"https://arxiv.org/pdf/2410.23858v2.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.02428v3","updated":"2024-11-01T03:47:51Z","published":"2024-09-04T04:15:14Z","title":"Large Language Models as Efficient Reward Function Searchers for\n Custom-Environment Multi-Objective Reinforcement Learning","summary":" Achieving the effective design and improvement of reward functions in\nreinforcement learning (RL) tasks with complex custom environments and multiple\nrequirements presents considerable challenges. In this paper, we propose ERFSL,\nan efficient reward function searcher using LLMs, which enables LLMs to be\neffective white-box searchers and highlights their advanced semantic\nunderstanding capabilities. Specifically, we generate reward components for\neach numerically explicit user requirement and employ a reward critic to\nidentify the correct code form. Then, LLMs assign weights to the reward\ncomponents to balance their values and iteratively adjust the weights without\nambiguity and redundant adjustments by flexibly adopting directional mutation\nand crossover strategies, similar to genetic algorithms, based on the context\nprovided by the training log analyzer. We applied the framework to an\nunderwater data collection RL task without direct human feedback or reward\nexamples (zero-shot learning). The reward critic successfully corrects the\nreward code with only one feedback instance for each requirement, effectively\npreventing unrectifiable errors. The initialization of weights enables the\nacquisition of different reward functions within the Pareto solution set\nwithout the need for weight search. Even in cases where a weight is 500 times\noff, on average, only 5.2 iterations are needed to meet user requirements. 
The\nERFSL also works well with most prompts utilizing GPT-4o mini, as we decompose\nthe weight searching process to reduce the requirement for numerical and\nlong-context understanding capabilities.\n","authors":["Guanwen Xie","Jingzehua Xu","Yiyuan Yang","Yimian Ding","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02428v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08159v3","updated":"2024-11-01T03:44:33Z","published":"2024-07-11T03:25:40Z","title":"Model-agnostic clean-label backdoor mitigation in cybersecurity\n environments","summary":" The training phase of machine learning models is a delicate step, especially\nin cybersecurity contexts. Recent research has surfaced a series of insidious\ntraining-time attacks that inject backdoors in models designed for security\nclassification tasks without altering the training labels. With this work, we\npropose new techniques that leverage insights in cybersecurity threat models to\neffectively mitigate these clean-label poisoning attacks, while preserving the\nmodel utility. By performing density-based clustering on a carefully chosen\nfeature subspace, and progressively isolating the suspicious clusters through a\nnovel iterative scoring procedure, our defensive mechanism can mitigate the\nattacks without requiring many of the common assumptions in the existing\nbackdoor defense literature. To show the generality of our proposed mitigation,\nwe evaluate it on two clean-label model-agnostic attacks on two different\nclassic cybersecurity data modalities: network flows classification and malware\nclassification, using gradient boosting and neural network models.\n","authors":["Giorgio Severi","Simona Boboila","John Holodnak","Kendra Kratkiewicz","Rauf Izmailov","Michael J. De Lucia","Alina Oprea"],"pdf_url":"https://arxiv.org/pdf/2407.08159v3.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.04501v3","updated":"2024-11-01T03:42:37Z","published":"2024-10-06T14:45:01Z","title":"Leveraging Large Language Models for Suicide Detection on Social Media\n with Limited Labels","summary":" The increasing frequency of suicidal thoughts highlights the importance of\nearly detection and intervention. Social media platforms, where users often\nshare personal experiences and seek help, could be utilized to identify\nindividuals at risk. However, the large volume of daily posts makes manual\nreview impractical. This paper explores the use of Large Language Models (LLMs)\nto automatically detect suicidal content in text-based social media posts. We\npropose a novel method for generating pseudo-labels for unlabeled data by\nprompting LLMs, along with traditional classification fine-tuning techniques to\nenhance label accuracy. To create a strong suicide detection model, we develop\nan ensemble approach involving prompting with Qwen2-72B-Instruct, and using\nfine-tuned models such as Llama3-8B, Llama3.1-8B, and Gemma2-9B. We evaluate\nour approach on the dataset of the Suicide Ideation Detection on Social Media\nChallenge, a track of the IEEE Big Data 2024 Big Data Cup. Additionally, we\nconduct a comprehensive analysis to assess the impact of different models and\nfine-tuning strategies on detection performance. Experimental results show that\nthe ensemble model significantly improves the detection accuracy by 5 percentage\npoints compared with the individual models. 
It achieves a weighted F1 score of 0.770 on\nthe public test set, and 0.731 on the private test set, providing a promising\nsolution for identifying suicidal content in social media. Our analysis shows\nthat the choice of LLMs affects the prompting performance, with larger models\nproviding better accuracy. Our code and checkpoints are publicly available at\nhttps://github.com/khanhvynguyen/Suicide_Detection_LLMs.\n","authors":["Vy Nguyen","Chau Pham"],"pdf_url":"https://arxiv.org/pdf/2410.04501v3.pdf","comment":"Accepted at IEEE International Conference on Big Data 2024"},{"id":"http://arxiv.org/abs/2405.16978v3","updated":"2024-11-01T03:40:24Z","published":"2024-05-27T09:21:40Z","title":"OSLO: One-Shot Label-Only Membership Inference Attacks","summary":" We introduce One-Shot Label-Only (OSLO) membership inference attacks (MIAs),\nwhich accurately infer a given sample's membership in a target model's training\nset with high precision using just \emph{a single query}, where the target\nmodel only returns the predicted hard label. This is in contrast to\nstate-of-the-art label-only attacks which require $\sim6000$ queries, yet get\nattack precisions lower than OSLO's. OSLO leverages transfer-based black-box\nadversarial attacks. The core idea is that a member sample exhibits more\nresistance to adversarial perturbations than a non-member. We compare OSLO\nagainst state-of-the-art label-only attacks and demonstrate that, despite\nrequiring only one query, our method significantly outperforms previous attacks\nin terms of precision and true positive rate (TPR) under the same false\npositive rates (FPR). For example, compared to previous label-only MIAs, OSLO\nachieves a TPR that is at least 7$\times$ higher under a 1\% FPR and at least\n22$\times$ higher under a 0.1\% FPR on CIFAR100 for a ResNet18 model. We\nevaluated multiple defense mechanisms against OSLO.\n","authors":["Yuefeng Peng","Jaechul Roh","Subhransu Maji","Amir Houmansadr"],"pdf_url":"https://arxiv.org/pdf/2405.16978v3.pdf","comment":"To appear at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24184v2","updated":"2024-11-01T03:29:29Z","published":"2024-10-31T17:47:01Z","title":"Group Crosscoders for Mechanistic Analysis of Symmetry","summary":" We introduce group crosscoders, an extension of crosscoders that\nsystematically discover and analyse symmetrical features in neural networks.\nWhile neural networks often develop equivariant representations without\nexplicit architectural constraints, understanding these emergent symmetries has\ntraditionally relied on manual analysis. Group crosscoders automate this\nprocess by performing dictionary learning across transformed versions of inputs\nunder a symmetry group. Applied to InceptionV1's mixed3b layer using the\ndihedral group $\mathrm{D}_{32}$, our method reveals several key insights:\nFirst, it naturally clusters features into interpretable families that\ncorrespond to previously hypothesised feature types, providing more precise\nseparation than standard sparse autoencoders. Second, our transform block\nanalysis enables the automatic characterisation of feature symmetries,\nrevealing how different geometric features (such as curves versus lines)\nexhibit distinct patterns of invariance and equivariance. 
These results\ndemonstrate that group crosscoders can provide systematic insights into how\nneural networks represent symmetry, offering a promising new tool for\nmechanistic interpretability.\n","authors":["Liv Gorton"],"pdf_url":"https://arxiv.org/pdf/2410.24184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19878v2","updated":"2024-11-01T03:26:07Z","published":"2024-10-24T13:58:59Z","title":"Parameter-Efficient Fine-Tuning in Large Models: A Survey of\n Methodologies","summary":" The large models, as predicted by scaling law forecasts, have made\ngroundbreaking progress in many fields, particularly in natural language\ngeneration tasks, where they have approached or even surpassed human levels.\nHowever, the unprecedented scale of their parameters brings significant\ncomputational and storage costs. These large models require substantial\ncomputational resources and GPU memory to operate. When adapting large models\nto specific downstream tasks, their massive parameter scale poses a significant\nchallenge in fine-tuning on hardware platforms with limited computational power\nand GPU memory. To address this issue, Parameter-Efficient Fine-Tuning (PEFT)\noffers a practical solution by efficiently adjusting the parameters of large\npre-trained models to suit various downstream tasks. Specifically, PEFT adjusts\nthe parameters of pre-trained large models to adapt to specific tasks or\ndomains, minimizing the introduction of additional parameters and the\ncomputational resources required. This review mainly introduces the preliminary\nknowledge of PEFT, the core ideas and principles of various PEFT algorithms,\nthe applications of PEFT, and potential future research directions. By reading\nthis review, we believe that interested parties can quickly grasp the PEFT\nmethodology, thereby accelerating its development and innovation.\n","authors":["Luping Wang","Sheng Chen","Linnan Jiang","Shu Pan","Runze Cai","Sen Yang","Fei Yang"],"pdf_url":"https://arxiv.org/pdf/2410.19878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15711v2","updated":"2024-11-01T03:17:03Z","published":"2024-09-24T03:59:32Z","title":"Adversarial Federated Consensus Learning for Surface Defect\n Classification Under Data Heterogeneity in IIoT","summary":" The challenge of data scarcity hinders the application of deep learning in\nindustrial surface defect classification (SDC), as it is difficult to collect\nand centralize sufficient training data from various entities in Industrial\nInternet of Things (IIoT) due to privacy concerns. Federated learning (FL)\nprovides a solution by enabling collaborative global model training across\nclients while maintaining privacy. However, performance may suffer due to data\nheterogeneity, i.e., discrepancies in data distributions among clients. In this paper,\nwe propose a novel personalized FL (PFL) approach, named Adversarial Federated\nConsensus Learning (AFedCL), for the challenge of data heterogeneity across\ndifferent clients in SDC. First, we develop a dynamic consensus construction\nstrategy to mitigate the performance degradation caused by data heterogeneity.\nThrough adversarial training, local models from different clients utilize the\nglobal model as a bridge to achieve distribution alignment, alleviating the\nproblem of global knowledge forgetting. Complementing this strategy, we propose\na consensus-aware aggregation mechanism. 
It assigns aggregation weights to\ndifferent clients based on their efficacy in global knowledge learning, thereby\nenhancing the global model's generalization capabilities. Finally, we design an\nadaptive feature fusion module to further enhance global knowledge utilization\nefficiency. Personalized fusion weights are gradually adjusted for each client\nto optimally balance global and local features. Compared with state-of-the-art\nFL methods like FedALA, the proposed AFedCL method achieves an accuracy\nincrease of up to 5.67% on three SDC datasets.\n","authors":["Jixuan Cui","Jun Li","Zhen Mei","Yiyang Ni","Wen Chen","Zengxiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.15711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04391v8","updated":"2024-11-01T02:49:24Z","published":"2023-02-09T01:09:57Z","title":"The Re-Label Method For Data-Centric Machine Learning","summary":" In industrial deep learning applications, manually labeled data contains a\ncertain amount of noisy data. To solve this problem and achieve a score above\n90 on the dev dataset, we present a simple method to find the noisy data and\nhave humans re-label it, given the model predictions as references during human\nlabeling. In this paper, we illustrate our idea for a broad set of deep\nlearning tasks, including classification, sequence tagging, object detection,\nsequence generation, and click-through rate prediction. The dev dataset\nevaluation results and human evaluation results verify our idea.\n","authors":["Tong Guo"],"pdf_url":"https://arxiv.org/pdf/2302.04391v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23749v2","updated":"2024-11-01T02:47:29Z","published":"2024-10-31T09:09:39Z","title":"LSEAttention is All You Need for Time Series Forecasting","summary":" Transformer-based architectures have achieved remarkable success in natural\nlanguage processing and computer vision. However, their performance in\nmultivariate long-term forecasting often lags behind simpler linear baselines.\nPrevious studies have identified the traditional attention mechanism as a\nsignificant factor contributing to this limitation. To unlock the full\npotential of transformers for multivariate time series forecasting, I introduce\n\textbf{LSEAttention}, an approach designed to address entropy collapse and\ntraining instability commonly observed in transformer models. I validate the\neffectiveness of LSEAttention across various real-world multivariate time\nseries datasets, demonstrating that it not only outperforms existing time\nseries transformer models but also exceeds the performance of some\nstate-of-the-art models on specific datasets.\n","authors":["Dizhen Liang"],"pdf_url":"https://arxiv.org/pdf/2410.23749v2.pdf","comment":"7 pages with referencing, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2409.17508v2","updated":"2024-11-01T02:38:53Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization in MLLMs, recent advances primarily focus on improving\nthe LLM components, while neglecting the connector that bridges the gap between\nmodalities. 
In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves an efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector in MLLMs. Extensive ablation\nexperiments validate the effectiveness of introducing CMoE under any\nconfiguration, with average performance gains of up to 8%. We further provide\ninterpretation analysis of the tug-of-war problem from the perspective of\ngradient optimization and parameter statistics. Compared to previous\nstate-of-the-art medical MLLMs, Uni-Med achieves competitive or superior\nevaluation metrics on diverse tasks. Code and resources are available at\nhttps://github.com/tsinghua-msiip/Uni-Med.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14909v2","updated":"2024-11-01T02:26:18Z","published":"2024-06-21T06:58:37Z","title":"MoA: Mixture of Sparse Attention for Automatic Large Language Model\n Compression","summary":" Sparse attention can effectively mitigate the significant memory and\nthroughput demands of Large Language Models (LLMs) in long contexts. Existing\nmethods typically employ a uniform sparse attention mask, applying the same\nsparse pattern across different attention heads and input lengths. However,\nthis uniform approach fails to capture the diverse attention patterns inherent\nin LLMs, ignoring their distinct accuracy-latency trade-offs. To address this\nchallenge, we propose the Mixture of Attention (MoA), which automatically\ntailors distinct sparse attention configurations to different heads and layers.\nMoA constructs and navigates a search space of various attention patterns and\ntheir scaling rules relative to input sequence lengths. It profiles the model,\nevaluates potential configurations, and pinpoints the optimal sparse attention\ncompression plan. MoA adapts to varying input sizes, revealing that some\nattention heads expand their focus to accommodate longer sequences, while other\nheads consistently concentrate on fixed-length local contexts. Experiments show\nthat MoA increases the effective context length by $3.9\times$ with the same\naverage attention span, boosting retrieval accuracy by $1.5-7.1\times$ over the\nuniform-attention baseline across Vicuna-{7B,13B} and Llama3-{8B,70B} models.\nMoreover, MoA narrows the capability gaps between sparse and dense models,\nreducing the maximum relative performance drop from $9\%-36\%$ to within $5\%$\nacross two long-context understanding benchmarks. MoA achieves a\n$1.2-1.4\times$ GPU memory reduction, boosting decode throughput by\n$6.6-8.2\times$ and $1.7-1.9\times$ compared to FlashAttention2 and vLLM, with\nminimal impact on performance. 
Our code is available at\n\\url{https://github.com/thu-nics/MoA}.\n","authors":["Tianyu Fu","Haofeng Huang","Xuefei Ning","Genghan Zhang","Boju Chen","Tianqi Wu","Hongyi Wang","Zixiao Huang","Shiyao Li","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2406.14909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18451v3","updated":"2024-11-01T02:13:59Z","published":"2024-06-26T16:00:35Z","title":"Detecting Brittle Decisions for Free: Leveraging Margin Consistency in\n Deep Robust Classifiers","summary":" Despite extensive research on adversarial training strategies to improve\nrobustness, the decisions of even the most robust deep learning models can\nstill be quite sensitive to imperceptible perturbations, creating serious risks\nwhen deploying them for high-stakes real-world applications. While detecting\nsuch cases may be critical, evaluating a model's vulnerability at a\nper-instance level using adversarial attacks is computationally too intensive\nand unsuitable for real-time deployment scenarios. The input space margin is\nthe exact score to detect non-robust samples and is intractable for deep neural\nnetworks. This paper introduces the concept of margin consistency -- a property\nthat links the input space margins and the logit margins in robust models --\nfor efficient detection of vulnerable samples. First, we establish that margin\nconsistency is a necessary and sufficient condition to use a model's logit\nmargin as a score for identifying non-robust samples. Next, through\ncomprehensive empirical analysis of various robustly trained models on CIFAR10\nand CIFAR100 datasets, we show that they indicate high margin consistency with\na strong correlation between their input space margins and the logit margins.\nThen, we show that we can effectively and confidently use the logit margin to\ndetect brittle decisions with such models. Finally, we address cases where the\nmodel is not sufficiently margin-consistent by learning a pseudo-margin from\nthe feature representation. Our findings highlight the potential of leveraging\ndeep representations to assess adversarial vulnerability in deployment\nscenarios efficiently.\n","authors":["Jonas Ngnawé","Sabyasachi Sahoo","Yann Pequignot","Frédéric Precioso","Christian Gagné"],"pdf_url":"https://arxiv.org/pdf/2406.18451v3.pdf","comment":"10 pages, 6 figures, 2 tables. Version Update: Neurips Camera Ready"},{"id":"http://arxiv.org/abs/2006.11444v2","updated":"2024-11-01T02:12:36Z","published":"2020-06-20T00:17:44Z","title":"Optimizing Monotone Chance-Constrained Submodular Functions Using\n Evolutionary Multi-Objective Algorithms","summary":" Many real-world optimization problems can be stated in terms of submodular\nfunctions. Furthermore, these real-world problems often involve uncertainties\nwhich may lead to the violation of given constraints. A lot of evolutionary\nmulti-objective algorithms following the Pareto optimization approach have\nrecently been analyzed and applied to submodular problems with different types\nof constraints. We present a first runtime analysis of evolutionary\nmulti-objective algorithms based on Pareto optimization for chance-constrained\nsubmodular functions. Here the constraint involves stochastic components and\nthe constraint can only be violated with a small probability of alpha. We\ninvestigate the classical GSEMO algorithm for two different bi-objective\nformulations using tail bounds to determine the feasibility of solutions. 
We\nshow that the algorithm GSEMO obtains the same worst-case performance\nguarantees for monotone submodular functions as recently analyzed greedy\nalgorithms for the case of uniform IID weights and uniformly distributed\nweights with the same dispersion when using the appropriate bi-objective\nformulation. As part of our investigations, we also point out situations where\nthe use of tail bounds in the first bi-objective formulation can prevent GSEMO\nfrom obtaining good solutions in the case of uniformly distributed weights with\nthe same dispersion if the objective function is submodular but non-monotone\ndue to a single element impacting monotonicity. Furthermore, we investigate the\nbehavior of the evolutionary multi-objective algorithms GSEMO, NSGA-II and\nSPEA2 on different submodular chance-constrained network problems. Our\nexperimental results show that the use of evolutionary multi-objective\nalgorithms leads to significant performance improvements compared to\nstate-of-the-art greedy algorithms for submodular optimization.\n","authors":["Aneta Neumann","Frank Neumann"],"pdf_url":"https://arxiv.org/pdf/2006.11444v2.pdf","comment":"To appear in the Evolutionary Computation Journal 2024"},{"id":"http://arxiv.org/abs/2205.10287v3","updated":"2024-11-01T02:01:18Z","published":"2022-05-20T16:39:03Z","title":"On the SDEs and Scaling Rules for Adaptive Gradient Algorithms","summary":" Approximating Stochastic Gradient Descent (SGD) as a Stochastic Differential\nEquation (SDE) has allowed researchers to enjoy the benefits of studying a\ncontinuous optimization trajectory while carefully preserving the stochasticity\nof SGD. Analogous study of adaptive gradient methods, such as RMSprop and Adam,\nhas been challenging because there were no rigorously proven SDE approximations\nfor these methods. This paper derives the SDE approximations for RMSprop and\nAdam, giving theoretical guarantees of their correctness as well as\nexperimental validation of their applicability to common large-scale vision\nand language settings. A key practical result is the derivation of a\n$\textit{square root scaling rule}$ to adjust the optimization hyperparameters\nof RMSprop and Adam when changing batch size, and its empirical validation in\ndeep learning settings.\n","authors":["Sadhika Malladi","Kaifeng Lyu","Abhishek Panigrahi","Sanjeev Arora"],"pdf_url":"https://arxiv.org/pdf/2205.10287v3.pdf","comment":"revised for correcting errors in some figures"},{"id":"http://arxiv.org/abs/2405.00636v2","updated":"2024-11-01T01:55:41Z","published":"2024-05-01T17:04:20Z","title":"Robustness of graph embedding methods for community detection","summary":" This study investigates the robustness of graph embedding methods for\ncommunity detection in the face of network perturbations, specifically edge\ndeletions. Graph embedding techniques, which represent nodes as low-dimensional\nvectors, are widely used for various graph machine learning tasks due to their\nability to capture structural properties of networks effectively. However, the\nimpact of perturbations on the performance of these methods remains relatively\nunderstudied. The research considers state-of-the-art graph embedding methods\nfrom two families: matrix factorization (e.g., LE, LLE, HOPE, M-NMF) and random\nwalk-based (e.g., DeepWalk, LINE, node2vec). Through experiments conducted on\nboth synthetic and real-world networks, the study reveals varying degrees of\nrobustness within each family of graph embedding methods. 
The robustness is\nfound to be influenced by factors such as network size, initial community\npartition strength, and the type of perturbation. Notably, node2vec and LLE\nconsistently demonstrate higher robustness for community detection across\ndifferent scenarios, including networks with degree and community size\nheterogeneity. These findings highlight the importance of selecting an\nappropriate graph embedding method based on the specific characteristics of the\nnetwork and the task at hand, particularly in scenarios where robustness to\nperturbations is crucial.\n","authors":["Zhi-Feng Wei","Pablo Moriano","Ramakrishnan Kannan"],"pdf_url":"https://arxiv.org/pdf/2405.00636v2.pdf","comment":"17 pages, 26 figures, 3 tables. Comments are welcome"},{"id":"http://arxiv.org/abs/2406.06976v2","updated":"2024-11-01T01:51:35Z","published":"2024-06-11T06:16:33Z","title":"Discrete Dictionary-based Decomposition Layer for Structured\n Representation Learning","summary":" Neuro-symbolic neural networks have been extensively studied to integrate\nsymbolic operations with neural networks, thereby improving systematic\ngeneralization. Specifically, the Tensor Product Representation (TPR) framework\nenables neural networks to perform differentiable symbolic operations by\nencoding the symbolic structure of data within vector spaces. However,\nTPR-based neural networks often struggle to decompose unseen data into\nstructured TPR representations, undermining their symbolic operations. To\naddress this decomposition problem, we propose a Discrete Dictionary-based\nDecomposition (D3) layer designed to enhance the decomposition capabilities of\nTPR-based models. D3 employs discrete, learnable key-value dictionaries trained\nto capture symbolic features essential for decomposition operations. It\nleverages the prior knowledge acquired during training to generate structured\nTPR representations by mapping input data to pre-learned symbolic features\nwithin these dictionaries. D3 is a straightforward drop-in layer that can be\nseamlessly integrated into any TPR-based model without modifications. Our\nexperimental results demonstrate that D3 significantly improves the systematic\ngeneralization of various TPR-based models while requiring fewer additional\nparameters. Notably, D3 outperforms baseline models on the synthetic task that\ndemands the systematic decomposition of unseen combinatorial data.\n","authors":["Taewon Park","Hyun-Chul Kim","Minho Lee"],"pdf_url":"https://arxiv.org/pdf/2406.06976v2.pdf","comment":"Published in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.18781v2","updated":"2024-11-01T01:45:27Z","published":"2024-05-29T05:41:28Z","title":"On the Role of Attention Masks and LayerNorm in Transformers","summary":" Self-attention is the key mechanism of transformers, which are the essential\nbuilding blocks of modern foundation models. Recent studies have shown that\npure self-attention suffers from an increasing degree of rank collapse as depth\nincreases, limiting model expressivity and further utilization of model depth.\nThe existing literature on rank collapse, however, has mostly overlooked other\ncritical components in transformers that may alleviate the rank collapse issue.\nIn this paper, we provide a general analysis of rank collapse under\nself-attention, taking into account the effects of attention masks and layer\nnormalization (LayerNorm). 
In particular, we find that although pure masked\nattention still suffers from exponential collapse to a rank one subspace,\nsparse or local masked attention can provably slow down the collapse rate. In\nthe case of self-attention with LayerNorm, we first show that for certain\nclasses of value matrices, collapse to a rank one subspace still happens\nexponentially. However, through construction of nontrivial counterexamples, we\nthen establish that with proper choice of value matrices, a general class of\nsequences may not converge to a rank one subspace, and the self-attention\ndynamics with LayerNorm can simultaneously possess a rich set of equilibria\nwith any possible rank between one and full. Our result refutes the previous\nhypothesis that LayerNorm plays no role in the rank collapse of self-attention\nand suggests that self-attention with LayerNorm constitutes a much more\nexpressive, versatile nonlinear dynamical system than what was originally\nthought.\n","authors":["Xinyi Wu","Amir Ajorlou","Yifei Wang","Stefanie Jegelka","Ali Jadbabaie"],"pdf_url":"https://arxiv.org/pdf/2405.18781v2.pdf","comment":"NeurIPS 2024. Fixed errors in v1 and added new remarks"},{"id":"http://arxiv.org/abs/2410.20595v2","updated":"2024-11-01T01:27:10Z","published":"2024-10-27T21:02:37Z","title":"A Framework for Real-Time Volcano-Seismic Event Recognition Based on\n Multi-Station Seismograms and Semantic Segmentation Models","summary":" In volcano monitoring, effective recognition of seismic events is essential\nfor understanding volcanic activity and raising timely warning alerts.\nTraditional methods rely on manual analysis, which can be subjective and\nlabor-intensive. Furthermore, current automatic approaches often tackle\ndetection and classification separately, mostly rely on single-station\ninformation and generally require tailored preprocessing and representations to\nperform predictions. These limitations often hinder their application to\nreal-time monitoring and utilization across different volcano conditions. This\nstudy introduces a novel approach that utilizes Semantic Segmentation models to\nautomate seismic event recognition by applying a straightforward\ntransformation of multi-channel 1D signals into 2D representations, enabling\ntheir use as images. Our framework employs a data-driven, end-to-end design\nthat integrates multi-station seismic data with minimal preprocessing,\nperforming both detection and classification simultaneously for five seismic\nevent classes. We evaluated four state-of-the-art segmentation models (UNet,\nUNet++, DeepLabV3+ and SwinUNet) on approximately 25,000 seismic events\nrecorded at four different Chilean volcanoes: Nevados del Chill\'an Volcanic\nComplex, Laguna del Maule, Villarrica and Puyehue-Cord\'on Caulle. Among these\nmodels, the UNet architecture was identified as the most effective,\nachieving mean F1 and Intersection over Union (IoU) scores of up to 0.91 and\n0.88, respectively, and demonstrating superior noise robustness and model\nflexibility to unseen volcano datasets.\n","authors":["Camilo Espinosa-Curilem","Millaray Curilem","Daniel Basualto"],"pdf_url":"https://arxiv.org/pdf/2410.20595v2.pdf","comment":"10 pages, 9 figures. 
This is a pre-print; it is currently under\n review for publication"},{"id":"http://arxiv.org/abs/2406.05972v2","updated":"2024-11-01T00:50:56Z","published":"2024-06-10T02:14:19Z","title":"Decision-Making Behavior Evaluation Framework for LLMs under Uncertain\n Context","summary":" When making decisions under uncertainty, individuals often deviate from\nrational behavior, which can be evaluated across three dimensions: risk\npreference, probability weighting, and loss aversion. Given the widespread use\nof large language models (LLMs) in decision-making processes, it is crucial to\nassess whether their behavior aligns with human norms and ethical expectations\nor exhibits potential biases. Several empirical studies have investigated the\nrationality and social behavior performance of LLMs, yet their internal\ndecision-making tendencies and capabilities remain inadequately understood.\nThis paper proposes a framework, grounded in behavioral economics, to evaluate\nthe decision-making behaviors of LLMs. Through a multiple-choice-list\nexperiment, we estimate the degree of risk preference, probability weighting,\nand loss aversion in a context-free setting for three commercial LLMs:\nChatGPT-4.0-Turbo, Claude-3-Opus, and Gemini-1.0-pro. Our results reveal that\nLLMs generally exhibit patterns similar to humans, such as risk aversion and\nloss aversion, with a tendency to overweight small probabilities. However,\nthere are significant variations in the degree to which these behaviors are\nexpressed across different LLMs. We also explore their behavior when embedded\nwith socio-demographic features, uncovering significant disparities. For\ninstance, when modeled with attributes of sexual minority groups or physical\ndisabilities, Claude-3-Opus displays increased risk aversion, leading to more\nconservative choices. These findings underscore the need for careful\nconsideration of the ethical implications and potential biases in deploying\nLLMs in decision-making scenarios. Therefore, this study advocates for\ndeveloping standards and guidelines to ensure that LLMs operate within ethical\nboundaries while enhancing their utility in complex decision-making\nenvironments.\n","authors":["Jingru Jia","Zehua Yuan","Junhao Pan","Paul E. McNamara","Deming Chen"],"pdf_url":"https://arxiv.org/pdf/2406.05972v2.pdf","comment":"Jingru Jia and Zehua Yuan contributed equally"},{"id":"http://arxiv.org/abs/2406.09371v2","updated":"2024-11-01T00:22:26Z","published":"2024-06-13T17:51:00Z","title":"LRM-Zero: Training Large Reconstruction Models with Synthesized Data","summary":" We present LRM-Zero, a Large Reconstruction Model (LRM) trained entirely on\nsynthesized 3D data, achieving high-quality sparse-view 3D reconstruction. The\ncore of LRM-Zero is our procedural 3D dataset, Zeroverse, which is\nautomatically synthesized from simple primitive shapes with random texturing\nand augmentations (e.g., height fields, boolean differences, and wireframes).\nUnlike previous 3D datasets (e.g., Objaverse) which are often captured or\ncrafted by humans to approximate real 3D data, Zeroverse completely ignores\nrealistic global semantics but is rich in complex geometric and texture details\nthat are locally similar to or even more intricate than real objects. We\ndemonstrate that our LRM-Zero, trained with our fully synthesized Zeroverse,\ncan achieve high visual quality in the reconstruction of real-world objects,\ncompetitive with models trained on Objaverse. 
We also analyze several critical\ndesign choices of Zeroverse that contribute to LRM-Zero's capability and\ntraining stability. Our work demonstrates that 3D reconstruction, one of the\ncore tasks in 3D vision, can potentially be addressed without the semantics of\nreal-world objects. The Zeroverse's procedural synthesis code and interactive\nvisualization are available at: https://desaixie.github.io/lrm-zero/.\n","authors":["Desai Xie","Sai Bi","Zhixin Shu","Kai Zhang","Zexiang Xu","Yi Zhou","Sören Pirk","Arie Kaufman","Xin Sun","Hao Tan"],"pdf_url":"https://arxiv.org/pdf/2406.09371v2.pdf","comment":"23 pages, 8 figures. Our code and interactive visualization are\n available at: https://desaixie.github.io/lrm-zero/. v2: NeurIPS 2024 Camera\n Ready version"},{"id":"http://arxiv.org/abs/2406.17763v2","updated":"2024-11-01T00:08:54Z","published":"2024-06-25T17:48:24Z","title":"DiffusionPDE: Generative PDE-Solving Under Partial Observation","summary":" We introduce a general framework for solving partial differential equations\n(PDEs) using generative diffusion models. In particular, we focus on the\nscenarios where we do not have the full knowledge of the scene necessary to\napply classical solvers. Most existing forward or inverse PDE approaches\nperform poorly when the observations on the data or the underlying coefficients\nare incomplete, which is a common assumption for real-world measurements. In\nthis work, we propose DiffusionPDE that can simultaneously fill in the missing\ninformation and solve a PDE by modeling the joint distribution of the solution\nand coefficient spaces. We show that the learned generative priors lead to a\nversatile framework for accurately solving a wide range of PDEs under partial\nobservation, significantly outperforming the state-of-the-art methods for both\nforward and inverse directions.\n","authors":["Jiahe Huang","Guandao Yang","Zichen Wang","Jeong Joon Park"],"pdf_url":"https://arxiv.org/pdf/2406.17763v2.pdf","comment":"NeurIPS 2024. Project page:\n https://jhhuangchloe.github.io/Diffusion-PDE/"},{"id":"http://arxiv.org/abs/2406.16218v2","updated":"2024-11-01T00:01:01Z","published":"2024-06-23T21:05:31Z","title":"Trace is the Next AutoDiff: Generative Optimization with Rich Feedback,\n Execution Traces, and LLMs","summary":" We study a class of optimization problems motivated by automating the design\nand update of AI systems like coding assistants, robots, and copilots. AutoDiff\nframeworks, like PyTorch, enable efficient end-to-end optimization of\ndifferentiable systems. However, general computational workflows can be\nnon-differentiable and involve rich feedback (e.g. console output or user's\nresponses), heterogeneous parameters (e.g. prompts, codes), and intricate\nobjectives (beyond maximizing a score). We investigate end-to-end generative\noptimization -- using generative models such as LLMs within the optimizer for\nautomatic updating of general computational workflows. We discover that\nworkflow execution traces are akin to back-propagated gradients in AutoDiff and\ncan provide key information to interpret feedback for efficient optimization.\nFormally, we frame a new mathematical setup, Optimization with Trace Oracle\n(OPTO). In OPTO, an optimizer receives an execution trace along with feedback\non the computed output and updates parameters iteratively. We provide a Python\nlibrary, Trace, that efficiently converts a workflow optimization problem into\nan OPTO instance using PyTorch-like syntax. 
Using Trace, we develop a general\nLLM-based generative optimizer called OptoPrime. In empirical studies, we find\nthat OptoPrime is capable of first-order numerical optimization, prompt\noptimization, hyper-parameter tuning, robot controller design, code debugging,\netc., and is often competitive with specialized optimizers for each domain. We\nenvision Trace as an open research platform for devising novel generative\noptimizers and developing the next generation of interactive learning agents.\nWebsite: https://microsoft.github.io/Trace/.\n","authors":["Ching-An Cheng","Allen Nie","Adith Swaminathan"],"pdf_url":"https://arxiv.org/pdf/2406.16218v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.20012v2","updated":"2024-11-01T08:40:28Z","published":"2024-09-30T07:14:31Z","title":"Towards Robust Multimodal Sentiment Analysis with Incomplete Data","summary":" The field of Multimodal Sentiment Analysis (MSA) has recently witnessed an\nemerging direction seeking to tackle the issue of data incompleteness.\nRecognizing that the language modality typically contains dense sentiment\ninformation, we consider it as the dominant modality and present an innovative\nLanguage-dominated Noise-resistant Learning Network (LNLN) to achieve robust\nMSA. The proposed LNLN features a dominant modality correction (DMC) module and\ndominant modality based multimodal learning (DMML) module, which enhances the\nmodel's robustness across various noise scenarios by ensuring the quality of\ndominant modality representations. Aside from the methodical design, we perform\ncomprehensive experiments under random data missing scenarios, utilizing\ndiverse and meaningful settings on several popular datasets (\\textit{e.g.,}\nMOSI, MOSEI, and SIMS), providing additional uniformity, transparency, and\nfairness compared to existing evaluations in the literature. Empirically, LNLN\nconsistently outperforms existing baselines, demonstrating superior performance\nacross these challenging and extensive evaluation metrics.\n","authors":["Haoyu Zhang","Wenbin Wang","Tianshu Yu"],"pdf_url":"https://arxiv.org/pdf/2409.20012v2.pdf","comment":"Accepted to NeurIPS 2024"}]},"2024-10-31T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.06362v2","updated":"2024-10-31T23:55:10Z","published":"2023-11-10T19:27:20Z","title":"Word Definitions from Large Language Models","summary":" Dictionary definitions are historically the arbitrator of what words mean,\nbut this primacy has come under threat by recent progress in NLP, including\nword embeddings and generative models like ChatGPT. We present an exploratory\nstudy of the degree of alignment between word definitions from classical\ndictionaries and these newer computational artifacts. Specifically, we compare\ndefinitions from three published dictionaries to those generated from variants\nof ChatGPT. 
We show that (i) definitions from different traditional\ndictionaries exhibit more surface form similarity than do model-generated\ndefinitions, (ii) the ChatGPT definitions are highly accurate, comparable\nto traditional dictionaries, and (iii) ChatGPT-based embedding definitions\nretain their accuracy even on low-frequency words, much better than GloVe and\nFastText word embeddings.\n","authors":["Bach Pham","JuiHsuan Wong","Samuel Kim","Yunting Yin","Steven Skiena"],"pdf_url":"https://arxiv.org/pdf/2311.06362v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01006v2","updated":"2024-10-31T23:44:31Z","published":"2024-06-03T05:36:57Z","title":"SemCoder: Training Code Language Models with Comprehensive Semantics\n Reasoning","summary":" Code Large Language Models (Code LLMs) have excelled at tasks like code\ncompletion but often miss deeper semantics such as execution effects and\ndynamic states. This paper aims to bridge the gap between Code LLMs' reliance\non static text data and the need for semantic understanding for complex tasks\nlike debugging and program repair. We introduce a novel strategy, monologue\nreasoning, to train Code LLMs to reason comprehensive semantics, encompassing\nhigh-level functional descriptions, local execution effects of individual\nstatements, and overall input/output behavior, thereby linking static code text\nwith dynamic execution states. We begin by collecting PyX, a clean Python\ncorpus of fully executable code samples with functional descriptions and test\ncases. We propose training Code LLMs not only to write code but also to\nunderstand code semantics by reasoning about key properties, constraints, and\nexecution behaviors using natural language, mimicking human verbal debugging,\ni.e., rubber-duck debugging. This approach led to the development of SemCoder,\na Code LLM with only 6.7B parameters, which shows competitive performance with\nGPT-3.5-turbo on code generation and execution reasoning tasks. SemCoder\nachieves 79.3% on HumanEval (GPT-3.5-turbo: 76.8%), 63.6% on CRUXEval-I\n(GPT-3.5-turbo: 50.3%), and 63.9% on CRUXEval-O (GPT-3.5-turbo: 59.0%). We also\nstudy the effectiveness of SemCoder's monologue-style execution reasoning\ncompared to concrete scratchpad reasoning, showing that our approach integrates\nsemantics from multiple dimensions more smoothly. Finally, we demonstrate the\npotential of applying learned semantics to improve Code LLMs' debugging and\nself-refining capabilities. Our data, code, and models are available at:\nhttps://github.com/ARiSE-Lab/SemCoder.\n","authors":["Yangruibo Ding","Jinjun Peng","Marcus J. Min","Gail Kaiser","Junfeng Yang","Baishakhi Ray"],"pdf_url":"https://arxiv.org/pdf/2406.01006v2.pdf","comment":"NeurIPS 2024 Camera-ready"},{"id":"http://arxiv.org/abs/2405.18406v3","updated":"2024-10-31T23:27:09Z","published":"2024-05-28T17:46:36Z","title":"RACCooN: A Versatile Instructional Video Editing Framework with\n Auto-Generated Narratives","summary":" Recent video generative models primarily rely on carefully written text\nprompts for specific tasks, like inpainting or style editing. They require\nlabor-intensive textual descriptions for input videos, hindering their\nflexibility to adapt personal/raw videos to user specifications. This paper\nproposes RACCooN, a versatile and user-friendly video-to-paragraph-to-video\ngenerative framework that supports multiple video editing capabilities such as\nremoval, addition, and modification, through a unified pipeline. 
RACCooN\nconsists of two principal stages: Video-to-Paragraph (V2P) and\nParagraph-to-Video (P2V). In the V2P stage, we automatically describe video\nscenes in well-structured natural language, capturing both the holistic context\nand focused object details. Subsequently, in the P2V stage, users can\noptionally refine these descriptions to guide the video diffusion model,\nenabling various modifications to the input video, such as removing, changing\nsubjects, and/or adding new objects. The proposed approach stands out from\nother methods through several significant contributions: (1) RACCooN suggests a\nmulti-granular spatiotemporal pooling strategy to generate well-structured\nvideo descriptions, capturing both the broad context and object details without\nrequiring complex human annotations, simplifying precise video content editing\nbased on text for users. (2) Our video generative model incorporates\nauto-generated narratives or instructions to enhance the quality and accuracy\nof the generated content. (3) RACCooN also plans to imagine new objects in a\ngiven video, so users simply prompt the model to receive a detailed video\nediting plan for complex video editing. The proposed framework demonstrates\nimpressive versatile capabilities in video-to-paragraph generation and video\ncontent editing, and can be incorporated into other SoTA video generative\nmodels for further enhancement.\n","authors":["Jaehong Yoon","Shoubin Yu","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2405.18406v3.pdf","comment":"The first two authors contribute equally. Project Page:\n https://raccoon-mllm-gen.github.io/"},{"id":"http://arxiv.org/abs/2406.07791v6","updated":"2024-10-31T23:10:55Z","published":"2024-06-12T01:12:28Z","title":"Judging the Judges: A Systematic Investigation of Position Bias in\n Pairwise Comparative Assessments by LLMs","summary":" LLM-as-a-Judge presents a promising alternative to human evaluators across\nvarious tasks, but inherent biases, especially position bias (a tendency to\nfavor solutions based on their position in the prompt) have compromised its\neffectiveness. Our study introduces a systematic framework to examine position\nbias in pairwise comparisons, focusing on repetition stability, position\nconsistency, and preference fairness. This research significantly contributes\nto the field by introducing new concepts for understanding position bias and\nproviding a multi-dimensional framework for evaluations. We conducted\nexperiments with 12 LLM judges across MTBench and DevBench, covering 22 tasks\nand approximately 40 solution-generating models (candidates), resulting in over\n100,000 evaluation instances. Our findings confirm that position bias in\ncapable LLM judges is not due to random chance, along with notable variations\nobserved across judges and tasks. Moreover, position bias is weakly influenced\nby the length of prompt components but significantly impacted by the quality\ngap between solutions. 
These insights can help optimize judge model selections,\nimprove benchmark design, and inform future research on debiasing strategies,\nultimately enhancing the reliability of LLM judges.\n","authors":["Lin Shi","Chiyu Ma","Wenhua Liang","Weicheng Ma","Soroush Vosoughi"],"pdf_url":"https://arxiv.org/pdf/2406.07791v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12036v2","updated":"2024-10-31T23:08:03Z","published":"2024-08-21T23:42:06Z","title":"Reasoning and Tools for Human-Level Forecasting","summary":" Language models (LMs) trained on web-scale datasets are largely successful\ndue to their ability to memorize large amounts of training data, even if only\npresent in a few examples. These capabilities are often desirable in evaluation\non tasks such as question answering but raise questions about whether these\nmodels can exhibit genuine reasoning or succeed only at mimicking patterns from\nthe training data. This distinction is particularly salient in forecasting\ntasks, where the answer is not present in the training data, and the model must\nreason to make logical deductions. We present Reasoning and Tools for\nForecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can\ndynamically retrieve updated information and run numerical simulation with\nequipped tools. We evaluate our model with questions from competitive\nforecasting platforms and demonstrate that our method is competitive with and\ncan outperform human predictions. This suggests that LMs, with the right tools,\ncan indeed think and adapt like humans, offering valuable insights for\nreal-world decision-making.\n","authors":["Elvis Hsieh","Preston Fu","Jonathan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13770v2","updated":"2024-10-31T21:21:26Z","published":"2024-06-19T18:38:11Z","title":"Elliptical Attention","summary":" Pairwise dot-product self-attention is key to the success of transformers\nthat achieve state-of-the-art performance across a variety of applications in\nlanguage and vision. This dot-product self-attention computes attention weights\namong the input tokens using Euclidean distance, which makes the model prone to\nrepresentation collapse and vulnerable to contaminated samples. In this paper,\nwe propose using a Mahalanobis distance metric for computing the attention\nweights to stretch the underlying feature space in directions of high\ncontextual relevance. In particular, we define a hyper-ellipsoidal neighborhood\naround each query to increase the attention weights of the tokens lying in the\ncontextually important directions. We term this novel class of attention\nElliptical Attention. Our Elliptical Attention provides two benefits: 1)\nreducing representation collapse and 2) enhancing the model's robustness as\nElliptical Attention pays more attention to contextually relevant information\nrather than focusing on some small subset of informative features. We\nempirically demonstrate the advantages of Elliptical Attention over the\nbaseline dot-product attention and state-of-the-art attention methods on\nvarious practical tasks, including object classification, image segmentation,\nand language modeling across different data modalities.\n","authors":["Stefan K. Nielsen","Laziz U. Abdullaev","Rachel S. Y. Teo","Tan M. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.13770v2.pdf","comment":"10 pages in the main text. Published at NeurIPS 2024. 
The code is\n available at https://github.com/stefvk/Elliptical-Attention"},{"id":"http://arxiv.org/abs/2406.09277v2","updated":"2024-10-31T20:45:16Z","published":"2024-06-13T16:15:53Z","title":"End-to-end streaming model for low-latency speech anonymization","summary":" Speaker anonymization aims to conceal cues to speaker identity while\npreserving linguistic content. Current machine learning based approaches\nrequire substantial computational resources, hindering real-time streaming\napplications. To address these concerns, we propose a streaming model that\nachieves speaker anonymization with low latency. The system is trained in an\nend-to-end autoencoder fashion using a lightweight content encoder that\nextracts HuBERT-like information, a pretrained speaker encoder that extracts\nspeaker identity, and a variance encoder that injects pitch and energy\ninformation. These three disentangled representations are fed to a decoder that\nre-synthesizes the speech signal. We present evaluation results from two\nimplementations of our system, a full model that achieves a latency of 230ms,\nand a lite version (0.1x in size) that further reduces latency to 66ms while\nmaintaining state-of-the-art performance in naturalness, intelligibility, and\nprivacy preservation.\n","authors":["Waris Quamer","Ricardo Gutierrez-Osuna"],"pdf_url":"https://arxiv.org/pdf/2406.09277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17537v2","updated":"2024-10-31T20:07:53Z","published":"2024-05-27T17:57:48Z","title":"CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale","summary":" Measuring biodiversity is crucial for understanding ecosystem health. While\nprior works have developed machine learning models for taxonomic classification\nof photographic images and DNA separately, in this work, we introduce a\nmultimodal approach combining both, using CLIP-style contrastive learning to\nalign images, barcode DNA, and text-based representations of taxonomic labels\nin a unified embedding space. This allows for accurate classification of both\nknown and unknown insect species without task-specific fine-tuning, leveraging\ncontrastive learning for the first time to fuse DNA and image data. Our method\nsurpasses previous single-modality approaches in accuracy by over 8% on\nzero-shot learning tasks, showcasing its effectiveness in biodiversity studies.\n","authors":["ZeMing Gong","Austin T. Wang","Xiaoliang Huo","Joakim Bruslund Haurum","Scott C. Lowe","Graham W. Taylor","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2405.17537v2.pdf","comment":"25 pages with 11 figures"},{"id":"http://arxiv.org/abs/2402.17946v4","updated":"2024-10-31T19:38:15Z","published":"2024-02-28T00:09:07Z","title":"SparseLLM: Towards Global Pruning for Pre-trained Language Models","summary":" The transformative impact of large language models (LLMs) like LLaMA and GPT\non natural language processing is countered by their prohibitive computational\ndemands. Pruning has emerged as a pivotal compression strategy, introducing\nsparsity to enhance both memory and computational efficiency. Yet, traditional\nglobal pruning is impractical for LLMs due to scalability issues, while local\npruning, despite its efficiency, leads to suboptimal solutions. Addressing\nthese challenges, we propose SparseLLM, a novel framework that redefines the\nglobal pruning process into manageable, coordinated subproblems, allowing for\nresource-efficient optimization with global optimality. 
SparseLLM's approach,\nwhich conceptualizes LLMs as a chain of modular functions and leverages\nauxiliary variables for problem decomposition, not only facilitates a pragmatic\napplication on LLMs but also demonstrates significant performance improvements,\nparticularly in high-sparsity regimes where it surpasses current\nstate-of-the-art methods.\n","authors":["Guangji Bai","Yijiang Li","Chen Ling","Kibaek Kim","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.17946v4.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.06484v3","updated":"2024-10-31T19:35:47Z","published":"2024-06-10T17:24:42Z","title":"Parallelizing Linear Transformers with the Delta Rule over Sequence\n Length","summary":" Transformers with linear attention (i.e., linear transformers) and\nstate-space models have recently been suggested as a viable linear-time\nalternative to transformers with softmax attention. However, these models still\nunderperform transformers especially on tasks that require in-context\nretrieval. While more expressive variants of linear transformers which replace\nthe additive update in linear transformers with the delta rule (DeltaNet) have\nbeen found to be more effective at associative recall, existing algorithms for\ntraining such models do not parallelize over sequence length and are thus\ninefficient to train on modern hardware. This work describes a\nhardware-efficient algorithm for training linear transformers with the delta\nrule, which exploits a memory-efficient representation for computing products\nof Householder matrices. This algorithm allows us to scale up DeltaNet to\nstandard language modeling settings. We train a 1.3B model for 100B tokens and\nfind that it outperforms recent linear-time baselines such as Mamba and GLA in\nterms of perplexity and zero-shot performance on downstream tasks. We also\nexperiment with two hybrid models which combine DeltaNet layers with (1)\nsliding-window attention layers every other layer or (2) two global attention\nlayers, and find that these hybrids outperform strong transformer baselines.\n","authors":["Songlin Yang","Bailin Wang","Yu Zhang","Yikang Shen","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2406.06484v3.pdf","comment":"NeurIPS 2024 camera ready"},{"id":"http://arxiv.org/abs/2408.06423v3","updated":"2024-10-31T18:43:01Z","published":"2024-08-12T18:01:50Z","title":"Evaluating LLMs on Entity Disambiguation in Tables","summary":" Tables are crucial containers of information, but understanding their meaning\nmay be challenging. Over the years, there has been a surge in interest in\ndata-driven approaches based on deep learning that have increasingly been\ncombined with heuristic-based ones. More recently, the advent of\nLarge Language Models (LLMs) has led to a new category of approaches for table annotation.\nHowever, these approaches have not been consistently evaluated on a common\nground, making evaluation and comparison difficult. This work proposes an\nextensive evaluation of four STI SOTA approaches: Alligator (formerly s-elbat),\nDagobah, TURL, and TableLlama; the first two belong to the family of\nheuristic-based algorithms, while the others are, respectively, encoder-only and\ndecoder-only LLMs. We also include in the evaluation\nboth GPT-4o and GPT-4o-mini, since they excel in various public benchmarks. 
The\nprimary objective is to measure the ability of these approaches to solve the\nentity disambiguation task with respect to both the performance achieved on a\ncommon-ground evaluation setting and the computational and cost requirements\ninvolved, with the ultimate aim of charting new research paths in the field.\n","authors":["Federico Belotti","Fabio Dadda","Marco Cremaschi","Roberto Avogadro","Matteo Palmonari"],"pdf_url":"https://arxiv.org/pdf/2408.06423v3.pdf","comment":"13 pages, 6 figures; fixed avg. accuracy-over-price plot for GPT\n families, fixed typos in table referencing, added evaluation and inference\n subsubsection"},{"id":"http://arxiv.org/abs/2410.13959v2","updated":"2024-10-31T18:38:37Z","published":"2024-10-17T18:34:43Z","title":"FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven\n Question Answering Pipeline","summary":" Financial decision-making hinges on the analysis of relevant information\nembedded in the enormous volume of documents in the financial domain. To\naddress this challenge, we developed FinQAPT, an end-to-end pipeline that\nstreamlines the identification of relevant financial reports based on a query,\nextracts pertinent context, and leverages Large Language Models (LLMs) to\nperform downstream tasks. To evaluate the pipeline, we experimented with\nvarious techniques to optimize the performance of each module using the FinQA\ndataset. We introduced a novel clustering-based negative sampling technique to\nenhance context extraction and a novel prompting method called Dynamic N-shot\nPrompting to boost the numerical question-answering capabilities of LLMs. At\nthe module level, we achieved state-of-the-art accuracy on FinQA, attaining an\naccuracy of 80.6%. However, at the pipeline level, we observed decreased\nperformance due to challenges in extracting relevant context from financial\nreports. We conducted a detailed error analysis of each module and the\nend-to-end pipeline, pinpointing specific challenges that must be addressed to\ndevelop a robust solution for handling complex financial tasks.\n","authors":["Kuldeep Singh","Simerjot Kaur","Charese Smiley"],"pdf_url":"https://arxiv.org/pdf/2410.13959v2.pdf","comment":"Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2410.20290v2","updated":"2024-10-31T18:27:13Z","published":"2024-10-26T23:20:48Z","title":"Fast Best-of-N Decoding via Speculative Rejection","summary":" The safe and effective deployment of Large Language Models (LLMs) involves a\ncritical step called alignment, which ensures that the model's responses are in\naccordance with human preferences. Prevalent alignment techniques, such as DPO,\nPPO and their variants, align LLMs by changing the pre-trained model weights\nduring a phase called post-training. While predominant, these post-training\nmethods add substantial complexity before LLMs can be deployed. Inference-time\nalignment methods avoid the complex post-training step and instead bias the\ngeneration towards responses that are aligned with human preferences. The\nbest-known inference-time alignment method, called Best-of-N, is as effective\nas the state-of-the-art post-training procedures. Unfortunately, Best-of-N\nrequires vastly more resources at inference time than standard decoding\nstrategies, which makes it computationally not viable. In this work, we\nintroduce Speculative Rejection, a computationally-viable inference-time\nalignment algorithm. 
It generates high-scoring responses according to a given\nreward model, like Best-of-N does, while being between 16 and 32 times more\ncomputationally efficient.\n","authors":["Hanshi Sun","Momin Haider","Ruiqi Zhang","Huitao Yang","Jiahao Qiu","Ming Yin","Mengdi Wang","Peter Bartlett","Andrea Zanette"],"pdf_url":"https://arxiv.org/pdf/2410.20290v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.05817v2","updated":"2024-10-31T18:20:32Z","published":"2024-10-08T08:47:11Z","title":"Probing Language Models on Their Knowledge Source","summary":" Large Language Models (LLMs) often encounter conflicts between their learned,\ninternal (parametric knowledge, PK) and external knowledge provided during\ninference (contextual knowledge, CK). Understanding how LLMs prioritize\none knowledge source over the other remains a challenge. In this paper, we\npropose a novel probing framework to explore the mechanisms governing the\nselection between PK and CK in LLMs. Using controlled prompts designed to\ncontradict the model's PK, we demonstrate that specific model activations are\nindicative of the knowledge source employed. We evaluate this framework on\nvarious LLMs of different sizes and demonstrate that mid-layer activations,\nparticularly those related to relations in the input, are crucial in predicting\nknowledge source selection, paving the way for more reliable models capable of\nhandling knowledge conflicts effectively.\n","authors":["Zineddine Tighidet","Andrea Mogini","Jiali Mei","Benjamin Piwowarski","Patrick Gallinari"],"pdf_url":"https://arxiv.org/pdf/2410.05817v2.pdf","comment":"Accepted at BlackBoxNLP@EMNLP2024"},{"id":"http://arxiv.org/abs/2407.06192v2","updated":"2024-10-31T18:16:38Z","published":"2024-07-08T17:59:57Z","title":"Multi-Object Hallucination in Vision-Language Models","summary":" Large vision language models (LVLMs) often suffer from object hallucination,\nproducing objects not present in the given images. While current benchmarks for\nobject hallucination primarily concentrate on the presence of a single object\nclass rather than individual entities, this work systematically investigates\nmulti-object hallucination, examining how models misperceive (e.g., invent\nnonexistent objects or become distracted) when tasked with focusing on multiple\nobjects simultaneously. We introduce Recognition-based Object Probing\nEvaluation (ROPE), an automated evaluation protocol that considers the\ndistribution of object classes within a single image during testing and uses\nvisual referring prompts to eliminate ambiguity. With comprehensive empirical\nstudies and analysis of potential factors leading to multi-object\nhallucination, we found that (1) LVLMs suffer more hallucinations when\nfocusing on multiple objects compared to a single object; (2) the tested\nobject class distribution affects hallucination behaviors, indicating that\nLVLMs may follow shortcuts and spurious correlations; and (3) hallucinatory\nbehaviors are influenced by data-specific factors, salience and frequency, and\nmodel intrinsic behaviors. We hope to enable LVLMs to recognize and reason\nabout multiple objects that often occur in realistic visual scenes, provide\ninsights, and quantify our progress towards mitigating the issues.\n","authors":["Xuweiyi Chen","Ziqiao Ma","Xuejun Zhang","Sihan Xu","Shengyi Qian","Jianing Yang","David F. 
Fouhey","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2407.06192v2.pdf","comment":"Accepted to NeurIPS 2024 | Project page:\n https://multi-object-hallucination.github.io/"},{"id":"http://arxiv.org/abs/2410.23277v2","updated":"2024-10-31T18:03:51Z","published":"2024-10-30T17:55:52Z","title":"SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video\n Generation","summary":" Human beings are endowed with a complementary learning system, which bridges\nthe slow learning of general world dynamics with fast storage of episodic\nmemory from a new experience. Previous video generation models, however,\nprimarily focus on slow learning by pre-training on vast amounts of data,\noverlooking the fast learning phase crucial for episodic memory storage. This\noversight leads to inconsistencies across temporally distant frames when\ngenerating longer videos, as these frames fall beyond the model's context\nwindow. To this end, we introduce SlowFast-VGen, a novel dual-speed learning\nsystem for action-driven long video generation. Our approach incorporates a\nmasked conditional video diffusion model for the slow learning of world\ndynamics, alongside an inference-time fast learning strategy based on a\ntemporal LoRA module. Specifically, the fast learning process updates its\ntemporal LoRA parameters based on local inputs and outputs, thereby efficiently\nstoring episodic memory in its parameters. We further propose a slow-fast\nlearning loop algorithm that seamlessly integrates the inner fast learning loop\ninto the outer slow learning loop, enabling the recall of prior multi-episode\nexperiences for context-aware skill learning. To facilitate the slow learning\nof an approximate world model, we collect a large-scale dataset of 200k videos\nwith language action annotations, covering a wide range of scenarios. Extensive\nexperiments show that SlowFast-VGen outperforms baselines across various\nmetrics for action-driven video generation, achieving an FVD score of 514\ncompared to 782, and maintaining consistency in longer videos, with an average\nof 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm\nsignificantly enhances performances on long-horizon planning tasks as well.\nProject Website: https://slowfast-vgen.github.io\n","authors":["Yining Hong","Beide Liu","Maxine Wu","Yuanhao Zhai","Kai-Wei Chang","Linjie Li","Kevin Lin","Chung-Ching Lin","Jianfeng Wang","Zhengyuan Yang","Yingnian Wu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24218v1","updated":"2024-10-31T17:59:52Z","published":"2024-10-31T17:59:52Z","title":"Teaching Embodied Reinforcement Learning Agents: Informativeness and\n Diversity of Language Use","summary":" In real-world scenarios, it is desirable for embodied agents to have the\nability to leverage human language to gain explicit or implicit knowledge for\nlearning tasks. Despite recent progress, most previous approaches adopt simple\nlow-level instructions as language inputs, which may not reflect natural human\ncommunication. It's not clear how to incorporate rich language use to\nfacilitate task learning. To address this question, this paper studies\ndifferent types of language inputs in facilitating reinforcement learning (RL)\nembodied agents. More specifically, we examine how different levels of language\ninformativeness (i.e., feedback on past behaviors and future guidance) and\ndiversity (i.e., variation of language expressions) impact agent learning and\ninference. 
Our empirical results based on four RL benchmarks demonstrate that\nagents trained with diverse and informative language feedback can achieve\nenhanced generalization and fast adaptation to new tasks. These findings\nhighlight the pivotal role of language use in teaching embodied agents new\ntasks in an open world. Project website:\nhttps://github.com/sled-group/Teachable_RL\n","authors":["Jiajun Xi","Yinong He","Jianing Yang","Yinpei Dai","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2410.24218v1.pdf","comment":"EMNLP 2024 Main. Project website:\n https://github.com/sled-group/Teachable_RL"},{"id":"http://arxiv.org/abs/2410.24201v1","updated":"2024-10-31T17:55:45Z","published":"2024-10-31T17:55:45Z","title":"P-Masking: Power Law Masking Improves Multi-attribute Controlled\n Generation","summary":" We introduce LingGen, a novel approach for controlled text generation that\noffers precise control over a wide array of linguistic attributes, even as the\nnumber of attributes varies. LingGen employs a dynamic P-MASKING strategy,\nwhich samples masking rates from a power law distribution during training. This\ninnovative approach enables the model to develop robust representations and\nadapt its attribute control capabilities across a variable number of\nattributes, from a single attribute to multiple complex configurations. The\nP-MASKING technique enhances LingGen's ability to manage different levels of\nattribute visibility, resulting in superior performance in multi-attribute\ngeneration tasks. Our experiments demonstrate that LingGen surpasses current\nstate-of-the-art models in both attribute control accuracy and text fluency,\nparticularly excelling in scenarios with varying attribute demands.\nAdditionally, our ablation studies highlight the effectiveness of P-MASKING and\nthe influence of different base language models on performance. These findings\ndemonstrate LingGen's potential for applications requiring precise and\nadaptable control over multiple linguistic attributes in text generation.\n","authors":["Mohamed Elgaar","Hadi Amiri"],"pdf_url":"https://arxiv.org/pdf/2410.24201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24200v1","updated":"2024-10-31T17:55:36Z","published":"2024-10-31T17:55:36Z","title":"Length-Induced Embedding Collapse in Transformer-based Models","summary":" Text embeddings enable various applications, but their performance\ndeteriorates on longer texts. In this paper, we find that the performance\ndegradation is due to a phenomenon called Length Collapse, where longer text\nembeddings collapse into a narrow space. This collapse results in a\ndistributional inconsistency between embeddings of different text lengths,\nultimately hurting the performance of downstream tasks. Theoretically, by\nconsidering that the self-attention mechanism inherently functions as a low-pass\nfilter, we prove that long sequences increase the attenuation rate of the\nlow-pass filter effect of the self-attention mechanism. With layers going\ndeeper, excessive low-pass filtering causes the token signals to retain only\ntheir Direct-Current (DC) component, which means the input token feature maps\nwill collapse into a narrow space, especially in long texts. Based on the above\nanalysis, we propose to mitigate the undesirable length collapse limitation by\nintroducing a temperature in softmax(), which achieves a higher low-pass filter\nattenuation rate. The tuning-free method, called TempScale, can be plugged into\nmultiple transformer-based embedding models. 
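For the Length Collapse / TempScale entry above (arXiv 2410.24200), a toy sketch of the single knob the method turns: a temperature inside the attention softmax. The sketch only illustrates how softmax sharpness controls the averaging (low-pass) behavior that drives collapse; the direction and placement TempScale actually uses follow the paper's analysis, not this toy.

```python
# Toy illustration: a temperature in the attention softmax controls how much
# self-attention averages tokens together. Flatter attention mixes tokens
# more (stronger low-pass filtering, more collapse); sharper attention mixes
# less. Shapes are toy-sized and there are no learned weights.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def self_attention(X, temperature=1.0):
    d = X.shape[-1]
    logits = (X @ X.T) / np.sqrt(d)
    attn = softmax(logits / temperature, axis=-1)  # the single scalar knob
    return attn @ X

X = np.random.default_rng(0).normal(size=(128, 32))  # a "long" sequence
for tau in (0.5, 1.0, 2.0):
    out = self_attention(X, temperature=tau)
    # Variance across token outputs: more averaging pulls every token toward
    # a common vector, i.e. toward the collapse the paper describes.
    print(tau, float(out.var(axis=0).mean()))
```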
Empirically, we demonstrate that\nTempScale can improve existing embedding models, especially on long text\ninputs, bringing up to 0.53% performance gains on 40 datasets from Massive Text\nEmbedding Benchmark (MTEB) and 0.82% performance gains on 4 datasets from\nLongEmbed, which specifically focuses on long context retrieval.\n","authors":["Yuqi Zhou","Sunhao Dai","Zhanshuo Cao","Xiao Zhang","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24199v1","updated":"2024-10-31T17:55:27Z","published":"2024-10-31T17:55:27Z","title":"Multi-Attribute Linguistic Tuning for Controlled Paraphrase Generation","summary":" We present a novel approach to paraphrase generation that enables precise\ncontrol and fine-tuning of 40 linguistic attributes for English. Our model is\nan encoder-decoder architecture that takes as input a source sentence and\ndesired linguistic attributes, and produces paraphrases of the source that\nsatisfy the desired attributes. To guarantee high-quality outputs at inference\ntime, our method is equipped with a quality control mechanism that gradually\nadjusts the embedding of linguistic attributes to find the nearest and most\nattainable configuration of desired attributes for paraphrase generation. We\nevaluate the effectiveness of our method by comparing it to recent controllable\ngeneration models. Experimental results demonstrate that the proposed model\noutperforms baselines in generating paraphrases that satisfy desired linguistic\nattributes.\n","authors":["Mohamed Elgaar","Hadi Amiri"],"pdf_url":"https://arxiv.org/pdf/2410.24199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24190v1","updated":"2024-10-31T17:51:00Z","published":"2024-10-31T17:51:00Z","title":"Hidden Persuaders: LLMs' Political Leaning and Their Influence on Voters","summary":" How could LLMs influence our democracy? We investigate LLMs' political\nleanings and the potential influence of LLMs on voters by conducting multiple\nexperiments in a U.S. presidential election context. Through a voting\nsimulation, we first demonstrate 18 open- and closed-weight LLMs' political\npreference for a Democratic nominee over a Republican nominee. We show how this\nleaning towards the Democratic nominee becomes more pronounced in\ninstruction-tuned models compared to their base versions by analyzing their\nresponses to candidate-policy related questions. We further explore the\npotential impact of LLMs on voter choice by conducting an experiment with 935\nU.S. registered voters. During the experiments, participants interacted with\nLLMs (Claude-3, Llama-3, and GPT-4) over five exchanges. The experiment results\nshow a shift in voter choices towards the Democratic nominee following LLM\ninteraction, widening the voting margin from 0.7% to 4.6%, even though LLMs\nwere not asked to persuade users to support the Democratic nominee during the\ndiscourse. This effect is larger than many previous studies on the\npersuasiveness of political campaigns, which have shown minimal effects in\npresidential elections. Many users also expressed a desire for further\npolitical interaction with LLMs. Which aspects of LLM interactions drove these\nshifts in voter choice requires further study. 
Lastly, we explore how a safety\nmethod can make LLMs more politically neutral, while leaving some open\nquestions.\n","authors":["Yujin Potter","Shiyang Lai","Junsol Kim","James Evans","Dawn Song"],"pdf_url":"https://arxiv.org/pdf/2410.24190v1.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2410.24177v1","updated":"2024-10-31T17:43:13Z","published":"2024-10-31T17:43:13Z","title":"DC-Spin: A Speaker-invariant Speech Tokenizer for Spoken Language Models","summary":" Spoken language models (SLMs) have gained increasing attention with\nadvancements in text-based, decoder-only language models. SLMs process text and\nspeech, enabling simultaneous speech understanding and generation. This paper\npresents Double-Codebook Speaker-invariant Clustering (DC-Spin), which aims to\nimprove speech tokenization by bridging audio signals and SLM tokens. DC-Spin\nextracts speaker-invariant tokens rich in phonetic information and resilient to\ninput variations, enhancing zero-shot SLM tasks and speech resynthesis. We\npropose a chunk-wise approach to enable streamable DC-Spin without retraining\nand degradation. Comparisons of tokenization methods (self-supervised and\nneural audio codecs), model scalability, and downstream task proxies show that\ntokens easily modeled by an n-gram LM or aligned with phonemes offer strong\nperformance, providing insights for designing speech tokenizers for SLMs.\n","authors":["Heng-Jui Chang","Hongyu Gong","Changhan Wang","James Glass","Yu-An Chung"],"pdf_url":"https://arxiv.org/pdf/2410.24177v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.24175v1","updated":"2024-10-31T17:42:26Z","published":"2024-10-31T17:42:26Z","title":"Constraint Back-translation Improves Complex Instruction Following of\n Large Language Models","summary":" Large language models (LLMs) struggle to follow instructions with complex\nconstraints in format, length, etc. Following the conventional\ninstruction-tuning practice, previous works conduct post-training on complex\ninstruction-response pairs generated by feeding complex instructions to\nadvanced LLMs. However, even advanced LLMs cannot follow complex instructions\nwell, thus limiting the quality of generated data. In this work, we find that\nexisting datasets inherently contain implicit complex constraints and propose a\nnovel data generation technique, constraint back-translation. Specifically, we\ntake the high-quality instruction-response pairs in existing datasets and only\nadopt advanced LLMs to add complex constraints already met by the responses to\nthe instructions, which naturally reduces costs and data noise. In the\nexperiments, we adopt Llama3-70B-Instruct to back-translate constraints and\ncreate a high-quality complex instruction-response dataset, named CRAB. We\npresent that post-training on CRAB improves multiple backbone LLMs' complex\ninstruction-following ability, evaluated on extensive instruction-following\nbenchmarks. We further find that constraint back-translation also serves as a\nuseful auxiliary training objective in post-training. 
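For the constraint back-translation entry above (arXiv 2410.24175), a hedged sketch of the data-generation step it describes: an LLM is asked only to name constraints that an existing response already satisfies, and those constraints are appended to the instruction. The prompt wording and the `llm` callable are hypothetical stand-ins.

```python
# Hedged sketch of constraint back-translation: derive constraints from an
# existing high-quality pair instead of generating a new response.
# `llm` is a hypothetical text-in/text-out callable, not a specific API.
from typing import Callable, Dict

BACKTRANSLATE_PROMPT = """List the format, length, and style constraints that the
RESPONSE below already satisfies, as short imperative clauses.

INSTRUCTION: {instruction}
RESPONSE: {response}

Constraints:"""

def back_translate(pair: Dict[str, str], llm: Callable[[str], str]) -> Dict[str, str]:
    constraints = llm(BACKTRANSLATE_PROMPT.format(**pair)).strip()
    return {
        # The response is untouched, so the new pair is low-noise by
        # construction: every added constraint is already met by it.
        "instruction": f"{pair['instruction']}\n\nConstraints:\n{constraints}",
        "response": pair["response"],
    }
```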
Our code, data, and\nmodels will be released to facilitate future research.\n","authors":["Yunjia Qi","Hao Peng","Xiaozhi Wang","Bin Xu","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2410.24175v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.24174v1","updated":"2024-10-31T17:41:14Z","published":"2024-10-31T17:41:14Z","title":"Novel Architecture for Distributed Travel Data Integration and Service\n Provision Using Microservices","summary":" This paper introduces a microservices architecture for the purpose of\nenhancing the flexibility and performance of an airline reservation system. The\narchitectural design incorporates Redis cache technologies, two different\nmessaging systems (Kafka and RabbitMQ), two types of storage (MongoDB and\nPostgreSQL). It also introduces authorization techniques, including secure\ncommunication through OAuth2 and JWT, which are essential for managing\nhigh-demand travel services. According to selected indicators, the architecture\nprovides an impressive level of data consistency at 99.5% and a latency of data\npropagation of less than 75 ms, allowing rapid and reliable intercommunication\nbetween microservices. A system throughput of 1050 events per second was\nachieved so that the acceptability level was maintained even during peak time.\nRedis caching achieved a 92% cache hit ratio, thereby lowering\nthe burden on the database and increasing the speed of response. Further\nimprovement of the system's scalability was achieved through the use of Docker and\nKubernetes, which enabled services to be expanded horizontally to cope with the\nchanges in demand. The error rates were very low, at 0.2%, further enhancing the\nefficiency of the system in handling real-time data integration. This approach\nis suggested to meet the specific needs of the airline reservation system. It\nis secure, fast, and scalable, all serving to improve the user experience as well\nas the efficiency of operations. The low latency, high data integration\nlevels, and efficient usage of resources demonstrate the\narchitecture's ability to offer continued support in ever-growing high-demand\nsituations.\n","authors":["Biman Barua","M. Shamim Kaiser"],"pdf_url":"https://arxiv.org/pdf/2410.24174v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17005v2","updated":"2024-10-31T17:21:13Z","published":"2024-09-25T15:08:08Z","title":"Models Can and Should Embrace the Communicative Nature of\n Human-Generated Math","summary":" Math is constructed by people for people: just as natural language corpora\nreflect not just propositions but the communicative goals of language users,\nthe math data that models are trained on reflects not just idealized\nmathematical entities but rich communicative intentions. While there are\nimportant advantages to treating math in a purely symbolic manner, we here\nhypothesize that there are benefits to treating math as situated linguistic\ncommunication and that language models are well suited for this goal, in ways\nthat are not fully appreciated. We illustrate these points with two case\nstudies. First, we ran an experiment in which we found that language models\ninterpret the equals sign in a humanlike way -- generating systematically\ndifferent word problems for the same underlying equation arranged in different\nways. Second, we found that language models prefer proofs to be ordered in\nnaturalistic ways, even though other orders would be logically equivalent. 
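For the microservices entry above (arXiv 2410.24174), a standard cache-aside read path with redis-py, the kind of pattern behind the high cache-hit ratios it reports. The connection details and `fetch_reservation_from_db` are illustrative assumptions, not the paper's implementation.

```python
# Cache-aside pattern: serve reads from Redis when possible, fall back to
# the database on a miss, and repopulate the cache with an expiry.
import json
import redis

r = redis.Redis(host="localhost", port=6379)   # illustrative connection

def fetch_reservation_from_db(reservation_id: str) -> dict:
    return {"id": reservation_id, "status": "CONFIRMED"}   # stand-in query

def get_reservation(reservation_id: str, ttl_s: int = 300) -> dict:
    key = f"reservation:{reservation_id}"
    cached = r.get(key)
    if cached is not None:                      # cache hit: no database work
        return json.loads(cached)
    record = fetch_reservation_from_db(reservation_id)
    r.setex(key, ttl_s, json.dumps(record))     # repopulate with a TTL
    return record
```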
We\nadvocate for AI systems that learn from and represent the communicative\nintentions latent in human-generated math.\n","authors":["Sasha Boguraev","Ben Lipkin","Leonie Weissweiler","Kyle Mahowald"],"pdf_url":"https://arxiv.org/pdf/2409.17005v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24160v1","updated":"2024-10-31T17:19:03Z","published":"2024-10-31T17:19:03Z","title":"Redefining in Dictionary: Towards a Enhanced Semantic\n Understanding of Creative Generation","summary":" Creativity, both in human and diffusion models, remains an inherently\nabstract concept; thus, simply adding \"creative\" to a prompt does not yield\nreliable semantic recognition by the model. In this work, we concretize the\nabstract notion of \"creative\" through the TP2O task, which aims to merge two\nunrelated concepts, and introduce CreTok, redefining \"creative\" as the token\n$\\texttt{}$. This redefinition offers a more concrete and universally\nadaptable representation for concept blending. This redefinition occurs\ncontinuously, involving the repeated random sampling of text pairs with\ndifferent concepts and optimizing cosine similarity between target and constant\nprompts. This approach enables $\\texttt{}$ to learn a method for\ncreative concept fusion. Extensive experiments demonstrate that the creative\ncapability enabled by $\\texttt{}$ substantially surpasses recent SOTA\ndiffusion models and achieves superior creative generation. CreTok exhibits\ngreater flexibility and reduced time overhead, as $\\texttt{}$ can\nfunction as a universal token for any concept, facilitating creative generation\nwithout retraining.\n","authors":["Fu Feng","Yucheng Xie","Jing Wang","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2410.24160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24159v1","updated":"2024-10-31T17:18:11Z","published":"2024-10-31T17:18:11Z","title":"GPT or BERT: why not both?","summary":" We present a simple way to merge masked language modeling with causal\nlanguage modeling. This hybrid training objective results in a model that\ncombines the strengths of both modeling paradigms within a single transformer\nstack: GPT-BERT can be transparently used like any standard causal or masked\nlanguage model. We test the pretraining process that enables this flexible\nbehavior on the BabyLM Challenge 2024. The results show that the hybrid\npretraining outperforms masked-only or causal-only models. We openly release\nthe models, training corpora and code.\n","authors":["Lucas Georges Gabriel Charpentier","David Samuel"],"pdf_url":"https://arxiv.org/pdf/2410.24159v1.pdf","comment":"22 pages; submission to the BabyLM Challenge 2024"},{"id":"http://arxiv.org/abs/2410.24155v1","updated":"2024-10-31T17:12:14Z","published":"2024-10-31T17:12:14Z","title":"Thought Space Explorer: Navigating and Expanding Thought Space for Large\n Language Model Reasoning","summary":" Recent advances in large language models (LLMs) have demonstrated their\npotential in handling complex reasoning tasks, which are usually achieved by\nconstructing a thought chain to guide the model to solve the problem with\nmulti-step thinking. However, existing methods often remain confined to\npreviously explored solution spaces and thus overlook the critical blind spot\nwithin LLMs' cognitive range. To address these issues, we design the Thought\nSpace Explorer (TSE), a novel framework to expand and optimize thought\nstructures to guide LLMs to explore their blind spots of thinking. 
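For the CreTok entry above (arXiv 2410.24160), a loose, hedged rendering of its stated objective: a single learnable token embedding is optimized so that a constant prompt containing it matches repeatedly resampled target prompts under cosine similarity. The frozen encoder here is a random stand-in rather than a diffusion model's text encoder, and the prompt construction is an assumption.

```python
# Hedged sketch: optimize one learnable token by cosine similarity between a
# fixed "constant" prompt embedding (plus the token) and embeddings of
# resampled concept-pair prompts. Everything except the cosine objective and
# the resampling loop is an illustrative stand-in.
import torch

torch.manual_seed(0)
dim, vocab = 64, 1000
encoder = torch.nn.EmbeddingBag(vocab, dim).requires_grad_(False)  # frozen stand-in

cre_tok = torch.nn.Parameter(torch.randn(dim) * 0.02)   # the learnable token
opt = torch.optim.Adam([cre_tok], lr=1e-2)
const_ids = torch.randint(0, vocab, (1, 3))             # fixed constant prompt

for step in range(100):
    pair_ids = torch.randint(0, vocab, (1, 2))          # resampled concept pair
    target = encoder(pair_ids).squeeze(0)               # target prompt embedding
    constant = encoder(const_ids).squeeze(0) + cre_tok  # prompt + learned token
    loss = 1 - torch.nn.functional.cosine_similarity(constant, target, dim=0)
    opt.zero_grad(); loss.backward(); opt.step()
```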
By\ngenerating new reasoning steps and branches based on the original thought\nstructure with various designed strategies, TSE broadens the thought space and\nalleviates the impact of blind spots for LLM reasoning. Experimental results on\nmultiple levels of reasoning tasks demonstrate the efficacy of TSE. We also\nconduct extensive analysis to understand how structured and expansive thought\ncan contribute to unleashing the potential of LLM reasoning capabilities.\n","authors":["Jinghan Zhang","Fengran Mo","Xiting Wang","Kunpeng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.24155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24151v1","updated":"2024-10-31T17:09:55Z","published":"2024-10-31T17:09:55Z","title":"Scaling Concept With Text-Guided Diffusion Models","summary":" Text-guided diffusion models have revolutionized generative tasks by\nproducing high-fidelity content from text descriptions. They have also enabled\nan editing paradigm where concepts can be replaced through text conditioning\n(e.g., a dog to a tiger). In this work, we explore a novel approach: instead of\nreplacing a concept, can we enhance or suppress the concept itself? Through an\nempirical study, we identify a trend where concepts can be decomposed in\ntext-guided diffusion models. Leveraging this insight, we introduce\nScalingConcept, a simple yet effective method to scale decomposed concepts up\nor down in real input without introducing new elements. To systematically\nevaluate our approach, we present the WeakConcept-10 dataset, where concepts\nare imperfect and need to be enhanced. More importantly, ScalingConcept enables\na variety of novel zero-shot applications across image and audio domains,\nincluding tasks such as canonical pose generation and generative sound\nhighlighting or removal.\n","authors":["Chao Huang","Susan Liang","Yunlong Tang","Yapeng Tian","Anurag Kumar","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24151v1.pdf","comment":"Project page: https://wikichao.github.io/ScalingConcept/"},{"id":"http://arxiv.org/abs/2406.17947v2","updated":"2024-10-31T17:08:00Z","published":"2024-06-25T21:47:53Z","title":"Do they mean 'us'? Interpreting Referring Expressions in Intergroup Bias","summary":" The variations between in-group and out-group speech (intergroup bias) are\nsubtle and could underlie many social phenomena like stereotype perpetuation\nand implicit bias. In this paper, we model the intergroup bias as a tagging\ntask on English sports comments from forums dedicated to fandom for NFL teams.\nWe curate a unique dataset of over 6 million game-time comments from opposing\nperspectives (the teams in the game), each comment grounded in a non-linguistic\ndescription of the events that precipitated these comments (live win\nprobabilities for each team). Expert and crowd annotations justify modeling the\nbias through tagging of implicit and explicit referring expressions and reveal\nthe rich, contextual understanding of language and the world required for this\ntask. For large-scale analysis of intergroup variation, we use LLMs for\nautomated tagging, and discover that some LLMs perform best when prompted with\nlinguistic descriptions of the win probability at the time of the comment,\nrather than numerical probability. Further, large-scale tagging of comments\nusing LLMs uncovers linear variations in the form of referent across win\nprobabilities that distinguish in-group and out-group utterances. 
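For the ScalingConcept entry above (arXiv 2410.24151), a sketch of the scaling idea in classifier-free-guidance style: treat the difference between noise predictions with and without the concept as the concept direction, then scale it up or down. Whether this matches the paper's exact formulation is an assumption; the arrays are toys standing in for diffusion-model outputs.

```python
# Sketch of concept scaling over decomposed noise predictions: w > 1
# enhances the concept, 0 < w < 1 suppresses it, w = 1 is the identity.
import numpy as np

def scale_concept(eps_base: np.ndarray, eps_concept: np.ndarray, w: float):
    """eps_base: prediction without the concept; eps_concept: with it."""
    return eps_base + w * (eps_concept - eps_base)

rng = np.random.default_rng(0)
eps_base = rng.normal(size=(4, 4))        # toy stand-ins for model outputs
eps_concept = rng.normal(size=(4, 4))
enhanced = scale_concept(eps_base, eps_concept, w=1.8)    # concept scaled up
suppressed = scale_concept(eps_base, eps_concept, w=0.3)  # concept scaled down
```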
Code and data\nare available at https://github.com/venkatasg/intergroup-nfl .\n","authors":["Venkata S Govindarajan","Matianyu Zang","Kyle Mahowald","David Beaver","Junyi Jessy Li"],"pdf_url":"https://arxiv.org/pdf/2406.17947v2.pdf","comment":"Accepted to Findings@EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.24140v1","updated":"2024-10-31T17:03:44Z","published":"2024-10-31T17:03:44Z","title":"Don't Touch My Diacritics","summary":" The common practice of preprocessing text before feeding it into NLP models\nintroduces many decision points which have unintended consequences on model\nperformance. In this opinion piece, we focus on the handling of diacritics in\ntexts originating in many languages and scripts. We demonstrate, through\nseveral case studies, the adverse effects of inconsistent encoding of\ndiacritized characters and of removing diacritics altogether. We call on the\ncommunity to adopt simple but necessary steps across all models and toolkits in\norder to improve handling of diacritized text and, by extension, increase\nequity in multilingual NLP.\n","authors":["Kyle Gorman","Yuval Pinter"],"pdf_url":"https://arxiv.org/pdf/2410.24140v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2402.18551v2","updated":"2024-10-31T17:01:45Z","published":"2024-02-28T18:34:53Z","title":"Implicit Optimization Bias of Next-Token Prediction in Linear Models","summary":" We initiate an investigation into the optimization properties of next-token\nprediction (NTP), the dominant training paradigm for modern language models.\nSpecifically, we study the structural properties of the solutions selected by\ngradient-based optimizers among the many possible minimizers of the NTP\nobjective. By framing NTP as cross-entropy minimization across distinct\ncontexts, each tied with a sparse conditional probability distribution across a\nfinite vocabulary of tokens, we introduce \"NTP-separability conditions\" that\nenable reaching the data-entropy lower bound. With this setup, and focusing on\nlinear models with fixed context embeddings, we characterize the optimization\nbias of gradient descent (GD): Within the data subspace defined by the sparsity\npatterns of distinct contexts, GD selects parameters that equate the logits'\ndifferences of in-support tokens to their log-odds. In the orthogonal subspace,\nthe GD parameters diverge in norm and select the direction that maximizes a\nmargin specific to NTP. These findings extend previous research on implicit\nbias in one-hot classification to the NTP setting, highlighting key differences\nand prompting further research into the optimization and generalization\nproperties of NTP, irrespective of the specific architecture used to generate\nthe context embeddings.\n","authors":["Christos Thrampoulidis"],"pdf_url":"https://arxiv.org/pdf/2402.18551v2.pdf","comment":"v2: fixed typos and writing in various parts; updated figures and\n future-work section"},{"id":"http://arxiv.org/abs/2401.12794v3","updated":"2024-10-31T16:58:51Z","published":"2024-01-23T14:29:17Z","title":"Benchmarking LLMs via Uncertainty Quantification","summary":" The proliferation of open-source Large Language Models (LLMs) from various\ninstitutions has highlighted the urgent need for comprehensive evaluation\nmethods. However, current evaluation platforms, such as the widely recognized\nHuggingFace open LLM leaderboard, neglect a crucial aspect -- uncertainty,\nwhich is vital for thoroughly assessing LLMs. 
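For the diacritics entry above (arXiv 2410.24140), a concrete instance of the inconsistency it warns about: the same surface string can be stored as one precomposed codepoint (NFC) or as a base letter plus a combining accent (NFD), and naive string comparisons then fail. The snippet uses only the standard library.

```python
# The same visible text, two different codepoint sequences.
import unicodedata

nfc = "caf\u00e9"                             # 'é' as a single codepoint
nfd = unicodedata.normalize("NFD", nfc)       # 'e' + U+0301 combining accent

print(nfc == nfd)                             # False: same text, different bytes
print(len(nfc), len(nfd))                     # 4 5
print(unicodedata.normalize("NFC", nfd) == nfc)   # True once normalized

# Stripping diacritics entirely, another practice the paper cautions against:
stripped = "".join(c for c in nfd if not unicodedata.combining(c))
print(stripped)                               # 'cafe'
```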
To bridge this gap, we introduce\na new benchmarking approach for LLMs that integrates uncertainty\nquantification. Our examination involves nine LLMs (LLM series) spanning five\nrepresentative natural language processing tasks. Our findings reveal that: I)\nLLMs with higher accuracy may exhibit lower certainty; II) Larger-scale LLMs\nmay display greater uncertainty compared to their smaller counterparts; and\nIII) Instruction-finetuning tends to increase the uncertainty of LLMs. These\nresults underscore the significance of incorporating uncertainty in the\nevaluation of LLMs.\n","authors":["Fanghua Ye","Mingming Yang","Jianhui Pang","Longyue Wang","Derek F. Wong","Emine Yilmaz","Shuming Shi","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2401.12794v3.pdf","comment":"30 pages, accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.12306v2","updated":"2024-10-31T16:49:59Z","published":"2024-09-18T20:33:54Z","title":"Measuring Sound Symbolism in Audio-visual Models","summary":" Audio-visual pre-trained models have gained substantial attention recently\nand demonstrated superior performance on various audio-visual tasks. This study\ninvestigates whether pre-trained audio-visual models demonstrate non-arbitrary\nassociations between sounds and visual representations$\\unicode{x2013}$known as\nsound symbolism$\\unicode{x2013}$which is also observed in humans. We developed\na specialized dataset with synthesized images and audio samples and assessed\nthese models using a non-parametric approach in a zero-shot setting. Our\nfindings reveal a significant correlation between the models' outputs and\nestablished patterns of sound symbolism, particularly in models trained on\nspeech data. These results suggest that such models can capture sound-meaning\nconnections akin to human language processing, providing insights into both\ncognitive architectures and machine learning strategies.\n","authors":["Wei-Cheng Tseng","Yi-Jen Shih","David Harwath","Raymond Mooney"],"pdf_url":"https://arxiv.org/pdf/2409.12306v2.pdf","comment":"Errors in the introduction part that might potentially affect the\n integrity of the paper. Withdraw at the point. Will replace with an updated\n version in the future"},{"id":"http://arxiv.org/abs/2406.04823v2","updated":"2024-10-31T16:48:51Z","published":"2024-06-07T10:48:45Z","title":"BERTs are Generative In-Context Learners","summary":" While in-context learning is commonly associated with causal language models,\nsuch as GPT, we demonstrate that this capability also 'emerges' in masked\nlanguage models. Through an embarrassingly simple inference technique, we\nenable an existing masked model, DeBERTa, to perform generative tasks without\nadditional training or architectural changes. Our evaluation reveals that the\nmasked and causal language models behave very differently, as they clearly\noutperform each other on different categories of tasks. 
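For the "BERTs are Generative In-Context Learners" entry above (arXiv 2406.04823), a hedged sketch of generative decoding with a masked language model: repeatedly append a mask token, let the model fill it, and keep the prediction. The paper's exact inference trick may differ; `predict_mask_token` is a hypothetical stand-in for one masked-LM forward pass returning the top token at the mask position.

```python
# Hedged sketch: greedy left-to-right generation from a masked LM by
# iterative single-mask filling.
from typing import Callable, List

def generate_with_mlm(prompt_tokens: List[str],
                      predict_mask_token: Callable[[List[str]], str],
                      max_new_tokens: int = 20,
                      eos: str = "[SEP]") -> List[str]:
    tokens = list(prompt_tokens)
    for _ in range(max_new_tokens):
        # One forward pass with a trailing [MASK]; keep its top prediction.
        next_token = predict_mask_token(tokens + ["[MASK]"])
        if next_token == eos:
            break
        tokens.append(next_token)
    return tokens
```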
These complementary\nstrengths suggest that the field's focus on causal models for in-context\nlearning may be limiting - both architectures can develop these capabilities,\nbut with distinct advantages; pointing toward promising hybrid approaches that\ncombine the strengths of both objectives.\n","authors":["David Samuel"],"pdf_url":"https://arxiv.org/pdf/2406.04823v2.pdf","comment":"26 pages, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24114v1","updated":"2024-10-31T16:44:10Z","published":"2024-10-31T16:44:10Z","title":"Nearest Neighbor Normalization Improves Multimodal Retrieval","summary":" Multimodal models leverage large-scale pre-training to achieve strong but\nstill imperfect performance on tasks such as image captioning, visual question\nanswering, and cross-modal retrieval. In this paper, we present a simple and\nefficient method for correcting errors in trained contrastive image-text\nretrieval models with no additional training, called Nearest Neighbor\nNormalization (NNN). We show an improvement on retrieval metrics in both text\nretrieval and image retrieval for all of the contrastive models that we tested\n(CLIP, BLIP, ALBEF, SigLIP, BEiT) and for both of the datasets that we used\n(MS-COCO and Flickr30k). NNN requires a reference database, but does not\nrequire any training on this database, and can even increase the retrieval\naccuracy of a model after finetuning.\n","authors":["Neil Chowdhury","Franklin Wang","Sumedh Shenoy","Douwe Kiela","Sarah Schwettmann","Tristan Thrush"],"pdf_url":"https://arxiv.org/pdf/2410.24114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24087v1","updated":"2024-10-31T16:20:04Z","published":"2024-10-31T16:20:04Z","title":"In-Context Fine-Tuning for Time-Series Foundation Models","summary":" Motivated by the recent success of time-series foundation models for\nzero-shot forecasting, we present a methodology for $\\textit{in-context\nfine-tuning}$ of a time-series foundation model. In particular, we design a\npretrained foundation model that can be prompted (at inference time) with\nmultiple time-series examples, in order to forecast a target time-series into\nthe future. Our foundation model is specifically trained to utilize examples\nfrom multiple related time-series in its context window (in addition to the\nhistory of the target time-series) to help it adapt to the specific\ndistribution of the target domain at inference time. We show that such a\nfoundation model that uses in-context examples at inference time can obtain\nmuch better performance on popular forecasting benchmarks compared to\nsupervised deep learning methods, statistical models, as well as other\ntime-series foundation models. Interestingly, our in-context fine-tuning\napproach even rivals the performance of a foundation model that is explicitly\nfine-tuned on the target domain.\n","authors":["Abhimanyu Das","Matthew Faw","Rajat Sen","Yichen Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.24087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19238v2","updated":"2024-10-31T16:06:22Z","published":"2024-06-27T15:01:53Z","title":"Revealing Fine-Grained Values and Opinions in Large Language Models","summary":" Uncovering latent values and opinions embedded in large language models\n(LLMs) can help identify biases and mitigate potential harm. 
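For the Nearest Neighbor Normalization entry above (arXiv 2410.24114), a hedged sketch of a training-free, reference-based score correction in its spirit: debias each gallery item's query similarity by the mean of its top-k similarities to a reference query set, so items that are close to everything get demoted. Treat the exact scoring rule as an assumption rather than the paper's formula.

```python
# Hedged sketch of nearest-neighbor score debiasing for contrastive
# retrieval; requires a reference query set but no training.
import numpy as np

def nnn_scores(query, gallery, reference_queries, k=16, alpha=1.0):
    """query: (d,); gallery: (N, d); reference_queries: (M, d); L2-normalized."""
    raw = gallery @ query                        # cosine similarities, (N,)
    ref_sims = gallery @ reference_queries.T     # (N, M)
    topk = np.sort(ref_sims, axis=1)[:, -k:]     # each item's k nearest refs
    bias = topk.mean(axis=1)                     # per-item "hubness" estimate
    return raw - alpha * bias                    # demote universally-close items

rng = np.random.default_rng(0)
norm = lambda x: x / np.linalg.norm(x, axis=-1, keepdims=True)
q = norm(rng.normal(size=64))
G = norm(rng.normal(size=(100, 64)))
R = norm(rng.normal(size=(500, 64)))
print(int(np.argmax(nnn_scores(q, G, R))))       # index of the top match
```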
Recently, this has\nbeen approached by prompting LLMs with survey questions and quantifying the\nstances in the outputs towards morally and politically charged statements.\nHowever, the stances generated by LLMs can vary greatly depending on how they\nare prompted, and there are many ways to argue for or against a given position.\nIn this work, we propose to address this by analysing a large and robust\ndataset of 156k LLM responses to the 62 propositions of the Political Compass\nTest (PCT) generated by 6 LLMs using 420 prompt variations. We perform\ncoarse-grained analysis of their generated stances and fine-grained analysis of\nthe plain text justifications for those stances. For fine-grained analysis, we\npropose to identify tropes in the responses: semantically similar phrases that\nare recurrent and consistent across different prompts, revealing natural\npatterns in the text that a given LLM is prone to produce. We find that\ndemographic features added to prompts significantly affect outcomes on the PCT,\nreflecting bias, as well as disparities between the results of tests when\neliciting closed-form vs. open domain responses. Additionally, patterns in the\nplain text rationales via tropes show that similar justifications are\nrepeatedly generated across models and prompts even with disparate stances.\n","authors":["Dustin Wright","Arnav Arora","Nadav Borenstein","Srishti Yadav","Serge Belongie","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2406.19238v2.pdf","comment":"Findings of EMNLP 2024; 28 pages, 20 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.05977v2","updated":"2024-10-31T16:01:59Z","published":"2024-09-09T18:21:28Z","title":"Mathematical Formalized Problem Solving and Theorem Proving in Different\n Fields in Lean 4","summary":" Using computerized verifiable formal languages like Lean 4 to prove\nmathematical theorems has a significant impact on mathematical formalization.\nLean 4 offers prominent potential for advancing mathematical reasoning.\nHowever, existing efforts are limited to mathematical formalization languages\nin substantial online corpora and are dedicated to keeping pace with rapidly\nevolving languages. To bridge the gap between the traditional and computerized\nproof, my approach to formalizing theorem proving involves generating formal\nsteps and complete proofs using Large Language Models (LLMs) based on Natural\nLanguage (NL) proofs. The method is to introduce the basic structure and\ntactics in general, determine how AI can assist the mathematical formalization\nprocess to improve its performance, and give examples of solving problems in\nLean 4 comparing to NL, mainly in IMO, and a sample theorem proving in abstract\nalgebra.\n","authors":["Xichen Tang"],"pdf_url":"https://arxiv.org/pdf/2409.05977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02119v3","updated":"2024-10-31T15:57:42Z","published":"2023-12-04T18:49:23Z","title":"Tree of Attacks: Jailbreaking Black-Box LLMs Automatically","summary":" While Large Language Models (LLMs) display versatile functionality, they\ncontinue to generate harmful, biased, and toxic content, as demonstrated by the\nprevalence of human-designed jailbreaks. In this work, we present Tree of\nAttacks with Pruning (TAP), an automated method for generating jailbreaks that\nonly requires black-box access to the target LLM. TAP utilizes an attacker LLM\nto iteratively refine candidate (attack) prompts until one of the refined\nprompts jailbreaks the target. 
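For the Tree of Attacks (TAP) entry above (arXiv 2312.02119), a hedged sketch of the branch-prune-query loop the abstract describes, including the pruning of off-topic candidates before any target query is spent. `attacker`, `on_topic`, `target`, and `judge` are hypothetical callables, and the 1-10 judge scale is an assumption.

```python
# Hedged sketch of a tree-of-attacks loop: branch with an attacker LLM,
# prune before querying the target, keep a bounded frontier.
from typing import Callable, List, Optional

def tap_attack(goal: str, attacker: Callable, on_topic: Callable,
               target: Callable, judge: Callable,
               depth: int = 5, branch: int = 4, width: int = 10) -> Optional[str]:
    frontier: List[str] = [goal]
    for _ in range(depth):
        # Branch: the attacker LLM refines each candidate prompt.
        children = [attacker(goal, p) for p in frontier for _ in range(branch)]
        # Prune BEFORE querying the target, saving target-model queries.
        children = [p for p in children if on_topic(goal, p)]
        scored = [(judge(goal, p, target(p)), p) for p in children]
        for score, prompt in scored:
            if score >= 10:                    # judge deems it a jailbreak
                return prompt
        scored.sort(reverse=True)              # keep the top-`width` candidates
        frontier = [p for _, p in scored[:width]]
    return None
```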
In addition, before sending prompts to the\ntarget, TAP assesses them and prunes the ones unlikely to result in jailbreaks,\nreducing the number of queries sent to the target LLM. In empirical\nevaluations, we observe that TAP generates prompts that jailbreak\nstate-of-the-art LLMs (including GPT4-Turbo and GPT4o) for more than 80% of the\nprompts. This significantly improves upon the previous state-of-the-art\nblack-box methods for generating jailbreaks while using a smaller number of\nqueries than them. Furthermore, TAP is also capable of jailbreaking LLMs\nprotected by state-of-the-art guardrails, e.g., LlamaGuard.\n","authors":["Anay Mehrotra","Manolis Zampetakis","Paul Kassianik","Blaine Nelson","Hyrum Anderson","Yaron Singer","Amin Karbasi"],"pdf_url":"https://arxiv.org/pdf/2312.02119v3.pdf","comment":"Accepted for presentation at NeurIPS 2024. Code:\n https://github.com/RICommunity/TAP"},{"id":"http://arxiv.org/abs/2402.01093v2","updated":"2024-10-31T15:56:08Z","published":"2024-02-02T01:45:18Z","title":"Need a Small Specialized Language Model? Plan Early!","summary":" Large language models are versatile tools but are not suitable for small\ninference budgets. Small models have more efficient inference, but their lower\ncapacity means that their performance can be good only if one limits their\nscope to a specialized domain. This paper explores how to get good specialized\nsmall language models using a large, generic, pretraining set and a limited\namount of specialized data. We consider two scenarios, depending on whether (i)\none can afford pretraining a model for each specialization task, or (ii) one\nwants to cheaply adapt a single pretrained model for each task. In the first\nscenario, we propose an effective solution based on importance sampling: we\nresample the pretraining set to imitate the specialization data and train a\nsmall model on it. In the second scenario, we propose a novel architecture,\nprojected networks (PN). PN is a large network whose parameters can be linearly\nprojected into a small network for specialization. For both scenarios, we\ndemonstrate the empirical effectiveness of our solutions across various\ndomains, training set sizes, and training budgets.\n","authors":["David Grangier","Angelos Katharopoulos","Pierre Ablin","Awni Hannun"],"pdf_url":"https://arxiv.org/pdf/2402.01093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24049v1","updated":"2024-10-31T15:45:23Z","published":"2024-10-31T15:45:23Z","title":"Desert Camels and Oil Sheikhs: Arab-Centric Red Teaming of Frontier LLMs","summary":" Large language models (LLMs) are widely used but raise ethical concerns due\nto embedded social biases. This study examines LLM biases against Arabs versus\nWesterners across eight domains, including women's rights, terrorism, and\nanti-Semitism and assesses model resistance to perpetuating these biases. To\nthis end, we create two datasets: one to evaluate LLM bias toward Arabs versus\nWesterners and another to test model safety against prompts that exaggerate\nnegative traits (\"jailbreaks\"). We evaluate six LLMs -- GPT-4, GPT-4o, LlaMA\n3.1 (8B & 405B), Mistral 7B, and Claude 3.5 Sonnet. We find 79% of cases\ndisplaying negative biases toward Arabs, with LlaMA 3.1-405B being the most\nbiased. Our jailbreak tests reveal GPT-4o as the most vulnerable, despite being\nan optimized version, followed by LlaMA 3.1-8B and Mistral 7B. All LLMs except\nClaude exhibit attack success rates above 87% in three categories. 
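For the "Plan Early" entry above (arXiv 2402.01093), a hedged sketch of its first-scenario recipe: resample the generic pretraining set by importance weights so it imitates the specialization data. `logp_spec` and `logp_gen` are hypothetical per-document log-likelihood functions from a specialist and a generalist model.

```python
# Hedged sketch of importance-sampling a generic corpus toward a target
# domain: weight each document by the specialist/generalist likelihood
# ratio, then resample with replacement.
import numpy as np
from typing import Callable, List

def resample_corpus(docs: List[str],
                    logp_spec: Callable[[str], float],
                    logp_gen: Callable[[str], float],
                    n_samples: int, seed: int = 0) -> List[str]:
    log_w = np.array([logp_spec(d) - logp_gen(d) for d in docs])
    w = np.exp(log_w - log_w.max())           # stabilized importance weights
    probs = w / w.sum()
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(docs), size=n_samples, replace=True, p=probs)
    return [docs[i] for i in idx]             # a pretraining set that imitates the domain
```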
We also find\nClaude 3.5 Sonnet the safest, but it still displays biases in seven of eight\ncategories. Despite being an optimized version of GPT-4, we find GPT-4o to be\nmore prone to biases and jailbreaks, suggesting optimization flaws. Our\nfindings underscore the pressing need for more robust bias mitigation\nstrategies and strengthened security measures in LLMs.\n","authors":["Muhammed Saeed","Elgizouli Mohamed","Mukhtar Mohamed","Shaina Raza","Shady Shehata","Muhammad Abdul-Mageed"],"pdf_url":"https://arxiv.org/pdf/2410.24049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24032v1","updated":"2024-10-31T15:30:55Z","published":"2024-10-31T15:30:55Z","title":"Navigating the Unknown: A Chat-Based Collaborative Interface for\n Personalized Exploratory Tasks","summary":" The rise of large language models (LLMs) has revolutionized user interactions\nwith knowledge-based systems, enabling chatbots to synthesize vast amounts of\ninformation and assist with complex, exploratory tasks. However, LLM-based\nchatbots often struggle to provide personalized support, particularly when\nusers start with vague queries or lack sufficient contextual information. This\npaper introduces the Collaborative Assistant for Personalized Exploration\n(CARE), a system designed to enhance personalization in exploratory tasks by\ncombining a multi-agent LLM framework with a structured user interface. CARE's\ninterface consists of a Chat Panel, Solution Panel, and Needs Panel, enabling\niterative query refinement and dynamic solution generation. The multi-agent\nframework collaborates to identify both explicit and implicit user needs,\ndelivering tailored, actionable solutions. In a within-subject user study with\n22 participants, CARE was consistently preferred over a baseline LLM chatbot,\nwith users praising its ability to reduce cognitive load, inspire creativity,\nand provide more tailored solutions. Our findings highlight CARE's potential to\ntransform LLM-based systems from passive information retrievers to proactive\npartners in personalized problem-solving and exploration.\n","authors":["Yingzhe Peng","Xiaoting Qin","Zhiyang Zhang","Jue Zhang","Qingwei Lin","Xu Yang","Dongmei Zhang","Saravan Rajmohan","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.24032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24029v1","updated":"2024-10-31T15:28:26Z","published":"2024-10-31T15:28:26Z","title":"Joint Training for Selective Prediction","summary":" Classifier models are prevalent in natural language processing (NLP), often\nwith high accuracy. Yet in real world settings, human-in-the-loop systems can\nfoster trust in model outputs and even higher performance. Selective Prediction\n(SP) methods determine when to adopt a classifier's output versus defer to a\nhuman. Previous SP approaches have addressed how to improve softmax as a\nmeasure of model confidence, or have developed separate confidence estimators.\nOne previous method involves learning a deferral model based on engineered\nfeatures. We introduce a novel joint-training approach that simultaneously\noptimizes learned representations used by the classifier module and a learned\ndeferral policy. Our results on four classification tasks demonstrate that\njoint training not only leads to better SP outcomes over two strong baselines,\nbut also improves the performance of both modules.\n","authors":["Zhaohui Li","Rebecca J. 
Passonneau"],"pdf_url":"https://arxiv.org/pdf/2410.24029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24022v1","updated":"2024-10-31T15:22:03Z","published":"2024-10-31T15:22:03Z","title":"SFM-Protein: Integrative Co-evolutionary Pre-training for Advanced\n Protein Sequence Representation","summary":" Proteins, essential to biological systems, perform functions intricately\nlinked to their three-dimensional structures. Understanding the relationship\nbetween protein structures and their amino acid sequences remains a core\nchallenge in protein modeling. While traditional protein foundation models\nbenefit from pre-training on vast unlabeled datasets, they often struggle to\ncapture critical co-evolutionary information, which evolutionary-based methods\nexcel at. In this study, we introduce a novel pre-training strategy for protein\nfoundation models that emphasizes the interactions among amino acid residues to\nenhance the extraction of both short-range and long-range co-evolutionary\nfeatures from sequence data. Trained on a large-scale protein sequence dataset,\nour model demonstrates superior generalization ability, outperforming\nestablished baselines of similar size, including the ESM model, across diverse\ndownstream tasks. Experimental results confirm the model's effectiveness in\nintegrating co-evolutionary information, marking a significant step forward in\nprotein sequence-based modeling.\n","authors":["Liang He","Peiran Jin","Yaosen Min","Shufang Xie","Lijun Wu","Tao Qin","Xiaozhuan Liang","Kaiyuan Gao","Yuliang Jiang","Tie-Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2410.24022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24021v1","updated":"2024-10-31T15:21:27Z","published":"2024-10-31T15:21:27Z","title":"Detecting text level intellectual influence with knowledge graph\n embeddings","summary":" Introduction: Tracing the spread of ideas and the presence of influence is a\nquestion of special importance across a wide range of disciplines, ranging from\nintellectual history to cultural analytics, computational social science, and\nthe science of science.\n Method: We collect a corpus of open source journal articles, generate\nKnowledge Graph representations using the Gemini LLM, and attempt to predict\nthe existence of citations between sampled pairs of articles using previously\npublished methods and a novel Graph Neural Network based embedding model.\n Results: We demonstrate that our knowledge graph embedding method is superior\nat distinguishing pairs of articles with and without citation. Once trained, it\nruns efficiently and can be fine-tuned on specific corpora to suit individual\nresearcher needs.\n Conclusion(s): This experiment demonstrates that the relationships encoded in\na knowledge graph, especially the types of concepts brought together by\nspecific relations can encode information capable of revealing intellectual\ninfluence. 
This suggests that further work in analyzing document level\nknowledge graphs to understand latent structures could provide valuable\ninsights.\n","authors":["Lucian Li","Eryclis Silva"],"pdf_url":"https://arxiv.org/pdf/2410.24021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24019v1","updated":"2024-10-31T15:20:50Z","published":"2024-10-31T15:20:50Z","title":"Speech is More Than Words: Do Speech-to-Text Translation Systems\n Leverage Prosody?","summary":" The prosody of a spoken utterance, including features like stress, intonation\nand rhythm, can significantly affect the underlying semantics, and as a\nconsequence can also affect its textual translation. Nevertheless, prosody is\nrarely studied within the context of speech-to-text translation (S2TT) systems.\nIn particular, end-to-end (E2E) systems have been proposed as well-suited for\nprosody-aware translation because they have direct access to the speech signal\nwhen making translation decisions, but the understanding of whether this is\nsuccessful in practice is still limited. A main challenge is the difficulty of\nevaluating prosody awareness in translation. To address this challenge, we\nintroduce an evaluation methodology and a focused benchmark (named ContraProST)\naimed at capturing a wide range of prosodic phenomena. Our methodology uses\nlarge language models and controllable text-to-speech (TTS) to generate\ncontrastive examples. Through experiments in translating English speech into\nGerman, Spanish, and Japanese, we find that (a) S2TT models possess some\ninternal representation of prosody, but the prosody signal is often not strong\nenough to affect the translations, (b) E2E systems outperform cascades of\nspeech recognition and text translation systems, confirming their theoretical\nadvantage in this regard, and (c) certain cascaded systems also capture\nprosodic information in the translation, but only to a lesser extent that\ndepends on the particulars of the transcript's surface form.\n","authors":["Ioannis Tsiamas","Matthias Sperber","Andrew Finch","Sarthak Garg"],"pdf_url":"https://arxiv.org/pdf/2410.24019v1.pdf","comment":"WMT 2024"},{"id":"http://arxiv.org/abs/2409.17692v2","updated":"2024-10-31T14:38:27Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. 
Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v2.pdf","comment":"Technical Report. Codes and models are available in\n https://github.com/MIO-Team/MIO"},{"id":"http://arxiv.org/abs/2405.19534v4","updated":"2024-10-31T14:32:28Z","published":"2024-05-29T21:29:44Z","title":"Preference Learning Algorithms Do Not Learn Preference Rankings","summary":" Preference learning algorithms (e.g., RLHF and DPO) are frequently used to\nsteer LLMs to produce generations that are more preferred by humans, but our\nunderstanding of their inner workings is still limited. In this work, we study\nthe conventional wisdom that preference learning trains models to assign higher\nlikelihoods to more preferred outputs than less preferred outputs, measured via\nranking accuracy. Surprisingly, we find that most state-of-the-art\npreference-tuned models achieve a ranking accuracy of less than 60% on common\npreference datasets. We furthermore derive the idealized ranking accuracy that\na preference-tuned LLM would achieve if it optimized the DPO or RLHF objective\nperfectly. We demonstrate that existing models exhibit a significant alignment\ngap -- i.e., a gap between the observed and idealized ranking accuracies. We\nattribute this discrepancy to the DPO objective, which is empirically and\ntheoretically ill-suited to fix even mild ranking errors in the reference\nmodel, and derive a simple and efficient formula for quantifying the difficulty\nof learning a given preference datapoint. Finally, we demonstrate that ranking\naccuracy strongly correlates with the empirically popular win rate metric when\nthe model is close to the reference model used in the objective, shedding\nfurther light on the differences between on-policy (e.g., RLHF) and off-policy\n(e.g., DPO) preference learning algorithms.\n","authors":["Angelica Chen","Sadhika Malladi","Lily H. Zhang","Xinyi Chen","Qiuyi Zhang","Rajesh Ranganath","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2405.19534v4.pdf","comment":"NeurIPS 2024 camera-ready"},{"id":"http://arxiv.org/abs/2407.00114v2","updated":"2024-10-31T14:27:50Z","published":"2024-06-27T13:46:11Z","title":"OmniJARVIS: Unified Vision-Language-Action Tokenization Enables\n Open-World Instruction Following Agents","summary":" This paper presents OmniJARVIS, a novel Vision-Language-Action (VLA) model\nfor open-world instruction-following agents in Minecraft. Compared to prior\nworks that either emit textual goals to separate controllers or produce the\ncontrol command directly, OmniJARVIS seeks a different path to ensure both\nstrong reasoning and efficient decision-making capabilities via unified\ntokenization of multimodal interaction data. 
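For the "Preference Learning Algorithms Do Not Learn Preference Rankings" entry above (arXiv 2405.19534), a small sketch of the metric it studies: ranking accuracy, the fraction of preference pairs where the model assigns a higher sequence log-likelihood to the chosen response than to the rejected one. The log-probability arrays here are toy stand-ins for per-pair model scores.

```python
# Ranking accuracy over preference pairs; each array entry is one sequence
# log-likelihood under the policy being evaluated.
import numpy as np

def ranking_accuracy(logp_chosen: np.ndarray, logp_rejected: np.ndarray) -> float:
    return float(np.mean(logp_chosen > logp_rejected))

rng = np.random.default_rng(0)
logp_chosen = rng.normal(loc=-52.0, scale=5.0, size=1000)    # toy scores
logp_rejected = rng.normal(loc=-54.0, scale=5.0, size=1000)
print(ranking_accuracy(logp_chosen, logp_rejected))          # about 0.61 here
```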
First, we introduce a\nself-supervised approach to learn a behavior encoder that produces discretized\ntokens for behavior trajectories $\\tau = \\{o_0, a_0, \\dots\\}$ and an imitation\nlearning policy decoder conditioned on these tokens. These additional behavior\ntokens are added to the vocabulary of pretrained Multimodal Language\nModels. With this encoder, we then pack long-term multimodal interactions\ninvolving task instructions, memories, thoughts, observations, textual\nresponses, behavior trajectories, etc. into unified token sequences and model\nthem with autoregressive transformers. Thanks to the semantically meaningful\nbehavior tokens, the resulting VLA model, OmniJARVIS, can reason (by producing\nchains of thought), plan, answer questions, and act (by producing behavior\ntokens for the imitation learning policy decoder). OmniJARVIS demonstrates\nexcellent performance on a comprehensive collection of atomic, programmatic,\nand open-ended tasks in open-world Minecraft. Our analysis further unveils the\ncrucial design principles in interaction data formation, unified tokenization,\nand its scaling potential. The dataset, models, and code will be released at\nhttps://craftjarvis.org/OmniJARVIS.\n","authors":["Zihao Wang","Shaofei Cai","Zhancun Mu","Haowei Lin","Ceyao Zhang","Xuejie Liu","Qing Li","Anji Liu","Xiaojian Ma","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2407.00114v2.pdf","comment":"accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.14808v2","updated":"2024-10-31T14:19:49Z","published":"2024-05-23T17:18:46Z","title":"Implicit Personalization in Language Models: A Systematic Study","summary":" Implicit Personalization (IP) is a phenomenon of language models inferring a\nuser's background from the implicit cues in the input prompts and tailoring the\nresponse based on this inference. While previous work has touched upon various\ninstances of this problem, a unified framework for studying this\nbehavior has been lacking. This work systematically studies IP through a rigorous mathematical\nformulation, a multi-perspective moral reasoning framework, and a set of case\nstudies. Our theoretical foundation for IP relies on a structural causal model\nand introduces a novel method, indirect intervention, to estimate the causal\neffect of a mediator variable that cannot be directly intervened upon. Beyond\nthe technical approach, we also introduce a set of moral reasoning principles\nbased on three schools of moral philosophy to study when IP may or may not be\nethically appropriate. Equipped with both mathematical and ethical insights, we\npresent three diverse case studies illustrating the varied nature of the IP\nproblem and offer recommendations for future research. Our code is at\nhttps://github.com/jiarui-liu/IP, and our data is at\nhttps://huggingface.co/datasets/Jerry999/ImplicitPersonalizationData.\n","authors":["Zhijing Jin","Nils Heil","Jiarui Liu","Shehzaad Dhuliawala","Yahang Qi","Bernhard Schölkopf","Rada Mihalcea","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2405.14808v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2410.10476v2","updated":"2024-10-31T14:15:49Z","published":"2024-10-14T13:10:45Z","title":"Will LLMs Replace the Encoder-Only Models in Temporal Relation\n Classification?","summary":" The automatic detection of temporal relations among events has been mainly\ninvestigated with encoder-only models such as RoBERTa. 
Large Language Models\n(LLMs) have recently shown promising performance in temporal reasoning tasks\nsuch as temporal question answering. Nevertheless, recent studies have tested\nthe performance of closed-source LLMs only on detecting temporal relations,\nlimiting the interpretability of those results. In this work, we\ninvestigate LLMs' performance and decision process in the Temporal Relation\nClassification task. First, we assess the performance of seven open- and\nclosed-source LLMs, experimenting with in-context learning and lightweight\nfine-tuning approaches. Results show that LLMs with in-context learning\nsignificantly underperform smaller encoder-only models based on RoBERTa. Then,\nwe delve into the possible reasons for this gap by applying explainable\nmethods. The outcome suggests a limitation of LLMs in this task due to their\nautoregressive nature, which causes them to focus only on the last part of the\nsequence. Additionally, we evaluate the word embeddings of these two models to\nbetter understand their pre-training differences. The code and the fine-tuned\nmodels are available on GitHub.\n","authors":["Gabriel Roccabruna","Massimo Rizzoli","Giuseppe Riccardi"],"pdf_url":"https://arxiv.org/pdf/2410.10476v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23956v1","updated":"2024-10-31T14:09:50Z","published":"2024-10-31T14:09:50Z","title":"Multilingual Pretraining Using a Large Corpus Machine-Translated from a\n Single Source Language","summary":" English, as a very high-resource language, enables the pretraining of\nhigh-quality large language models (LLMs). The same cannot be said for most\nother languages, as leading LLMs still underperform for non-English languages,\nlikely due to a gap in the quality and diversity of the available multilingual\npretraining corpora. In this work, we find that machine-translated text from a\nsingle high-quality source language can contribute significantly to the\npretraining of multilingual LLMs. We translate FineWeb-Edu, a high-quality\nEnglish web dataset, into French, German, and Spanish, resulting in a final\n300B-token dataset, which we call TransWeb-Edu, and pretrain a 1.3B-parameter\nmodel, CuatroLLM, from scratch on this dataset. Across five non-English\nreasoning tasks, we show that CuatroLLM matches or outperforms state-of-the-art\nmultilingual models trained using closed data, such as Llama3.2 and Gemma2,\ndespite using an order of magnitude less data (about 6% of the tokens\nused for Llama3.2's training). We further demonstrate that with additional\ndomain-specific pretraining, amounting to less than 1% of TransWeb-Edu,\nCuatroLLM surpasses the state of the art in multilingual reasoning. To promote\nreproducibility, we release our corpus, models, and training pipeline under\nopen licenses at hf.co/britllm/CuatroLLM.\n","authors":["Jiayi Wang","Yao Lu","Maurice Weber","Max Ryabinin","Yihong Chen","Raphael Tang","Pontus Stenetorp"],"pdf_url":"https://arxiv.org/pdf/2410.23956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23953v1","updated":"2024-10-31T14:07:26Z","published":"2024-10-31T14:07:26Z","title":"Representative Social Choice: From Learning Theory to AI Alignment","summary":" Social choice theory is the study of preference aggregation across a\npopulation, used both in mechanism design for human agents and in the\ndemocratic alignment of language models. 
In this study, we propose the\nrepresentative social choice framework for the modeling of democratic\nrepresentation in collective decisions, where the number of issues and\nindividuals are too large for mechanisms to consider all preferences directly.\nThese scenarios are widespread in real-world decision-making processes, such as\njury trials, indirect elections, legislation processes, corporate governance,\nand, more recently, language model alignment. In representative social choice,\nthe population is represented by a finite sample of individual-issue pairs\nbased on which social choice decisions are made. We show that many of the\ndeepest questions in representative social choice can be naturally formulated\nas statistical learning problems, and prove the generalization properties of\nsocial choice mechanisms using the theory of machine learning. We further\nformulate axioms for representative social choice, and prove Arrow-like\nimpossibility theorems with new combinatorial tools of analysis. Our framework\nintroduces the representative approach to social choice, opening up research\ndirections at the intersection of social choice, learning theory, and AI\nalignment.\n","authors":["Tianyi Qiu"],"pdf_url":"https://arxiv.org/pdf/2410.23953v1.pdf","comment":"Full version (20 pages). Under review. An excerpt was previously\n accepted to NeurIPS 2024 Pluralistic Alignment Workshop"},{"id":"http://arxiv.org/abs/2407.16724v2","updated":"2024-10-31T14:03:06Z","published":"2024-07-23T12:38:48Z","title":"Structure-aware Domain Knowledge Injection for Large Language Models","summary":" This paper introduces a pioneering methodology, termed StructTuning, to\nefficiently transform foundation Large Language Models (LLMs) into domain\nspecialists. It significantly reduces the training corpus requirement to a mere\n0.3%, while achieving an impressive 50% of traditional knowledge injection\nperformance. Our method is inspired by the educational processes of human\nstudents, particularly how structured domain knowledge from textbooks is\nassimilated and subsequently applied to tackle real-world challenges through\nspecific exercises. Based on this, we propose a novel two-stage strategy for\nknowledge injection and alignment: Structure-aware Continual Pre-Training\n(SCPT) and Structure-aware Supervised Fine-Tuning (SSFT). In the SCPT phase, we\nautomatically extract the domain knowledge taxonomy and reorganize the training\ncorpora, enabling LLMs to effectively link textual segments to targeted\nknowledge points within the taxonomy. In the SSFT phase, we explicitly prompt\nmodels to elucidate the underlying knowledge structure in their outputs,\nleveraging the structured domain insight to address practical problems. Our\nultimate method has undergone extensive evaluations across model architectures\nand scales, using closed-book question-answering tasks on LongBench and\nMMedBench datasets. Remarkably, our method demonstrates the potential of\ncomparable improvement against the state-of-the-art MMedLM2 on MMedBench, while\nsignificantly reducing the training costs to 5%. This breakthrough paves the\nway for scaling up our StructTuning for stronger domain-specific LLMs with\ncomprehensive data utilization. Code is available at\nhttps://github.com/alibaba/struxgpt.\n","authors":["Kai Liu","Ze Chen","Zhihang Fu","Rongxin Jiang","Fan Zhou","Yaowu Chen","Yue Wu","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2407.16724v2.pdf","comment":"Preprint. 
Code is available at https://github.com/alibaba/struxgpt"},{"id":"http://arxiv.org/abs/2406.19073v2","updated":"2024-10-31T13:59:05Z","published":"2024-06-27T10:43:04Z","title":"AMBROSIA: A Benchmark for Parsing Ambiguous Questions into Database\n Queries","summary":" Practical semantic parsers are expected to understand user utterances and map\nthem to executable programs, even when these are ambiguous. We introduce a new\nbenchmark, AMBROSIA, which we hope will inform and inspire the development of\ntext-to-SQL parsers capable of recognizing and interpreting ambiguous requests.\nOur dataset contains questions showcasing three different types of ambiguity\n(scope ambiguity, attachment ambiguity, and vagueness), their interpretations,\nand corresponding SQL queries. In each case, the ambiguity persists even when\nthe database context is provided. This is achieved through a novel approach\nthat involves controlled generation of databases from scratch. We benchmark\nvarious LLMs on AMBROSIA, revealing that even the most advanced models struggle\nto identify and interpret ambiguity in questions.\n","authors":["Irina Saparina","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2406.19073v2.pdf","comment":"NeurIPS 2024 D&B Track Spotlight"},{"id":"http://arxiv.org/abs/2409.07146v2","updated":"2024-10-31T13:54:35Z","published":"2024-09-11T09:49:50Z","title":"Gated Slot Attention for Efficient Linear-Time Sequence Modeling","summary":" Linear attention Transformers and their gated variants, celebrated for\nenabling parallel training and efficient recurrent inference, still fall short\nin recall-intensive tasks compared to traditional Transformers and demand\nsignificant resources for training from scratch. This paper introduces Gated\nSlot Attention (GSA), which enhances Attention with Bounded-memory-Control\n(ABC) by incorporating a gating mechanism inspired by Gated Linear Attention\n(GLA). Essentially, GSA comprises a two-layer GLA linked via\n$\\operatorname{softmax}$, utilizing context-aware memory reading and adaptive\nforgetting to improve memory capacity while maintaining compact recurrent state\nsize. This design greatly enhances both training and inference efficiency\nthrough GLA's hardware-efficient training algorithm and reduced state size.\nAdditionally, retaining the $\\operatorname{softmax}$ operation is particularly\nbeneficial in \"finetuning pretrained Transformers to RNNs\" (T2R) settings,\nreducing the need for extensive training from scratch. Extensive experiments\nconfirm GSA's superior performance in scenarios requiring in-context recall and\nin T2R settings.\n","authors":["Yu Zhang","Songlin Yang","Ruijie Zhu","Yue Zhang","Leyang Cui","Yiqiao Wang","Bolun Wang","Freda Shi","Bailin Wang","Wei Bi","Peng Zhou","Guohong Fu"],"pdf_url":"https://arxiv.org/pdf/2409.07146v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.22086v2","updated":"2024-10-31T13:50:02Z","published":"2024-10-29T14:41:44Z","title":"Unlearning as multi-task optimization: A normalized gradient difference\n approach with an adaptive learning rate","summary":" Machine unlearning has been used to remove unwanted knowledge acquired by\nlarge language models (LLMs). In this paper, we examine machine unlearning from\nan optimization perspective, framing it as a regularized multi-task\noptimization problem, where one task optimizes a forgetting objective and\nanother optimizes the model performance. 
In particular, we introduce a\nnormalized gradient difference (NGDiff) algorithm, enabling us to have better\ncontrol over the trade-off between the objectives, while integrating a new,\nautomatic learning rate scheduler. We provide a theoretical analysis and\nempirically demonstrate the superior performance of NGDiff among\nstate-of-the-art unlearning methods on the TOFU and MUSE datasets while\nexhibiting stable training.\n","authors":["Zhiqi Bu","Xiaomeng Jin","Bhanukiran Vinzamuri","Anil Ramakrishna","Kai-Wei Chang","Volkan Cevher","Mingyi Hong"],"pdf_url":"https://arxiv.org/pdf/2410.22086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23933v1","updated":"2024-10-31T13:47:10Z","published":"2024-10-31T13:47:10Z","title":"Language Models can Self-Lengthen to Generate Long Texts","summary":" Recent advancements in Large Language Models (LLMs) have significantly\nenhanced their ability to process long contexts, yet a notable gap remains in\ngenerating long, aligned outputs. This limitation stems from a training gap\nwhere pre-training lacks effective instructions for long-text generation, and\npost-training data primarily consists of short query-response pairs. Current\napproaches, such as instruction backtranslation and behavior imitation, face\nchallenges including data quality, copyright issues, and constraints on\nproprietary model usage. In this paper, we introduce an innovative iterative\ntraining framework called Self-Lengthen that leverages only the intrinsic\nknowledge and skills of LLMs without the need for auxiliary data or proprietary\nmodels. The framework consists of two roles: the Generator and the Extender.\nThe Generator produces the initial response, which is then split and expanded\nby the Extender. This process results in a new, longer response, which is used\nto train both the Generator and the Extender iteratively. Through this process,\nthe models are progressively trained to handle increasingly longer responses.\nExperiments on benchmarks and human evaluations show that Self-Lengthen\noutperforms existing methods in long-text generation, when applied to top\nopen-source LLMs such as Qwen2 and LLaMA3. Our code is publicly available at\nhttps://github.com/QwenLM/Self-Lengthen.\n","authors":["Shanghaoran Quan","Tianyi Tang","Bowen Yu","An Yang","Dayiheng Liu","Bofei Gao","Jianhong Tu","Yichang Zhang","Jingren Zhou","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2410.23933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23918v1","updated":"2024-10-31T13:26:11Z","published":"2024-10-31T13:26:11Z","title":"BitStack: Fine-Grained Size Control for Compressed Large Language Models\n in Variable Memory Environments","summary":" Large language models (LLMs) have revolutionized numerous applications, yet\ntheir deployment remains challenged by memory constraints on local devices.\nWhile scaling laws have enhanced LLM capabilities, the primary bottleneck has\nshifted from \\textit{capability} to \\textit{availability}, emphasizing the need\nfor efficient memory management. Traditional compression methods, such as\nquantization, often require predefined compression ratios and separate\ncompression processes for each setting, complicating deployment in variable\nmemory environments. In this paper, we introduce \\textbf{BitStack}, a novel,\ntraining-free weight compression approach that enables megabyte-level\ntrade-offs between memory usage and model performance. 
By leveraging weight\ndecomposition, BitStack can dynamically adjust the model size with minimal\ntransmission between running memory and storage devices. Our approach\niteratively decomposes weight matrices while considering the significance of\neach parameter, resulting in an approximately 1-bit per parameter residual\nblock in each decomposition iteration. These blocks are sorted and stacked in\nstorage as basic transmission units, with different quantities loaded based on\ncurrent memory availability. Extensive experiments across a wide range of tasks\ndemonstrate that, despite offering fine-grained size control, BitStack\nconsistently matches or surpasses strong quantization baselines, particularly\nat extreme compression ratios. To the best of our knowledge, this is the first\ndecomposition-based method that effectively bridges the gap to practical\ncompression techniques like quantization. Code is available at\nhttps://github.com/xinghaow99/BitStack.\n","authors":["Xinghao Wang","Pengyu Wang","Bo Wang","Dong Zhang","Yunhua Zhou","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2410.23918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.20087v2","updated":"2024-10-31T13:10:12Z","published":"2024-06-28T17:55:24Z","title":"ProgressGym: Alignment with a Millennium of Moral Progress","summary":" Frontier AI systems, including large language models (LLMs), hold increasing\ninfluence over the epistemology of human users. Such influence can reinforce\nprevailing societal values, potentially contributing to the lock-in of\nmisguided moral beliefs and, consequently, the perpetuation of problematic\nmoral practices on a broad scale. We introduce progress alignment as a\ntechnical solution to mitigate this imminent risk. Progress alignment\nalgorithms learn to emulate the mechanics of human moral progress, thereby\naddressing the susceptibility of existing alignment methods to contemporary\nmoral blindspots. To empower research in progress alignment, we introduce\nProgressGym, an experimental framework allowing the learning of moral progress\nmechanics from history, in order to facilitate future progress in real-world\nmoral decisions. Leveraging 9 centuries of historical text and 18 historical\nLLMs, ProgressGym enables codification of real-world progress alignment\nchallenges into concrete benchmarks. Specifically, we introduce three core\nchallenges: tracking evolving values (PG-Follow), preemptively anticipating\nmoral progress (PG-Predict), and regulating the feedback loop between human and\nAI value shifts (PG-Coevolve). Alignment methods without a temporal dimension\nare inapplicable to these tasks. In response, we present lifelong and\nextrapolative algorithms as baseline methods of progress alignment, and build\nan open leaderboard soliciting novel algorithms and challenges. 
The framework\nand the leaderboard are available at\nhttps://github.com/PKU-Alignment/ProgressGym and\nhttps://huggingface.co/spaces/PKU-Alignment/ProgressGym-LeaderBoard\nrespectively.\n","authors":["Tianyi Qiu","Yang Zhang","Xuchuan Huang","Jasmine Xinze Li","Jiaming Ji","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2406.20087v2.pdf","comment":"NeurIPS 2024 Track on Datasets and Benchmarks (Spotlight)"},{"id":"http://arxiv.org/abs/2407.16434v2","updated":"2024-10-31T13:06:41Z","published":"2024-07-23T12:33:58Z","title":"Enhancing LLM's Cognition via Structurization","summary":" When reading long-form text, human cognition is complex and structurized.\nWhile large language models (LLMs) process input contexts through a causal and\nsequential perspective, this approach can potentially limit their ability to\nhandle intricate and complex inputs effectively. To enhance LLM's cognition\ncapability, this paper presents a novel concept of context structurization.\nSpecifically, we transform the plain, unordered contextual sentences into\nwell-ordered and hierarchically structurized elements. By doing so, LLMs can\nbetter grasp intricate and extended contexts through precise attention and\ninformation-seeking along the organized structures. Extensive evaluations are\nconducted across various model architectures and sizes (including a series of\nauto-regressive LLMs as well as BERT-like masking models) on a diverse set of\nNLP tasks (e.g., context-based question-answering, exhaustive hallucination\nevaluation, and passage-level dense retrieval). Empirical results show\nconsistent and significant performance gains afforded by a single-round\nstructurization. In particular, we boost the open-sourced LLaMA2-70B model to\nachieve comparable performance against GPT-3.5-Turbo as the hallucination\nevaluator. Besides, we show the feasibility of distilling advanced LLMs'\nlanguage processing abilities to a smaller yet effective StruXGPT-7B to execute\nstructurization, addressing the practicality of our approach. Code is available\nat https://github.com/alibaba/struxgpt.\n","authors":["Kai Liu","Zhihang Fu","Chao Chen","Wei Zhang","Rongxin Jiang","Fan Zhou","Yaowu Chen","Yue Wu","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2407.16434v2.pdf","comment":"This paper has been accepted by NeurIPS 2024. Code is available at\n https://github.com/alibaba/struxgpt"},{"id":"http://arxiv.org/abs/2410.23902v1","updated":"2024-10-31T13:05:39Z","published":"2024-10-31T13:05:39Z","title":"Responsible Retrieval Augmented Generation for Climate Decision Making\n from Documents","summary":" Climate decision making is constrained by the complexity and inaccessibility\nof key information within lengthy, technical, and multi-lingual documents.\nGenerative AI technologies offer a promising route for improving the\naccessibility of information contained within these documents, but suffer from\nlimitations. These include (1) a tendency to hallucinate or mis-represent\ninformation, (2) difficulty in steering or guaranteeing properties of generated\noutput, and (3) reduced performance in specific technical domains. To address\nthese challenges, we introduce a novel evaluation framework with\ndomain-specific dimensions tailored for climate-related documents. We then\napply this framework to evaluate Retrieval-Augmented Generation (RAG)\napproaches and assess retrieval- and generation-quality within a prototype tool\nthat answers questions about individual climate law and policy documents. 
In\naddition, we publish a human-annotated dataset and scalable automated\nevaluation tools, with the aim of facilitating broader adoption and robust\nassessment of these systems in the climate domain. Our findings highlight the\nkey components of responsible deployment of RAG to enhance decision-making,\nwhile also providing insights into user experience (UX) considerations for\nsafely deploying such systems to build trust with users in high-risk domains.\n","authors":["Matyas Juhasz","Kalyan Dutia","Henry Franks","Conor Delahunty","Patrick Fawbert Mills","Harrison Pim"],"pdf_url":"https://arxiv.org/pdf/2410.23902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23890v1","updated":"2024-10-31T12:52:26Z","published":"2024-10-31T12:52:26Z","title":"Leveraging LLMs for MT in Crisis Scenarios: a blueprint for low-resource\n languages","summary":" In an evolving landscape of crisis communication, the need for robust and\nadaptable Machine Translation (MT) systems is more pressing than ever,\nparticularly for low-resource languages. This study presents a comprehensive\nexploration of leveraging Large Language Models (LLMs) and Multilingual LLMs\n(MLLMs) to enhance MT capabilities in such scenarios. By focusing on the unique\nchallenges posed by crisis situations where speed, accuracy, and the ability to\nhandle a wide range of languages are paramount, this research outlines a novel\napproach that combines the cutting-edge capabilities of LLMs with fine-tuning\ntechniques and community-driven corpus development strategies. At the core of\nthis study is the development and empirical evaluation of MT systems tailored\nfor two low-resource language pairs, illustrating the process from initial\nmodel selection and fine-tuning through to deployment. Bespoke systems are\ndeveloped and modelled on the recent Covid-19 pandemic. The research highlights\nthe importance of community involvement in creating highly specialised,\ncrisis-specific datasets and compares custom GPTs with NLLB-adapted MLLM\nmodels. It identifies fine-tuned MLLM models as offering superior performance\ncompared with their LLM counterparts. A scalable and replicable model for rapid\nMT system development in crisis scenarios is outlined. Our approach enhances\nthe field of humanitarian technology by offering a blueprint for developing\nmultilingual communication systems during emergencies.\n","authors":["Séamus Lankford","Andy Way"],"pdf_url":"https://arxiv.org/pdf/2410.23890v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.02370,\n arXiv:2403.01580"},{"id":"http://arxiv.org/abs/2410.23884v1","updated":"2024-10-31T12:48:58Z","published":"2024-10-31T12:48:58Z","title":"Failure Modes of LLMs for Causal Reasoning on Narratives","summary":" In this work, we investigate the causal reasoning abilities of large language\nmodels (LLMs) through the representative problem of inferring causal\nrelationships from narratives. We find that even state-of-the-art language\nmodels rely on unreliable shortcuts, both in terms of the narrative\npresentation and their parametric knowledge. For example, LLMs tend to\ndetermine causal relationships based on the topological ordering of events\n(i.e., earlier events cause later ones), resulting in lower performance\nwhenever events are not narrated in their exact causal order. Similarly, we\ndemonstrate that LLMs struggle with long-term causal reasoning and often fail\nwhen the narratives are long and contain many events. 
Additionally, we show\nLLMs appear to rely heavily on their parametric knowledge at the expense of\nreasoning over the provided narrative. This degrades their abilities whenever\nthe narrative opposes parametric knowledge. We extensively validate these\nfailure modes through carefully controlled synthetic experiments, as well as\nevaluations on real-world narratives. Finally, we observe that explicitly\ngenerating a causal graph generally improves performance while naive\nchain-of-thought is ineffective. Collectively, our results distill precise\nfailure modes of current state-of-the-art models and can pave the way for\nfuture techniques to enhance causal reasoning in LLMs.\n","authors":["Khurram Yamin","Shantanu Gupta","Gaurav R. Ghosal","Zachary C. Lipton","Bryan Wilder"],"pdf_url":"https://arxiv.org/pdf/2410.23884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23883v1","updated":"2024-10-31T12:45:54Z","published":"2024-10-31T12:45:54Z","title":"'No' Matters: Out-of-Distribution Detection in Multimodality Long\n Dialogue","summary":" Out-of-distribution (OOD) detection in multimodal contexts is essential for\nidentifying deviations in combined inputs from different modalities,\nparticularly in applications like open-domain dialogue systems or real-life\ndialogue interactions. This paper aims to improve the user experience that\ninvolves multi-round long dialogues by efficiently detecting OOD dialogues and\nimages. We introduce a novel scoring framework named Dialogue Image Aligning\nand Enhancing Framework (DIAEF) that integrates the visual language models with\nthe novel proposed scores that detect OOD in two key scenarios (1) mismatches\nbetween the dialogue and image input pair and (2) input pairs with previously\nunseen labels. Our experimental results, derived from various benchmarks,\ndemonstrate that integrating image and multi-round dialogue OOD detection is\nmore effective with previously unseen labels than using either modality\nindependently. In the presence of mismatched pairs, our proposed score\neffectively identifies these mismatches and demonstrates strong robustness in\nlong dialogues. This approach enhances domain-aware, adaptive conversational\nagents and establishes baselines for future studies.\n","authors":["Rena Gao","Xuetong Wu","Siwen Luo","Caren Han","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23883v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.02691v2","updated":"2024-10-31T12:40:33Z","published":"2024-10-03T17:18:03Z","title":"On the Proper Treatment of Tokenization in Psycholinguistics","summary":" Language models are widely used in computational psycholinguistics to test\ntheories that relate the negative log probability (the surprisal) of a region\nof interest (a substring of characters) under a language model to its cognitive\ncost experienced by readers, as operationalized, for example, by gaze duration\non the region. However, the application of modern language models to\npsycholinguistic studies is complicated by the practice of using tokenization\nas an intermediate step in training a model. Doing so results in a language\nmodel over token strings rather than one over character strings. Vexingly,\nregions of interest are generally misaligned with these token strings. 
The\npaper argues that token-level language models should be (approximately)\nmarginalized into character-level language models before they are used in\npsycholinguistic studies to compute the surprisal of a region of interest;\nthen, the marginalized character-level language model can be used to compute\nthe surprisal of an arbitrary character substring, which we term a focal area,\nthat the experimenter may wish to use as a predictor. Our proposal of\nmarginalizing a token-level model into a character-level one solves this\nmisalignment issue independently of the tokenization scheme. Empirically, we\ndiscover various focal areas whose surprisal is a better psychometric predictor\nthan the surprisal of the region of interest itself.\n","authors":["Mario Giulianelli","Luca Malagutti","Juan Luis Gastaldi","Brian DuSell","Tim Vieira","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2410.02691v2.pdf","comment":"Main conference long paper at EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.10245v2","updated":"2024-10-31T12:34:17Z","published":"2024-09-16T12:55:14Z","title":"From Text to Emoji: How PEFT-Driven Personality Manipulation Unleashes\n the Emoji Potential in LLMs","summary":" As the demand for human-like interactions with LLMs continues to grow, so\ndoes the interest in manipulating their personality traits, which has emerged\nas a key area of research. Methods like prompt-based In-Context Knowledge\nEditing (IKE) and gradient-based Model Editor Networks (MEND) have been\nexplored but show irregularity and variability. IKE depends on the prompt,\nleading to variability and sensitivity, while MEND yields inconsistent and\ngibberish outputs. To address this, we employed Opinion QA Based\nParameter-Efficient Fine-Tuning (PEFT), specifically Quantized Low-Rank\nAdaptation (QLoRA), to manipulate the Big Five personality traits: Openness,\nConscientiousness, Extraversion, Agreeableness, and Neuroticism. After PEFT,\nmodels such as Mistral-7B-Instruct and Llama-2-7B-chat began generating emojis,\ndespite their absence in the PEFT data. For instance, Llama-2-7B-chat generated\nemojis in 99.5\\% of extraversion-related test instances, while\nMistral-7B-Instruct did so in 92.5\\% of openness-related test instances.\nExplainability analysis indicated that the LLMs used emojis intentionally to\nexpress these traits. This paper provides a number of novel contributions.\nFirst, introducing an Opinion QA dataset for PEFT-driven personality\nmanipulation; second, developing metric models to benchmark LLM personality\ntraits; third, demonstrating PEFT's superiority over IKE in personality\nmanipulation; and finally, analysing and validating emoji usage through\nexplainability methods such as mechanistic interpretability and in-context\nlearning explainability methods.\n","authors":["Navya Jain","Zekun Wu","Cristian Munoz","Airlie Hilliard","Adriano Koshiyama","Emre Kazim","Philip Treleaven"],"pdf_url":"https://arxiv.org/pdf/2409.10245v2.pdf","comment":"NeurIPS 2024 Workshop on Behavioral Machine Learning"},{"id":"http://arxiv.org/abs/2402.02130v5","updated":"2024-10-31T12:27:33Z","published":"2024-02-03T12:19:47Z","title":"GITA: Graph to Visual and Textual Integration for Vision-Language Graph\n Reasoning","summary":" Large Language Models (LLMs) are increasingly used for various tasks with\ngraph structures. 
Though LLMs can process graph information in a textual\nformat, they overlook the rich vision modality, which is an intuitive way for\nhumans to comprehend structural information and conduct general graph\nreasoning. The potential benefits and capabilities of representing graph\nstructures as visual images (i.e., $\\textit{visual graph}$) are still\nunexplored. To fill this gap, we propose an end-to-end framework,\ncalled $\\textbf{G}$raph to v$\\textbf{I}$sual and $\\textbf{T}$extual\nIntegr$\\textbf{A}$tion (GITA), which is the first to incorporate visual graphs into\ngeneral graph reasoning. We also establish the $\\textbf{G}$raph-based\n$\\textbf{V}$ision-$\\textbf{L}$anguage $\\textbf{Q}$uestion $\\textbf{A}$nswering\n(GVLQA) dataset from existing graph data, which is the first vision-language\ndataset for general graph reasoning purposes. Extensive experiments on the\nGVLQA dataset and five real-world datasets show that GITA outperforms\nmainstream LLMs in terms of general graph reasoning capabilities. Moreover, we\nhighlight the effectiveness of layout augmentation on visual graphs and of\npretraining on the GVLQA dataset.\n","authors":["Yanbin Wei","Shuai Fu","Weisen Jiang","Zejian Zhang","Zhixiong Zeng","Qi Wu","James T. Kwok","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02130v5.pdf","comment":"NeurIPS 2024; Project Page: v-graph.github.io; Code:\n https://github.com/WEIYanbin1999/GITA/"},{"id":"http://arxiv.org/abs/2402.06255v4","updated":"2024-10-31T12:24:14Z","published":"2024-02-09T09:09:39Z","title":"Fight Back Against Jailbreaking via Prompt Adversarial Tuning","summary":" While Large Language Models (LLMs) have achieved tremendous success in\nvarious applications, they are also susceptible to jailbreaking attacks.\nSeveral primary defense strategies have been proposed to protect LLMs from\nproducing harmful information, mostly focusing on model fine-tuning or\nheuristic defense designs. However, how to achieve intrinsic robustness\nthrough prompt optimization remains an open problem. In this paper, motivated\nby adversarial training paradigms for achieving reliable robustness, we propose\nan approach named Prompt Adversarial Tuning (PAT) that trains a prompt control\nattached to the user prompt as a guard prefix. To achieve our defense goal\nwhilst maintaining natural performance, we optimize the control prompt with\nboth adversarial and benign prompts. Comprehensive experiments show that our\nmethod is effective against both grey-box and black-box attacks, reducing the\nsuccess rate of advanced attacks to nearly 0%, while maintaining the model's\nutility on the benign task and incurring only negligible computational\noverhead, charting a new perspective for future explorations in LLM security.\nOur code is available at https://github.com/PKU-ML/PAT.\n","authors":["Yichuan Mo","Yuji Wang","Zeming Wei","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2402.06255v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23861v1","updated":"2024-10-31T12:11:17Z","published":"2024-10-31T12:11:17Z","title":"Audio Is the Achilles' Heel: Red Teaming Audio Large Multimodal Models","summary":" Large Multimodal Models (LMMs) have demonstrated the ability to interact with\nhumans under real-world conditions by combining Large Language Models (LLMs)\nand modality encoders to align multimodal information (visual and auditory)\nwith text. 
However, such models raise a new safety challenge: do models\nthat are safety-aligned on text also exhibit consistent safeguards for\nmultimodal inputs? Despite recent safety-alignment research on vision LMMs, the\nsafety of audio LMMs remains under-explored. In this work, we comprehensively\nred team the safety of five advanced audio LMMs under three settings: (i)\nharmful questions in both audio and text formats, (ii) harmful questions in\ntext format accompanied by distracting non-speech audio, and (iii)\nspeech-specific jailbreaks. Our results under these settings demonstrate that\nopen-source audio LMMs suffer an average attack success rate of 69.14% on\nharmful audio questions, and exhibit safety vulnerabilities when distracted\nwith non-speech audio noise. Our speech-specific jailbreaks on Gemini-1.5-Pro\nachieve an attack success rate of 70.67% on the harmful query benchmark. We\nprovide insights on what could cause these reported safety misalignments.\nWarning: this paper contains offensive examples.\n","authors":["Hao Yang","Lizhen Qu","Ehsan Shareghi","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2410.23861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23856v1","updated":"2024-10-31T12:07:44Z","published":"2024-10-31T12:07:44Z","title":"Can Language Models Perform Robust Reasoning in Chain-of-thought\n Prompting with Noisy Rationales?","summary":" This paper investigates an under-explored challenge in large language models\n(LLMs): chain-of-thought prompting with noisy rationales, which include\nirrelevant or inaccurate reasoning thoughts within examples used for in-context\nlearning. We construct the NoRa dataset, which is tailored to evaluate the robustness\nof reasoning in the presence of noisy rationales. Our findings on the NoRa dataset\nreveal a prevalent vulnerability to such noise among current LLMs, with\nexisting robust methods like self-correction and self-consistency showing\nlimited efficacy. Notably, compared to prompting with clean rationales, the base\nLLM drops by 1.4%-19.8% in accuracy with irrelevant thoughts and more\ndrastically by 2.2%-40.4% with inaccurate thoughts.\n Addressing this challenge necessitates external supervision that should be\naccessible in practice. Here, we propose the method of contrastive denoising\nwith noisy chain-of-thought (CD-CoT). It enhances LLMs' denoising-reasoning\ncapabilities by contrasting noisy rationales with only one clean rationale,\nwhich can be the minimal requirement for denoising-purpose prompting. This\nmethod follows a principle of exploration and exploitation: (1) rephrasing and\nselecting rationales in the input space to achieve explicit denoising and (2)\nexploring diverse reasoning paths and voting on answers in the output space.\nEmpirically, CD-CoT demonstrates an average improvement of 17.8% in accuracy\nover the base model and shows significantly stronger denoising capabilities\nthan baseline methods. 
The source code is publicly available at:\nhttps://github.com/tmlr-group/NoisyRationales.\n","authors":["Zhanke Zhou","Rong Tao","Jianing Zhu","Yiwen Luo","Zengmao Wang","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2410.23856v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23850v1","updated":"2024-10-31T12:01:12Z","published":"2024-10-31T12:01:12Z","title":"The Automated Verification of Textual Claims (AVeriTeC) Shared Task","summary":" The Automated Verification of Textual Claims (AVeriTeC) shared task asks\nparticipants to retrieve evidence and predict veracity for real-world claims\nchecked by fact-checkers. Evidence can be found either via a search engine or\nvia a knowledge store provided by the organisers. Submissions are evaluated\nusing the AVeriTeC score, which considers a claim to be accurately verified if and\nonly if both the verdict is correct and the retrieved evidence is considered to\nmeet a certain quality threshold. The shared task received 21 submissions, 18\nof which surpassed our baseline. The winning team was TUDA_MAI with an AVeriTeC\nscore of 63%. In this paper we describe the shared task, present the full\nresults, and highlight key takeaways.\n","authors":["Michael Schlichtkrull","Yulong Chen","Chenxi Whitehouse","Zhenyun Deng","Mubashara Akhtar","Rami Aly","Zhijiang Guo","Christos Christodoulopoulos","Oana Cocarascu","Arpit Mittal","James Thorne","Andreas Vlachos"],"pdf_url":"https://arxiv.org/pdf/2410.23850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23844v1","updated":"2024-10-31T11:50:24Z","published":"2024-10-31T11:50:24Z","title":"Commonsense Knowledge Editing Based on Free-Text in LLMs","summary":" Knowledge editing technology is crucial for maintaining the accuracy and\ntimeliness of large language models (LLMs). However, the setting of this task\noverlooks a significant portion of commonsense knowledge based on free-text in\nthe real world, characterized by broad knowledge scope, long content, and\nnon-instantiation. The editing objects of previous methods (e.g., MEMIT) were a\nsingle token or entity, which is not suitable for commonsense knowledge in\nfree-text form. To address these challenges, we conducted\nexperiments from two perspectives: knowledge localization and knowledge\nediting. First, we introduce the Knowledge Localization for Free-Text (KLFT)\nmethod, revealing the challenges posed by the spread of\ncommonsense knowledge across the MLP and Attention layers and by its\ndecentralized distribution. Next, we propose a Dynamics-aware Editing Method (DEM), which\nutilizes a Dynamics-aware Module to locate the parameter positions\ncorresponding to commonsense knowledge and a Knowledge Editing Module to\nupdate that knowledge. The DEM method fully explores the potential of the MLP and\nAttention layers and successfully edits commonsense knowledge based on\nfree-text. 
The experimental results indicate that the DEM can achieve excellent\nediting performance.\n","authors":["Xiusheng Huang","Yequan Wang","Jun Zhao","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23844v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.23843v1","updated":"2024-10-31T11:49:44Z","published":"2024-10-31T11:49:44Z","title":"Reasons and Solutions for the Decline in Model Performance after Editing","summary":" Knowledge editing technology has received widespread attention for low-cost\nupdates of incorrect or outdated knowledge in large-scale language models.\nHowever, recent research has found that edited models often exhibit varying\ndegrees of performance degradation. The reasons behind this phenomenon and\npotential solutions have not yet been provided. In order to investigate the\nreasons for the performance decline of the edited model and optimize the\nediting method, this work explores the underlying reasons from both data and\nmodel perspectives. Specifically, 1) from a data perspective, to clarify the\nimpact of data on the performance of editing models, this paper first\nconstructs a Multi-Question Dataset (MQD) to evaluate the impact of different\ntypes of editing data on model performance. The performance of the editing\nmodel is mainly affected by the diversity of editing targets and sequence\nlength, as determined through experiments. 2) From a model perspective, this\narticle explores the factors that affect the performance of editing models. The\nresults indicate a strong correlation between the L1-norm of the editing model\nlayer and the editing accuracy, and clarify that this is an important factor\nleading to the bottleneck of editing performance. Finally, in order to improve\nthe performance of the editing model, this paper further proposes a Dump for\nSequence (D4S) method, which successfully overcomes the previous editing\nbottleneck by reducing the L1-norm of the editing layer, allowing users to\nperform multiple effective edits and minimizing model damage. Our code is\navailable at https://github.com/nlpkeg/D4S.\n","authors":["Xiusheng Huang","Jiaxiang Liu","Yequan Wang","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23843v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.17557v2","updated":"2024-10-31T11:37:49Z","published":"2024-06-25T13:50:56Z","title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at\n Scale","summary":" The performance of a large language model (LLM) depends heavily on the\nquality and size of its pretraining dataset. However, the pretraining datasets\nfor state-of-the-art open LLMs like Llama 3 and Mixtral are not publicly\navailable and very little is known about how they were created. In this work,\nwe introduce FineWeb, a 15-trillion token dataset derived from 96 Common Crawl\nsnapshots that produces better-performing LLMs than other open pretraining\ndatasets. To advance the understanding of how best to curate high-quality\npretraining datasets, we carefully document and ablate all of the design\nchoices used in FineWeb, including in-depth investigations of deduplication and\nfiltering strategies. In addition, we introduce FineWeb-Edu, a 1.3-trillion\ntoken collection of educational text filtered from FineWeb. LLMs pretrained on\nFineWeb-Edu exhibit dramatically better performance on knowledge- and\nreasoning-intensive benchmarks like MMLU and ARC. 
Along with our datasets, we\npublicly release our data curation codebase and all of the models trained\nduring our ablation experiments.\n","authors":["Guilherme Penedo","Hynek Kydlíček","Loubna Ben allal","Anton Lozhkov","Margaret Mitchell","Colin Raffel","Leandro Von Werra","Thomas Wolf"],"pdf_url":"https://arxiv.org/pdf/2406.17557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02370v4","updated":"2024-10-31T11:37:41Z","published":"2024-02-04T06:59:21Z","title":"AutoTimes: Autoregressive Time Series Forecasters via Large Language\n Models","summary":" Foundation models of time series have not been fully developed due to the\nlimited availability of time series corpora and the underexploration of\nscalable pre-training. Based on the similar sequential formulation of time\nseries and natural language, increasing research demonstrates the feasibility\nof leveraging large language models (LLM) for time series. Nevertheless, the\ninherent autoregressive property and decoder-only architecture of LLMs have not\nbeen fully considered, resulting in insufficient utilization of LLM abilities.\nTo fully revitalize the general-purpose token transition and multi-step\ngeneration capability of large language models, we propose AutoTimes to\nrepurpose LLMs as autoregressive time series forecasters, which projects time\nseries into the embedding space of language tokens and autoregressively\ngenerates future predictions with arbitrary lengths. Compatible with any\ndecoder-only LLMs, the consequent forecaster exhibits the flexibility of the\nlookback length and scalability with larger LLMs. Further, we formulate time\nseries as prompts, extending the context for prediction beyond the lookback\nwindow, termed in-context forecasting. By introducing LLM-embedded textual\ntimestamps, AutoTimes can utilize chronological information to align\nmultivariate time series. Empirically, AutoTimes achieves state-of-the-art with\n0.1% trainable parameters and over $5\\times$ training/inference speedup\ncompared to advanced LLM-based forecasters. Code is available at this\nrepository: https://github.com/thuml/AutoTimes.\n","authors":["Yong Liu","Guo Qin","Xiangdong Huang","Jianmin Wang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2402.02370v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23825v1","updated":"2024-10-31T11:14:12Z","published":"2024-10-31T11:14:12Z","title":"GlotCC: An Open Broad-Coverage CommonCrawl Corpus and Pipeline for\n Minority Languages","summary":" The need for large text corpora has increased with the advent of pretrained\nlanguage models and, in particular, the discovery of scaling laws for these\nmodels. Most available corpora have sufficient data only for languages with\nlarge dominant communities. However, there is no corpus available that (i)\ncovers a wide range of minority languages; (ii) is generated by an open-source\nreproducible pipeline; and (iii) is rigorously cleaned from noise, making it\ntrustworthy to use. We present GlotCC, a clean, document-level, 2TB general\ndomain corpus derived from CommonCrawl, covering more than 1000 languages. We\nmake GlotCC and the system used to generate it - including the pipeline,\nlanguage identification model, and filters - available to the research\ncommunity. Corpus v. 1.0 https://huggingface.co/datasets/cis-lmu/GlotCC-v1,\nPipeline v. 
3.0 https://github.com/cisnlp/GlotCC.\n","authors":["Amir Hossein Kargaran","François Yvon","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2410.23825v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.08126v2","updated":"2024-10-31T11:11:18Z","published":"2024-10-10T17:10:34Z","title":"Mars: Situated Inductive Reasoning in an Open-World Environment","summary":" Large Language Models (LLMs) trained on massive corpora have shown remarkable\nsuccess in knowledge-intensive tasks. Yet, most of them rely on pre-stored\nknowledge. Inducing new general knowledge from a specific environment and\nperforming reasoning with the acquired knowledge -- \\textit{situated inductive\nreasoning}, is crucial and challenging for machine intelligence. In this paper,\nwe design Mars, an interactive environment devised for situated inductive\nreasoning. It introduces counter-commonsense game mechanisms by modifying\nterrain, survival setting and task dependency while adhering to certain\nprinciples. In Mars, agents need to actively interact with their surroundings,\nderive useful rules and perform decision-making tasks in specific contexts. We\nconduct experiments on various RL-based and LLM-based methods, finding that\nthey all struggle on this challenging situated inductive reasoning benchmark.\nFurthermore, we explore \\textit{Induction from Reflection}, where we instruct\nagents to perform inductive reasoning from history trajectory. The superior\nperformance underscores the importance of inductive reasoning in Mars. Through\nMars, we aim to galvanize advancements in situated inductive reasoning and set\nthe stage for developing the next generation of AI systems that can reason in\nan adaptive and context-sensitive way.\n","authors":["Xiaojuan Tang","Jiaqi Li","Yitao Liang","Song-chun Zhu","Muhan Zhang","Zilong Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.08126v2.pdf","comment":"Accepted by NeurIPS 2024 Track Datasets and Benchmarks. Project page:\n https://marscrafter.github.io/"},{"id":"http://arxiv.org/abs/2409.04181v2","updated":"2024-10-31T11:01:16Z","published":"2024-09-06T10:49:46Z","title":"Combining LLMs and Knowledge Graphs to Reduce Hallucinations in Question\n Answering","summary":" Advancements in natural language processing have revolutionized the way we\ncan interact with digital information systems, such as databases, making them\nmore accessible. However, challenges persist, especially when accuracy is\ncritical, as in the biomedical domain. A key issue is the hallucination\nproblem, where models generate information unsupported by the underlying data,\npotentially leading to dangerous misinformation. This paper presents a novel\napproach designed to bridge this gap by combining Large Language Models (LLM)\nand Knowledge Graphs (KG) to improve the accuracy and reliability of\nquestion-answering systems, on the example of a biomedical KG. Built on the\nLangChain framework, our method incorporates a query checker that ensures the\nsyntactical and semantic validity of LLM-generated queries, which are then used\nto extract information from a Knowledge Graph, substantially reducing errors\nlike hallucinations. We evaluated the overall performance using a new benchmark\ndataset of 50 biomedical questions, testing several LLMs, including GPT-4 Turbo\nand llama3:70b. Our results indicate that while GPT-4 Turbo outperforms other\nmodels in generating accurate queries, open-source models like llama3:70b show\npromise with appropriate prompt engineering. 
To make this approach accessible,\na user-friendly web-based interface has been developed, allowing users to input\nnatural language queries, view generated and corrected Cypher queries, and\nverify the resulting paths for accuracy. Overall, this hybrid approach\neffectively addresses common issues such as data gaps and hallucinations,\noffering a reliable and intuitive solution for question answering systems. The\nsource code for generating the results of this paper and for the user-interface\ncan be found in our Git repository: https://git.zib.de/lpusch/cyphergenkg-gui\n","authors":["Larissa Pusch","Tim O. F. Conrad"],"pdf_url":"https://arxiv.org/pdf/2409.04181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11200v3","updated":"2024-10-31T10:15:06Z","published":"2024-06-17T04:20:02Z","title":"AvaTaR: Optimizing LLM Agents for Tool Usage via Contrastive Reasoning","summary":" Large language model (LLM) agents have demonstrated impressive capabilities\nin utilizing external tools and knowledge to boost accuracy and reduce\nhallucinations. However, developing prompting techniques that enable LLM agents\nto effectively use these tools and knowledge remains a heuristic and\nlabor-intensive task. Here, we introduce AvaTaR, a novel and automated\nframework that optimizes an LLM agent to effectively leverage provided tools,\nimproving performance on a given task. During optimization, we design a\ncomparator module to iteratively deliver insightful and comprehensive prompts\nto the LLM agent by contrastively reasoning between positive and negative\nexamples sampled from training data. We demonstrate AvaTaR on four complex\nmultimodal retrieval datasets featuring textual, visual, and relational\ninformation, and three general question-answering (QA) datasets. We find AvaTaR\nconsistently outperforms state-of-the-art approaches across all seven tasks,\nexhibiting strong generalization ability when applied to novel cases and\nachieving an average relative improvement of 14% on the Hit@1 metric for the\nretrieval datasets and 13% for the QA datasets. Code and dataset are available\nat https://github.com/zou-group/avatar.\n","authors":["Shirley Wu","Shiyu Zhao","Qian Huang","Kexin Huang","Michihiro Yasunaga","Kaidi Cao","Vassilis N. Ioannidis","Karthik Subbian","Jure Leskovec","James Zou"],"pdf_url":"https://arxiv.org/pdf/2406.11200v3.pdf","comment":"NeurIPS 2024 main conference"},{"id":"http://arxiv.org/abs/2406.06196v3","updated":"2024-10-31T10:14:49Z","published":"2024-06-10T11:50:29Z","title":"LINGOLY: A Benchmark of Olympiad-Level Linguistic Reasoning Puzzles in\n Low-Resource and Extinct Languages","summary":" In this paper, we present the LingOly benchmark, a novel benchmark for\nadvanced reasoning abilities in large language models. Using challenging\nLinguistic Olympiad puzzles, we evaluate (i) capabilities for in-context\nidentification and generalisation of linguistic patterns in very low-resource\nor extinct languages, and (ii) abilities to follow complex task instructions.\nThe LingOly benchmark covers more than 90 mostly low-resource languages,\nminimising issues of data contamination, and contains 1,133 problems across 6\nformats and 5 levels of human difficulty. We assess performance with both\ndirect accuracy and comparison to a no-context baseline to penalise\nmemorisation. Scores from 11 state-of-the-art LLMs demonstrate the benchmark to\nbe challenging, and models perform poorly on the higher difficulty problems. 
On\nharder problems, even the top model only achieved 38.7% accuracy, a 24.7%\nimprovement over the no-context baseline. Large closed models typically\noutperform open models, and in general, the higher-resource the language, the\nbetter the scores. These results indicate that, in the absence of memorisation,\ntrue multi-step out-of-domain reasoning remains a challenge for current\nlanguage models.\n","authors":["Andrew M. Bean","Simi Hellsten","Harry Mayne","Jabez Magomere","Ethan A. Chi","Ryan Chi","Scott A. Hale","Hannah Rose Kirk"],"pdf_url":"https://arxiv.org/pdf/2406.06196v3.pdf","comment":"Oral presentation at NeurIPS 2024 Datasets and Benchmarks Track. 10\n pages, 5 figures, 22 pages supplemental materials"},{"id":"http://arxiv.org/abs/2403.05266v2","updated":"2024-10-31T10:07:54Z","published":"2024-03-08T12:42:36Z","title":"ERBench: An Entity-Relationship based Automatically Verifiable\n Hallucination Benchmark for Large Language Models","summary":" Large language models (LLMs) have achieved unprecedented performances in\nvarious applications, yet evaluating them is still challenging. Existing\nbenchmarks are either manually constructed or are automatic, but lack the\nability to evaluate the thought process of LLMs with arbitrary complexity. We\ncontend that utilizing existing relational databases based on the\nentity-relationship (ER) model is a promising approach for constructing\nbenchmarks as they contain structured knowledge that can be used to question\nLLMs. Unlike knowledge graphs, which are also used to evaluate LLMs, relational\ndatabases have integrity constraints that can be used to better construct\ncomplex in-depth questions and verify answers: (1) functional dependencies can\nbe used to pinpoint critical keywords that an LLM must know to properly answer\na given question containing certain attribute values; and (2) foreign key\nconstraints can be used to join relations and construct multi-hop questions,\nwhich can be arbitrarily long and used to debug intermediate answers. We thus\npropose ERBench, which uses these integrity constraints to convert any database\ninto an LLM benchmark. ERBench supports continuous evaluation as databases\nchange, multimodal questions, and various prompt engineering techniques. In our\nexperiments, we construct LLM benchmarks using databases of multiple domains\nand make an extensive comparison of contemporary LLMs. We show how ERBench can\nproperly evaluate any LLM by not only checking for answer correctness, but also\neffectively verifying the rationales by looking for the right keywords.\n","authors":["Jio Oh","Soyeon Kim","Junseok Seo","Jindong Wang","Ruochen Xu","Xing Xie","Steven Euijong Whang"],"pdf_url":"https://arxiv.org/pdf/2403.05266v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23771v1","updated":"2024-10-31T09:39:28Z","published":"2024-10-31T09:39:28Z","title":"What is Wrong with Perplexity for Long-context Language Modeling?","summary":" Handling long-context inputs is crucial for large language models (LLMs) in\ntasks such as extended conversations, document summarization, and many-shot\nin-context learning. While recent approaches have extended the context windows\nof LLMs and employed perplexity (PPL) as a standard evaluation metric, PPL has\nproven unreliable for assessing long-context capabilities. The underlying cause\nof this limitation has remained unclear. In this work, we provide a\ncomprehensive explanation for this issue.\n
We find that PPL overlooks key\ntokens, which are essential for long-context understanding, by averaging across\nall tokens and thereby obscuring the true performance of models in long-context\nscenarios. To address this, we propose \textbf{LongPPL}, a novel metric that\nfocuses on key tokens by employing a long-short context contrastive method to\nidentify them. Our experiments demonstrate that LongPPL strongly correlates\nwith performance on various long-context benchmarks (e.g., Pearson correlation\nof -0.96), significantly outperforming traditional PPL in predictive accuracy.\nAdditionally, we introduce \textbf{LongCE} (Long-context Cross-Entropy) loss, a\nre-weighting strategy for fine-tuning that prioritizes key tokens, leading to\nconsistent improvements across diverse benchmarks. In summary, these\ncontributions offer deeper insights into the limitations of PPL and present\neffective solutions for accurately evaluating and enhancing the long-context\ncapabilities of LLMs. Code is available at https://github.com/PKU-ML/LongPPL.\n","authors":["Lizhe Fang","Yifei Wang","Zhaoyang Liu","Chenheng Zhang","Stefanie Jegelka","Jinyang Gao","Bolin Ding","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23769v1","updated":"2024-10-31T09:33:37Z","published":"2024-10-31T09:33:37Z","title":"The Potential of LLMs in Medical Education: Generating Questions and\n Answers for Qualification Exams","summary":" Recent research on large language models (LLMs) has primarily focused on\ntheir adaptation and application in specialized domains. The application of\nLLMs in the medical field is mainly concentrated on tasks such as the\nautomation of medical report generation, summarization, diagnostic reasoning,\nand question-and-answer interactions between doctors and patients. The\nchallenge of becoming a good teacher is more formidable than that of becoming a\ngood student, and this study pioneers the application of LLMs in the field of\nmedical education. In this work, we investigate the extent to which LLMs can\ngenerate medical qualification exam questions and corresponding answers based\non few-shot prompts. Utilizing a real-world Chinese dataset of elderly chronic\ndiseases, we tasked eight widely used LLMs, including ERNIE 4, ChatGLM 4,\nDoubao, Hunyuan, Spark 4, Qwen, Llama 3, and Mistral, with generating\nopen-ended questions and answers based on a subset of sampled admission\nreports. Furthermore, we engaged medical experts to manually evaluate these\nopen-ended questions and answers across multiple dimensions. The study found\nthat LLMs, after using few-shot prompts, can effectively mimic real-world\nmedical qualification exam questions, whereas there is room for improvement in\nthe correctness, evidence-based statements, and professionalism of the\ngenerated answers. Moreover, LLMs also demonstrate a decent level of ability to\ncorrect and rectify reference answers.\n
Given the immense potential of\nartificial intelligence in the medical field, the task of generating questions\nand answers for medical qualification exams aimed at medical students, interns\nand residents can be a significant focus of future research.\n","authors":["Yunqi Zhu","Wen Tang","Ying Sun","Xuebing Yang"],"pdf_url":"https://arxiv.org/pdf/2410.23769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03621v2","updated":"2024-10-31T09:11:03Z","published":"2024-09-05T15:33:24Z","title":"Attend First, Consolidate Later: On the Importance of Attention in\n Different LLM Layers","summary":" In decoder-based LLMs, the representation of a given layer serves two\npurposes: as input to the next layer during the computation of the current\ntoken; and as input to the attention mechanism of future tokens. In this work,\nwe show that the importance of the latter role might be overestimated. To show\nthat, we start by manipulating the representations of previous tokens; e.g., by\nreplacing the hidden states at some layer k with random vectors. Our\nexperiments with four LLMs and four tasks show that this operation often leads\nto a small to negligible drop in performance. Importantly, this happens if the\nmanipulation occurs in the top part of the model, i.e., when k is in the final\n30-50% of the layers. In contrast, doing the same manipulation in earlier\nlayers might lead to chance-level performance. We continue by switching the\nhidden state of certain tokens with hidden states of other tokens from another\nprompt; e.g., replacing the word \"Italy\" with \"France\" in \"What is the capital\nof Italy?\". We find that when applying this switch in the top 1/3 of the model,\nthe model ignores it (answering \"Rome\"). However, if we apply it earlier, the\nmodel conforms to the switch (\"Paris\"). Our results hint at a two-stage process\nin transformer-based LLMs: the first part gathers input from previous tokens,\nwhile the second mainly processes that information internally.\n","authors":["Amit Ben-Artzy","Roy Schwartz"],"pdf_url":"https://arxiv.org/pdf/2409.03621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23746v1","updated":"2024-10-31T09:01:25Z","published":"2024-10-31T09:01:25Z","title":"DetectRL: Benchmarking LLM-Generated Text Detection in Real-World\n Scenarios","summary":" Detecting text generated by large language models (LLMs) is of great recent\ninterest. With zero-shot methods like DetectGPT, detection capabilities have\nreached impressive levels. However, the reliability of existing detectors in\nreal-world applications remains underexplored. In this study, we present a new\nbenchmark, DetectRL, highlighting that even state-of-the-art (SOTA) detection\ntechniques still underperform on this task. We collected human-written\ndatasets from domains where LLMs are particularly prone to misuse. Using\npopular LLMs, we generated data that better aligns with real-world\napplications. Unlike previous studies, we employed heuristic rules to create\nadversarial LLM-generated text, simulating advanced prompt usages, human\nrevisions like word substitutions, and writing errors. Our development of\nDetectRL reveals the strengths and limitations of current SOTA detectors. More\nimportantly, we analyzed the potential impact of writing styles, model types,\nattack methods, text lengths, and real-world human writing factors on\ndifferent types of detectors.\n
We believe DetectRL could serve as an effective\nbenchmark for assessing detectors in real-world scenarios, evolving with\nadvanced attack methods, thus providing a more stressful evaluation to drive\nthe development of more efficient detectors. Data and code are publicly\navailable at: https://github.com/NLP2CT/DetectRL.\n","authors":["Junchao Wu","Runzhe Zhan","Derek F. Wong","Shu Yang","Xinyi Yang","Yulin Yuan","Lidia S. Chao"],"pdf_url":"https://arxiv.org/pdf/2410.23746v1.pdf","comment":"Accepted to NeurIPS 2024 Dataset & Benchmarking Track"},{"id":"http://arxiv.org/abs/2410.23743v1","updated":"2024-10-31T08:58:06Z","published":"2024-10-31T08:58:06Z","title":"What Happened in LLMs Layers when Trained for Fast vs. Slow Thinking: A\n Gradient Perspective","summary":" What makes a difference in the post-training of LLMs? We investigate the\ntraining patterns of different layers in large language models (LLMs), through\nthe lens of gradient, when training with different responses and initial\nmodels. We are specifically interested in how fast vs. slow thinking affects\nthe layer-wise gradients, given the recent popularity of training LLMs on\nreasoning paths such as chain-of-thoughts (CoT) and process rewards. In our\nstudy, fast thinking without CoT leads to larger gradients and larger\ndifferences in gradients across layers than slow thinking (detailed CoT),\nindicating the learning stability brought by the latter. Moreover, pre-trained\nLLMs are less affected by the instability of fast thinking than\ninstruction-tuned LLMs. Additionally, we study whether the gradient patterns\ncan reflect the correctness of responses when training different LLMs using\nslow vs. fast thinking paths. The results show that the gradients of slow\nthinking can distinguish correct and irrelevant reasoning paths. As a\ncomparison, we conduct similar gradient analyses on non-reasoning knowledge\nlearning tasks, on which, however, trivially increasing the response length\ndoes not lead to behaviors similar to those of slow thinking. Our study\nstrengthens the fundamental understanding of LLM training and offers novel\ninsights into its efficiency and stability, paving the way towards building a\ngeneralizable System-2 agent. Our code, data, and gradient statistics can be\nfound at: https://github.com/MingLiiii/Layer_Gradient.\n","authors":["Ming Li","Yanhong Li","Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.23743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14405v2","updated":"2024-10-31T08:44:13Z","published":"2024-10-18T12:08:07Z","title":"Fact Recall, Heuristics or Pure Guesswork? Precise Interpretations of\n Language Models for Fact Completion","summary":" Previous interpretations of language models (LMs) miss important distinctions\nin how these models process factual information. For example, given the query\n\"Astrid Lindgren was born in\" with the corresponding completion \"Sweden\", no\ndifference is made between whether the prediction was based on having the exact\nknowledge of the birthplace of the Swedish author or assuming that a person\nwith a Swedish-sounding name was born in Sweden. In this paper, we investigate\nfour different prediction scenarios for which the LM can be expected to show\ndistinct behaviors. These scenarios correspond to different levels of model\nreliability and types of information being processed, some being less\ndesirable for factual predictions.\n
To facilitate precise interpretations of LMs\nfor fact completion, we propose a model-specific recipe called PrISM for\nconstructing datasets with examples of each scenario based on a set of\ndiagnostic criteria. We apply a popular interpretability method, causal tracing\n(CT), to the four prediction scenarios and find that while CT produces\ndifferent results for each scenario, aggregations over a set of mixed examples\nmay only represent the results from the scenario with the strongest measured\nsignal. In summary, we contribute tools for a more granular study of fact\ncompletion in language models and analyses that provide a more nuanced\nunderstanding of how LMs process fact-related queries.\n","authors":["Denitsa Saynova","Lovisa Hagström","Moa Johansson","Richard Johansson","Marco Kuhlmann"],"pdf_url":"https://arxiv.org/pdf/2410.14405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00072v5","updated":"2024-10-31T08:35:42Z","published":"2024-06-21T08:52:11Z","title":"Pistis-RAG: Enhancing Retrieval-Augmented Generation with Human Feedback","summary":" RAG systems face limitations when semantic relevance alone does not guarantee\nimproved generation quality. This issue becomes particularly evident due to the\nsensitivity of large language models (LLMs) to the ordering of few-shot\nprompts, which can affect model performance. To address this challenge,\naligning LLM outputs with human preferences using structured feedback, such as\noptions to copy, regenerate, or dislike, offers a promising method for\nimprovement. This feedback is applied to the entire list of inputs rather than\ngiving specific ratings for individual documents, making it a Listwide Labels\nLearning-to-Rank task.\n To address this task, we propose Pistis-RAG, a new RAG framework designed\nwith a content-centric approach to better align LLMs with human preferences.\nPistis-RAG effectively utilizes human feedback, enhancing content ranking and\ngeneration quality. To validate our framework, we use public datasets to\nsimulate human feedback, allowing us to evaluate and refine our method\neffectively. Experimental results indicate that Pistis-RAG improves alignment\nwith human preferences relative to the baseline RAG system, showing a 6.06%\nincrease in MMLU (English) and a 7.08% increase in C-EVAL (Chinese) accuracy\nmetrics. These results highlight Pistis-RAG's effectiveness in overcoming the\nlimitations associated with traditional RAG approaches.\n","authors":["Yu Bai","Yukai Miao","Li Chen","Dawei Wang","Dan Li","Yanyu Ren","Hongtao Xie","Ce Yang","Xuhui Cai"],"pdf_url":"https://arxiv.org/pdf/2407.00072v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23728v1","updated":"2024-10-31T08:30:55Z","published":"2024-10-31T08:30:55Z","title":"GigaCheck: Detecting LLM-generated Content","summary":" With the increasing quality and spread of LLM-based assistants, the amount of\nartificially generated content is growing rapidly. In many cases and tasks,\nsuch texts are already indistinguishable from those written by humans, and the\nquality of generation tends to only increase. At the same time, detection\nmethods are developing more slowly, making it challenging to prevent misuse of\nthese technologies.\n In this work, we investigate the task of generated text detection by\nproposing the GigaCheck. Our research explores two approaches: (i)\ndistinguishing human-written texts from LLM-generated ones, and (ii) detecting\nLLM-generated intervals in Human-Machine collaborative texts. 
For the first\ntask, our approach utilizes a general-purpose LLM, leveraging its extensive\nlanguage abilities to fine-tune efficiently for the downstream task of\nLLM-generated text detection, achieving high performance even with limited\ndata. For the second task, we propose a novel approach that combines computer\nvision and natural language processing techniques. Specifically, we use a\nfine-tuned general-purpose LLM in conjunction with a DETR-like detection model,\nadapted from computer vision, to localize artificially generated intervals\nwithin text.\n We evaluate the GigaCheck on five classification datasets with English texts\nand three datasets designed for Human-Machine collaborative text analysis. Our\nresults demonstrate that GigaCheck outperforms previous methods, even in\nout-of-distribution settings, establishing a strong baseline across all\ndatasets.\n","authors":["Irina Tolstykh","Aleksandra Tsybina","Sergey Yakubson","Aleksandr Gordeev","Vladimir Dokholyan","Maksim Kuprashevich"],"pdf_url":"https://arxiv.org/pdf/2410.23728v1.pdf","comment":"11 pages, 1 figure"},{"id":"http://arxiv.org/abs/2410.23725v1","updated":"2024-10-31T08:24:37Z","published":"2024-10-31T08:24:37Z","title":"Artificial intelligence to improve clinical coding practice in\n Scandinavia: a crossover randomized controlled trial","summary":" \textbf{Trial design} Crossover randomized controlled trial. \textbf{Methods}\nAn AI tool, Easy-ICD, was developed to assist clinical coders and was tested\nfor improving both accuracy and time in a user study in Norway and Sweden.\nParticipants were randomly assigned to two groups, and crossed over between\ncoding complex (longer) texts versus simple (shorter) texts, while using our\ntool versus not using our tool. \textbf{Results} Based on the Mann-Whitney U\ntest, the median coding time difference for complex clinical text sequences was\n123 seconds (\emph{P}\textless.001, 95\% CI: 81 to 164), representing a 46\%\nreduction in median coding time when our tool is used. There was no significant\ntime difference for simpler text sequences. For coding accuracy, the\nimprovement we noted for both complex and simple texts was not significant.\n\textbf{Conclusions} This study demonstrates the potential of AI to transform\ncommon tasks in clinical workflows, with ostensible positive impacts on work\nefficiencies for complex clinical coding tasks. Further studies within hospital\nworkflows are required before these presumed impacts can be more clearly\nunderstood.\n","authors":["Taridzo Chomutare","Therese Olsen Svenning","Miguel Ángel Tejedor Hernández","Phuong Dinh Ngo","Andrius Budrionis","Kaisa Markljung","Lill Irene Hind","Torbjørn Torsvik","Karl Øyvind Mikalsen","Aleksandar Babic","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2410.23725v1.pdf","comment":"13 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.03978v3","updated":"2024-10-31T08:11:04Z","published":"2024-07-04T14:50:45Z","title":"Benchmarking Complex Instruction-Following with Multiple Constraints\n Composition","summary":" Instruction following is one of the fundamental capabilities of large\nlanguage models (LLMs). As the ability of LLMs is constantly improving, they\nhave been increasingly applied to deal with complex human instructions in\nreal-world scenarios. Therefore, how to evaluate the complex\ninstruction-following ability of LLMs has become a critical research problem.\n
Existing benchmarks mainly focus on modeling different types of constraints in\nhuman instructions while neglecting the composition of different constraints,\nwhich is an indispensable constituent of complex instructions. To this end, we\npropose ComplexBench, a benchmark for comprehensively evaluating the ability of\nLLMs to follow complex instructions composed of multiple constraints. We\npropose a hierarchical taxonomy for complex instructions, including 4\nconstraint types, 19 constraint dimensions, and 4 composition types, and\nmanually collect a high-quality dataset accordingly. To make the evaluation\nreliable, we augment LLM-based evaluators with rules to effectively verify\nwhether generated texts can satisfy each constraint and composition.\nFurthermore, we obtain the final evaluation score based on the dependency\nstructure determined by different composition types. ComplexBench identifies\nsignificant deficiencies in existing LLMs when dealing with complex\ninstructions that compose multiple constraints.\n","authors":["Bosi Wen","Pei Ke","Xiaotao Gu","Lindong Wu","Hao Huang","Jinfeng Zhou","Wenchuang Li","Binxin Hu","Wendy Gao","Jiaxin Xu","Yiming Liu","Jie Tang","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2407.03978v3.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2410.23703v1","updated":"2024-10-31T07:48:44Z","published":"2024-10-31T07:48:44Z","title":"OCEAN: Offline Chain-of-thought Evaluation and Alignment in Large\n Language Models","summary":" Offline evaluation of LLMs is crucial in understanding their capacities,\nthough current methods remain underexplored in existing research. In this work,\nwe focus on the offline evaluation of the chain-of-thought capabilities and\nshow how to optimize LLMs based on the proposed evaluation method. To enable\noffline feedback with rich knowledge and reasoning paths, we use knowledge\ngraphs (e.g., Wikidata5m) to provide feedback on the generated chain of\nthoughts. Due to the heterogeneity between LLM reasoning and KG structures,\ndirect interaction and feedback from KGs on LLM behavior are challenging, as\nthey require accurate entity linking and grounding of LLM-generated chains of\nthought in the KG. To address the above challenge, we propose an offline\nchain-of-thought evaluation framework, OCEAN, which models chain-of-thought\nreasoning in LLMs as an MDP and evaluates the policy's alignment with KG\npreference modeling. To overcome the reasoning heterogeneity and grounding\nproblems, we leverage on-policy KG exploration and RL to model a KG policy that\ngenerates token-level likelihood distributions for LLM-generated\nchain-of-thought reasoning paths, simulating KG reasoning preference. Then we\nincorporate the knowledge-graph feedback on the validity and alignment of the\ngenerated reasoning paths into inverse propensity scores and propose the KG-IPS\nestimator. Theoretically, we prove the unbiasedness of the proposed KG-IPS\nestimator and provide a lower bound on its variance. With the off-policy\nevaluated value function, we can directly enable off-policy optimization to\nfurther enhance chain-of-thought alignment.\n
Our empirical study shows that\nOCEAN can be efficiently optimized for generating chain-of-thought reasoning\npaths with higher estimated values without affecting LLMs' general abilities in\ndownstream tasks or their internal knowledge.\n","authors":["Junda Wu","Xintong Li","Ruoyu Wang","Yu Xia","Yuxin Xiong","Jianing Wang","Tong Yu","Xiang Chen","Branislav Kveton","Lina Yao","Jingbo Shang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2410.23703v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2406.07933v2","updated":"2024-10-31T07:36:39Z","published":"2024-06-12T06:56:20Z","title":"Large Language Model Unlearning via Embedding-Corrupted Prompts","summary":" Large language models (LLMs) have advanced to encompass extensive knowledge\nacross diverse domains. Yet controlling what a large language model should not\nknow is important for ensuring alignment and thus safe use. However, accurately\nand efficiently unlearning knowledge from an LLM remains challenging due to the\npotential collateral damage caused by the fuzzy boundary between retention and\nforgetting, and the large computational requirements for optimization across\nstate-of-the-art models with hundreds of billions of parameters. In this work,\nwe present \textbf{Embedding-COrrupted (ECO) Prompts}, a lightweight unlearning\nframework for large language models to address both the challenges of knowledge\nentanglement and unlearning efficiency. Instead of relying on the LLM itself to\nunlearn, we enforce an unlearned state during inference by employing a prompt\nclassifier to identify and safeguard prompts to forget. We learn corruptions\nadded to prompt embeddings offline, via zeroth-order optimization toward the\nunlearning objective, and corrupt prompts flagged by the classifier during\ninference. We find that these embedding-corrupted prompts not only lead to\ndesirable outputs that satisfy the unlearning objective but also closely\napproximate the output from a model that has never been trained on the data\nintended for forgetting. Through extensive experiments on unlearning, we\ndemonstrate the superiority of our method in achieving promising unlearning\nwith \textit{nearly zero side effects} in general domains and domains closely\nrelated to the unlearned ones. Additionally, we highlight the scalability of\nour method to 100 LLMs, ranging from 0.5B to 236B parameters, incurring no\nadditional cost as the number of parameters increases. We have made our code\npublicly available at \url{https://github.com/chrisliu298/llm-unlearn-eco}.\n","authors":["Chris Yuhao Liu","Yaxuan Wang","Jeffrey Flanigan","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2406.07933v2.pdf","comment":"NeurIPS 2024 Poster"},{"id":"http://arxiv.org/abs/2410.21647v2","updated":"2024-10-31T07:31:31Z","published":"2024-10-29T01:21:05Z","title":"Can Language Models Replace Programmers? REPOCOD Says 'Not Yet'","summary":" Large language models (LLMs) have achieved high accuracy, i.e., more than 90\npass@1, in solving Python coding problems in HumanEval and MBPP. Thus, a\nnatural question is whether LLMs can achieve code completion performance\ncomparable to that of human developers. Unfortunately, one cannot answer this\nquestion using existing manually crafted or simple (e.g., single-line) code\ngeneration benchmarks, since such tasks fail to represent real-world software\ndevelopment tasks.\n
In addition, existing benchmarks often use poor code\ncorrectness metrics, providing misleading conclusions.\n To address these challenges, we create REPOCOD, a code generation benchmark\nwith 980 problems collected from 11 popular real-world projects, with more than\n58% of them requiring file-level or repository-level context information. In\naddition, REPOCOD has the longest average canonical solution length (331.6\ntokens) and the highest average cyclomatic complexity (9.00) compared to\nexisting benchmarks. Each task in REPOCOD includes 313.5 developer-written test\ncases on average for better correctness evaluation. In our evaluations of ten\nLLMs, none of the models achieve more than 30 pass@1 on REPOCOD, indicating the\nnecessity of building stronger LLMs that can help developers in real-world\nsoftware development. REPOCOD is available at\nhttps://github.com/ltasset/REPOCOD\n","authors":["Shanchao Liang","Yiran Hu","Nan Jiang","Lin Tan"],"pdf_url":"https://arxiv.org/pdf/2410.21647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23692v1","updated":"2024-10-31T07:30:38Z","published":"2024-10-31T07:30:38Z","title":"Instruction-Tuning Llama-3-8B Excels in City-Scale Mobility Prediction","summary":" Human mobility prediction plays a critical role in applications such as\ndisaster response, urban planning, and epidemic forecasting. Traditional\nmethods often rely on hand-crafted, domain-specific models and typically\nfocus on short-term predictions, which struggle to generalize across diverse\nurban environments. In this study, we introduce Llama-3-8B-Mob, a large\nlanguage model fine-tuned with instruction tuning, for long-term citywide\nmobility prediction -- in a Q&A manner. We validate our approach using\nlarge-scale human mobility data from four metropolitan areas in Japan, focusing\non predicting individual trajectories over the next 15 days. The results\ndemonstrate that Llama-3-8B-Mob excels in modeling long-term human mobility --\nsurpassing the state-of-the-art on multiple prediction metrics. It also\ndisplays strong zero-shot generalization capabilities -- effectively\ngeneralizing to other cities even when fine-tuned only on limited samples from\na single city. Source codes are available at\nhttps://github.com/TANGHULU6/Llama3-8B-Mob.\n","authors":["Peizhi Tang","Chuang Yang","Tong Xing","Xiaohang Xu","Renhe Jiang","Kaoru Sezaki"],"pdf_url":"https://arxiv.org/pdf/2410.23692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17617v3","updated":"2024-10-31T07:23:09Z","published":"2023-12-29T14:25:22Z","title":"Large Language Models for Generative Information Extraction: A Survey","summary":" Information extraction (IE) aims to extract structural knowledge from plain\nnatural language texts. Recently, generative Large Language Models (LLMs) have\ndemonstrated remarkable capabilities in text understanding and generation. As a\nresult, numerous works have been proposed to integrate LLMs for IE tasks based\non a generative paradigm. To conduct a comprehensive systematic review and\nexploration of LLM efforts for IE tasks, in this study, we survey the most\nrecent advancements in this field. We first present an extensive overview by\ncategorizing these works in terms of various IE subtasks and techniques, and\nthen we empirically analyze the most advanced methods and discover the emerging\ntrend of IE tasks with LLMs.\n
Based on this thorough review, we identify several technical insights and\npromising research directions that deserve further exploration in future\nstudies. We maintain a public repository and consistently update related works\nand resources on GitHub\n(\href{https://github.com/quqxui/Awesome-LLM4IE-Papers}{LLM4IE repository}).\n","authors":["Derong Xu","Wei Chen","Wenjun Peng","Chao Zhang","Tong Xu","Xiangyu Zhao","Xian Wu","Yefeng Zheng","Yang Wang","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.17617v3.pdf","comment":"The article has been accepted by Frontiers of Computer Science (FCS),\n with the DOI: {10.1007/s11704-024-40555-y}. You can cite the FCS version"},{"id":"http://arxiv.org/abs/2410.23684v1","updated":"2024-10-31T07:19:44Z","published":"2024-10-31T07:19:44Z","title":"Improbable Bigrams Expose Vulnerabilities of Incomplete Tokens in\n Byte-Level Tokenizers","summary":" Tokenization is a crucial step that bridges human-readable text with\nmodel-readable discrete tokens. However, recent studies have revealed that\ntokenizers can be exploited to elicit unwanted model behaviors. In this work,\nwe investigate incomplete tokens, i.e., undecodable tokens with stray bytes\nresulting from byte-level byte-pair encoding (BPE) tokenization. We hypothesize\nthat such tokens are heavily reliant on their adjacent tokens and are fragile\nwhen paired with unfamiliar tokens. To demonstrate this vulnerability, we\nintroduce improbable bigrams: out-of-distribution combinations of incomplete\ntokens designed to exploit their dependency. Our experiments show that\nimprobable bigrams are significantly prone to hallucinatory behaviors.\nSurprisingly, alternative tokenizations of the same phrases result in\ndrastically lower rates of hallucination (93% reduction in Llama3.1). We\ncaution against the potential vulnerabilities introduced by byte-level BPE\ntokenizers, which may impede the development of trustworthy language models.\n","authors":["Eugene Jang","Kimin Lee","Jin-Woo Chung","Keuntae Park","Seungwon Shin"],"pdf_url":"https://arxiv.org/pdf/2410.23684v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.18406v3","updated":"2024-10-31T23:27:09Z","published":"2024-05-28T17:46:36Z","title":"RACCooN: A Versatile Instructional Video Editing Framework with\n Auto-Generated Narratives","summary":" Recent video generative models primarily rely on carefully written text\nprompts for specific tasks, like inpainting or style editing. They require\nlabor-intensive textual descriptions for input videos, hindering their\nflexibility to adapt personal/raw videos to user specifications. This paper\nproposes RACCooN, a versatile and user-friendly video-to-paragraph-to-video\ngenerative framework that supports multiple video editing capabilities such as\nremoval, addition, and modification, through a unified pipeline. RACCooN\nconsists of two principal stages: Video-to-Paragraph (V2P) and\nParagraph-to-Video (P2V). In the V2P stage, we automatically describe video\nscenes in well-structured natural language, capturing both the holistic context\nand focused object details. Subsequently, in the P2V stage, users can\noptionally refine these descriptions to guide the video diffusion model,\nenabling various modifications to the input video, such as removing, changing\nsubjects, and/or adding new objects.\n
The proposed approach stands out from\nother methods through several significant contributions: (1) RACCooN suggests a\nmulti-granular spatiotemporal pooling strategy to generate well-structured\nvideo descriptions, capturing both the broad context and object details without\nrequiring complex human annotations, simplifying precise video content editing\nbased on text for users. (2) Our video generative model incorporates\nauto-generated narratives or instructions to enhance the quality and accuracy\nof the generated content. (3) RACCooN also plans to imagine new objects in a\ngiven video, so users simply prompt the model to receive a detailed video\nediting plan for complex video editing. The proposed framework demonstrates\nimpressively versatile capabilities in video-to-paragraph generation and video\ncontent editing, and can be incorporated into other SoTA video generative\nmodels for further enhancement.\n","authors":["Jaehong Yoon","Shoubin Yu","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2405.18406v3.pdf","comment":"The first two authors contribute equally. Project Page:\n https://raccoon-mllm-gen.github.io/"},{"id":"http://arxiv.org/abs/2407.17453v2","updated":"2024-10-31T23:23:22Z","published":"2024-07-24T17:37:05Z","title":"VILA$^2$: VILA Augmented VILA","summary":" While visual language model architectures and training infrastructures\nadvance rapidly, data curation remains under-explored, with data quantity and\nquality becoming a bottleneck. Existing work either crawls extra Internet data\nwith a loose guarantee of quality or distills from black-box proprietary\nmodels, e.g., GPT-4V / Gemini, which are bounded by API call frequency and\nperformance. This work enables a VLM to improve itself via data enhancement,\nexploiting its generative nature. We introduce a simple yet effective VLM\naugmentation scheme that includes a self-augment step and a specialist-augment\nstep to iteratively improve data quality and hence model performance. In the\nself-augment step, the instruction-finetuned VLM recaptions its pretraining\ncaption datasets and then retrains from scratch leveraging the refined data.\nWithout any expensive human-in-the-loop annotation, we observe improvements in\ndata quality and downstream accuracy boosts with three self-augmentation rounds\n-- a viable free lunch to the current VLM training recipe. When\nself-augmentation saturates, we augment the caption diversity by leveraging\nspecialty skills picked up from instruction finetuning. We finetune VLM\nspecialists from the self-augmented VLM with domain-specific experts, including\nspatial, grounding, and OCR, to fuse task-aware synthetic data into the\npretraining stage. Data quality improvements and hallucination reductions are\ncross-checked by VLM (GPT-4V, Gemini) and human judges.\n
Combining self-augmentation and specialist-augmented training,\nVILA$^2$ consistently improves the accuracy on a wide range of benchmarks over\nthe prior art, producing a reusable pretraining dataset that is 300x more\ncost-efficient than human labeling.\n","authors":["Yunhao Fang","Ligeng Zhu","Yao Lu","Yan Wang","Pavlo Molchanov","Jan Kautz","Jang Hyun Cho","Marco Pavone","Song Han","Hongxu Yin"],"pdf_url":"https://arxiv.org/pdf/2407.17453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09614v2","updated":"2024-10-31T21:43:04Z","published":"2024-10-12T18:28:56Z","title":"Exploring Behavior-Relevant and Disentangled Neural Dynamics with\n Generative Diffusion Models","summary":" Understanding the neural basis of behavior is a fundamental goal in\nneuroscience. Current research in large-scale neuro-behavioral data analysis\noften relies on decoding models, which quantify behavioral information in\nneural data but lack details on behavior encoding. This raises an intriguing\nscientific question: ``how can we enable in-depth exploration of neural\nrepresentations in behavioral tasks, revealing interpretable neural dynamics\nassociated with behaviors''. However, addressing this issue is challenging due\nto the varied behavioral encoding across different brain regions and mixed\nselectivity at the population level. To tackle this limitation, our approach,\nnamed ``BeNeDiff'', first identifies a fine-grained and disentangled neural\nsubspace using a behavior-informed latent variable model. It then employs\nstate-of-the-art generative diffusion models to synthesize behavior videos that\ninterpret the neural dynamics of each latent factor. We validate the method on\nmulti-session datasets containing widefield calcium imaging recordings across\nthe dorsal cortex. Through guiding the diffusion model to activate individual\nlatent factors, we verify that the neural dynamics of latent factors in the\ndisentangled neural subspace provide interpretable quantifications of the\nbehaviors of interest. At the same time, the neural subspace in BeNeDiff\ndemonstrates high disentanglement and neural reconstruction quality.\n","authors":["Yule Wang","Chengrui Li","Weihan Li","Anqi Wu"],"pdf_url":"https://arxiv.org/pdf/2410.09614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13770v2","updated":"2024-10-31T21:21:26Z","published":"2024-06-19T18:38:11Z","title":"Elliptical Attention","summary":" Pairwise dot-product self-attention is key to the success of transformers\nthat achieve state-of-the-art performance across a variety of applications in\nlanguage and vision. This dot-product self-attention computes attention weights\namong the input tokens using Euclidean distance, which makes the model prone to\nrepresentation collapse and vulnerable to contaminated samples. In this paper,\nwe propose using a Mahalanobis distance metric for computing the attention\nweights to stretch the underlying feature space in directions of high\ncontextual relevance. In particular, we define a hyper-ellipsoidal neighborhood\naround each query to increase the attention weights of the tokens lying in the\ncontextually important directions. We term this novel class of attention\nElliptical Attention. Our Elliptical Attention provides two benefits: 1)\nreducing representation collapse and 2) enhancing the model's robustness as\nElliptical Attention pays more attention to contextually relevant information\nrather than focusing on some small subset of informative features. 
We empirically demonstrate the advantages of Elliptical Attention over the\nbaseline dot-product attention and state-of-the-art attention methods on\nvarious practical tasks, including object classification, image segmentation,\nand language modeling across different data modalities.\n","authors":["Stefan K. Nielsen","Laziz U. Abdullaev","Rachel S. Y. Teo","Tan M. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.13770v2.pdf","comment":"10 pages in the main text. Published at NeurIPS 2024. The code is\n available at https://github.com/stefvk/Elliptical-Attention"},{"id":"http://arxiv.org/abs/2410.23109v2","updated":"2024-10-31T20:48:34Z","published":"2024-10-30T15:20:10Z","title":"NASM: Neural Anisotropic Surface Meshing","summary":" This paper introduces a new learning-based method, NASM, for anisotropic\nsurface meshing. Our key idea is to propose a graph neural network to embed an\ninput mesh into a high-dimensional (high-d) Euclidean embedding space to\npreserve a curvature-based anisotropic metric by using a dot product loss\nbetween high-d edge vectors. This can dramatically reduce the computational\ntime and increase the scalability. Then, we propose a novel feature-sensitive\nremeshing on the generated high-d embedding to automatically capture sharp\ngeometric features. We define a high-d normal metric, and then derive an\nautomatic differentiation on a high-d centroidal Voronoi tessellation (CVT)\noptimization with the normal metric to simultaneously preserve geometric\nfeatures and curvature anisotropy exhibited in the original 3D shapes. To our\nknowledge, this is the first time that a deep learning framework and a large\ndataset are proposed to construct a high-d Euclidean embedding space for 3D\nanisotropic surface meshing. Experimental results are evaluated and compared\nwith the state-of-the-art in anisotropic surface meshing on a large number of\nsurface models from the Thingi10K dataset as well as tested on extensive unseen\n3D shapes from the Multi-Garment Network dataset and the FAUST human dataset.\n","authors":["Hongbo Li","Haikuan Zhu","Sikai Zhong","Ningna Wang","Cheng Lin","Xiaohu Guo","Shiqing Xin","Wenping Wang","Jing Hua","Zichun Zhong"],"pdf_url":"https://arxiv.org/pdf/2410.23109v2.pdf","comment":"SIGGRAPH Asia 2024 (Conference Track)"},{"id":"http://arxiv.org/abs/2410.17514v3","updated":"2024-10-31T20:15:43Z","published":"2024-10-23T02:38:12Z","title":"SRA: A Novel Method to Improve Feature Embedding in Self-supervised\n Learning for Histopathological Images","summary":" Self-supervised learning has become a cornerstone in various areas,\nparticularly histopathological image analysis. Image augmentation plays a\ncrucial role in self-supervised learning, as it generates variations in image\nsamples. However, traditional image augmentation techniques often overlook the\nunique characteristics of histopathological images. In this paper, we propose a\nnew histopathology-specific image augmentation method called stain\nreconstruction augmentation (SRA). We integrate our SRA with MoCo v3, a leading\nmodel in self-supervised contrastive learning, along with our additional\ncontrastive loss terms, and call the new model SRA-MoCo v3. We demonstrate that\nour SRA-MoCo v3 always outperforms the standard MoCo v3 across various\ndownstream tasks and achieves comparable or superior performance to other\nfoundation models pre-trained on significantly larger histopathology datasets.\n","authors":["Hamid Manoochehri","Bodong Zhang","Beatrice S. Knudsen","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2410.17514v3.pdf","comment":"Hamid Manoochehri and Bodong Zhang contributed equally to this work"},{"id":"http://arxiv.org/abs/2405.17537v2","updated":"2024-10-31T20:07:53Z","published":"2024-05-27T17:57:48Z","title":"CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale","summary":" Measuring biodiversity is crucial for understanding ecosystem health. While\nprior works have developed machine learning models for taxonomic classification\nof photographic images and DNA separately, in this work, we introduce a\nmultimodal approach combining both, using CLIP-style contrastive learning to\nalign images, barcode DNA, and text-based representations of taxonomic labels\nin a unified embedding space. This allows for accurate classification of both\nknown and unknown insect species without task-specific fine-tuning, leveraging\ncontrastive learning for the first time to fuse DNA and image data. Our method\nsurpasses previous single-modality approaches in accuracy by over 8% on\nzero-shot learning tasks, showcasing its effectiveness in biodiversity studies.\n","authors":["ZeMing Gong","Austin T. Wang","Xiaoliang Huo","Joakim Bruslund Haurum","Scott C. Lowe","Graham W. Taylor","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2405.17537v2.pdf","comment":"25 pages with 11 figures"},{"id":"http://arxiv.org/abs/2410.21302v2","updated":"2024-10-31T19:44:26Z","published":"2024-10-21T22:52:25Z","title":"Domain-Adaptive Pre-training of Self-Supervised Foundation Models for\n Medical Image Classification in Gastrointestinal Endoscopy","summary":" Video capsule endoscopy has transformed gastrointestinal endoscopy (GIE)\ndiagnostics by offering a non-invasive method for capturing detailed images of\nthe gastrointestinal tract, enabling early disease detection. However, its\npotential is limited by the sheer volume of images generated during the imaging\nprocedure, which can take anywhere from 6 to 8 hours and often produce up to 1\nmillion images, necessitating automated analysis. Additionally, the variability\nof these images, combined with the need for expert annotations and the scarcity\nof large, high-quality labeled datasets, constrains the effectiveness of\ncurrent medical image analysis models. To address this, we introduce a novel\nlarge gastrointestinal endoscopy dataset, called EndoExtend24, created by\nmerging and re-stratifying the train/test splits of ten existing public and\nprivate datasets, ensuring no overlap of patient data across splits.\nEndoExtend24 includes over 226,000 labeled images, as well as dynamic class\nmappings, which allow unified training across datasets with differing labeling\ngranularity, supporting up to 123 distinct pathological findings. Further, we\npropose to leverage domain-adaptive pre-training of foundation models in\ncomputer vision trained with self-supervision on generic image data, to adapt\nthem to the task of GIE medical diagnosis. Specifically, the EVA-02 model,\nwhich is based on the vision transformer architecture and was trained on\nImageNet-22k with masked image modeling (using EVA-CLIP as a MIM teacher), is\npre-trained on the novel EndoExtend24 dataset to achieve domain adaptation, and\nfinally trained on the Capsule Endoscopy 2024 Challenge dataset. Experimental\nresults demonstrate strong performance with an F1 score of 0.88, an improvement\nof about 39% over the baseline model's F1 score of 0.49.\n
Additionally, the\nmodel achieved a macro AUC score of 0.993 and a balanced accuracy of 89.3%.\n","authors":["Marcel Roth","Micha V. Nowak","Adrian Krenzer","Frank Puppe"],"pdf_url":"https://arxiv.org/pdf/2410.21302v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18923v2","updated":"2024-10-31T19:44:05Z","published":"2024-10-24T17:11:52Z","title":"SegLLM: Multi-round Reasoning Segmentation","summary":" We present SegLLM, a novel multi-round interactive reasoning segmentation\nmodel that enhances LLM-based segmentation by exploiting conversational memory\nof both visual and textual outputs. By leveraging a mask-aware multimodal LLM,\nSegLLM re-integrates previous segmentation results into its input stream,\nenabling it to reason about complex user intentions and segment objects in\nrelation to previously identified entities, including positional,\ninteractional, and hierarchical relationships, across multiple interactions.\nThis capability allows SegLLM to respond to visual and text queries in a\nchat-like manner. Evaluated on the newly curated MRSeg benchmark, SegLLM\noutperforms existing methods in multi-round interactive reasoning segmentation\nby over 20%. Additionally, we observed that training on multi-round reasoning\nsegmentation data enhances performance on standard single-round referring\nsegmentation and localization tasks, resulting in a 5.5% increase in cIoU for\nreferring expression segmentation and a 4.5% improvement in Acc@0.5 for\nreferring expression localization.\n","authors":["XuDong Wang","Shaolun Zhang","Shufan Li","Konstantinos Kallidromitis","Kehan Li","Yusuke Kato","Kazuki Kozuka","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2410.18923v2.pdf","comment":"22 pages, 10 figures, 11 tables"},{"id":"http://arxiv.org/abs/2409.09313v2","updated":"2024-10-31T19:36:24Z","published":"2024-09-14T05:17:04Z","title":"Tensor-Based Synchronization and the Low-Rankness of the Block Trifocal\n Tensor","summary":" The block tensor of trifocal tensors provides crucial geometric information\non the three-view geometry of a scene. The underlying synchronization problem\nseeks to recover camera poses (locations and orientations up to a global\ntransformation) from the block trifocal tensor. We establish an explicit Tucker\nfactorization of this tensor, revealing a low multilinear rank of $(6,4,4)$\nindependent of the number of cameras under appropriate scaling conditions. We\nprove that this rank constraint provides sufficient information for camera\nrecovery in the noiseless case. The constraint motivates a synchronization\nalgorithm based on the higher-order singular value decomposition of the block\ntrifocal tensor. Experimental comparisons with state-of-the-art global\nsynchronization methods on real datasets demonstrate the potential of this\nalgorithm for significantly improving location estimation accuracy. Overall\nthis work suggests that higher-order interactions in synchronization problems\ncan be exploited to improve performance, beyond the usual pairwise-based\napproaches.\n","authors":["Daniel Miao","Gilad Lerman","Joe Kileel"],"pdf_url":"https://arxiv.org/pdf/2409.09313v2.pdf","comment":"33 pages, 3 figures. 
Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.12649v3","updated":"2024-10-31T19:30:46Z","published":"2024-06-18T14:17:57Z","title":"Probabilistic Conceptual Explainers: Trustworthy Conceptual Explanations\n for Vision Foundation Models","summary":" Vision transformers (ViTs) have emerged as a significant area of focus,\nparticularly for their capacity to be jointly trained with large language\nmodels and to serve as robust vision foundation models. Yet, the development of\ntrustworthy explanation methods for ViTs has lagged, particularly in the\ncontext of post-hoc interpretations of ViT predictions. Existing sub-image\nselection approaches, such as feature-attribution and conceptual models, fall\nshort in this regard. This paper proposes five desiderata for explaining ViTs\n-- faithfulness, stability, sparsity, multi-level structure, and parsimony --\nand demonstrates the inadequacy of current methods in meeting these criteria\ncomprehensively. We introduce a variational Bayesian explanation framework,\ndubbed ProbAbilistic Concept Explainers (PACE), which models the distributions\nof patch embeddings to provide trustworthy post-hoc conceptual explanations.\nOur qualitative analysis reveals the distributions of patch-level concepts,\nelucidating the effectiveness of ViTs by modeling the joint distribution of\npatch embeddings and ViT's predictions. Moreover, these patch-level\nexplanations bridge the gap between image-level and dataset-level explanations,\nthus completing the multi-level structure of PACE. Through extensive\nexperiments on both synthetic and real-world datasets, we demonstrate that PACE\nsurpasses state-of-the-art methods in terms of the defined desiderata.\n","authors":["Hengyi Wang","Shiwei Tan","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.12649v3.pdf","comment":"Proceedings of the 41st International Conference on Machine Learning\n (ICML 2024)"},{"id":"http://arxiv.org/abs/2410.22530v2","updated":"2024-10-31T19:25:40Z","published":"2024-10-29T20:53:01Z","title":"Adaptive Aggregation Weights for Federated Segmentation of Pancreas MRI","summary":" Federated learning (FL) enables collaborative model training across\ninstitutions without sharing sensitive data, making it an attractive solution\nfor medical imaging tasks. However, traditional FL methods, such as Federated\nAveraging (FedAvg), face difficulties in generalizing across domains due to\nvariations in imaging protocols and patient demographics across institutions.\nThis challenge is particularly evident in pancreas MRI segmentation, where\nanatomical variability and imaging artifacts significantly impact performance.\nIn this paper, we conduct a comprehensive evaluation of FL algorithms for\npancreas MRI segmentation and introduce a novel approach that incorporates\nadaptive aggregation weights. By dynamically adjusting the contribution of each\nclient during model aggregation, our method accounts for domain-specific\ndifferences and improves generalization across heterogeneous datasets.\nExperimental results demonstrate that our approach enhances segmentation\naccuracy and reduces the impact of domain shift compared to conventional FL\nmethods while maintaining privacy-preserving capabilities. Significant\nperformance improvements are observed across multiple hospitals (centers).\n","authors":["Hongyi Pan","Gorkem Durak","Zheyuan Zhang","Yavuz Taktak","Elif Keles","Halil Ertugrul Aktas","Alpay Medetalibeyoglu","Yury Velichko","Concetto Spampinato","Ivo Schoots","Marco J. Bruno","Rajesh N. 
Keswani","Pallavi Tiwari","Candice Bolan","Tamas Gonda","Michael G. Goggins","Michael B. Wallace","Ziyue Xu","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2410.22530v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14979v3","updated":"2024-10-31T19:08:00Z","published":"2024-07-20T21:06:33Z","title":"RGB2Point: 3D Point Cloud Generation from Single RGB Images","summary":" We introduce RGB2Point, an unposed single-view RGB image to a 3D point cloud\ngeneration based on Transformer. RGB2Point takes an input image of an object\nand generates a dense 3D point cloud. Contrary to prior works based on CNN\nlayers and diffusion denoising approaches, we use pre-trained Transformer\nlayers that are fast and generate high-quality point clouds with consistent\nquality over available categories. Our generated point clouds demonstrate high\nquality on a real-world dataset, as evidenced by improved Chamfer distance\n(51.15%) and Earth Mover's distance (45.96%) metrics compared to the current\nstate-of-the-art. Additionally, our approach shows a better quality on a\nsynthetic dataset, achieving better Chamfer distance (39.26%), Earth Mover's\ndistance (26.95%), and F-score (47.16%). Moreover, our method produces 63.1%\nmore consistent high-quality results across various object categories compared\nto prior works. Furthermore, RGB2Point is computationally efficient, requiring\nonly 2.3GB of VRAM to reconstruct a 3D point cloud from a single RGB image, and\nour implementation generates the results 15,133x faster than a SOTA\ndiffusion-based model.\n","authors":["Jae Joong Lee","Bedrich Benes"],"pdf_url":"https://arxiv.org/pdf/2407.14979v3.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2410.23191v2","updated":"2024-10-31T18:19:02Z","published":"2024-10-30T16:45:59Z","title":"Continuous Spatio-Temporal Memory Networks for 4D Cardiac Cine MRI\n Segmentation","summary":" Current cardiac cine magnetic resonance image (cMR) studies focus on the end\ndiastole (ED) and end systole (ES) phases, while ignoring the abundant temporal\ninformation in the whole image sequence. This is because whole sequence\nsegmentation is currently a tedious process and inaccurate. Conventional whole\nsequence segmentation approaches first estimate the motion field between\nframes, which is then used to propagate the mask along the temporal axis.\nHowever, the mask propagation results could be prone to error, especially for\nthe basal and apex slices, where through-plane motion leads to significant\nmorphology and structural change during the cardiac cycle. Inspired by recent\nadvances in video object segmentation (VOS), based on spatio-temporal memory\n(STM) networks, we propose a continuous STM (CSTM) network for semi-supervised\nwhole heart and whole sequence cMR segmentation. Our CSTM network takes full\nadvantage of the spatial, scale, temporal and through-plane continuity prior of\nthe underlying heart anatomy structures, to achieve accurate and fast 4D\nsegmentation. 
Results of extensive experiments across multiple cMR datasets\nshow that our method can improve the 4D cMR segmentation performance,\nespecially for the hard-to-segment regions.\n","authors":["Meng Ye","Bingyu Xin","Leon Axel","Dimitris Metaxas"],"pdf_url":"https://arxiv.org/pdf/2410.23191v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2407.06192v2","updated":"2024-10-31T18:16:38Z","published":"2024-07-08T17:59:57Z","title":"Multi-Object Hallucination in Vision-Language Models","summary":" Large vision language models (LVLMs) often suffer from object hallucination,\nproducing objects not present in the given images. While current benchmarks for\nobject hallucination primarily concentrate on the presence of a single object\nclass rather than individual entities, this work systematically investigates\nmulti-object hallucination, examining how models misperceive (e.g., invent\nnonexistent objects or become distracted) when tasked with focusing on multiple\nobjects simultaneously. We introduce Recognition-based Object Probing\nEvaluation (ROPE), an automated evaluation protocol that considers the\ndistribution of object classes within a single image during testing and uses\nvisual referring prompts to eliminate ambiguity. With comprehensive empirical\nstudies and analysis of potential factors leading to multi-object\nhallucination, we found that (1) LVLMs suffer more hallucinations when\nfocusing on multiple objects compared to a single object; (2) the tested\nobject class distribution affects hallucination behaviors, indicating that\nLVLMs may follow shortcuts and spurious correlations; and (3) hallucinatory\nbehaviors are influenced by data-specific factors, salience and frequency, and\nmodel intrinsic behaviors. We hope to enable LVLMs to recognize and reason\nabout multiple objects that often occur in realistic visual scenes, provide\ninsights, and quantify our progress towards mitigating the issues.\n","authors":["Xuweiyi Chen","Ziqiao Ma","Xuejun Zhang","Sihan Xu","Shengyi Qian","Jianing Yang","David F. Fouhey","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2407.06192v2.pdf","comment":"Accepted to NeurIPS 2024 | Project page:\n https://multi-object-hallucination.github.io/"},{"id":"http://arxiv.org/abs/2410.07801v3","updated":"2024-10-31T18:06:49Z","published":"2024-10-10T10:40:42Z","title":"LucidGrasp: Robotic Framework for Autonomous Manipulation of Laboratory\n Equipment with Different Degrees of Transparency via 6D Pose Estimation","summary":" Many modern robotic systems operate autonomously; however, they often lack\nthe ability to accurately analyze the environment and adapt to changing\nexternal conditions, while teleoperation systems often require special operator\nskills. In the field of laboratory automation, the number of automated\nprocesses is growing; however, such systems are usually developed to perform\nspecific tasks. In addition, many of the objects used in this field are\ntransparent, making it difficult to analyze them using visual channels. The\ncontributions of this work include the development of a robotic framework with\nan autonomous mode for manipulating liquid-filled objects with different\ndegrees of transparency in complex pose combinations. The conducted experiments\ndemonstrated the robustness of the designed visual perception system to\naccurately estimate object poses for autonomous manipulation, and confirmed the\nperformance of the algorithms in dexterous operations such as liquid\ndispensing.\n
The proposed\nrobotic framework can be applied for laboratory automation, since it allows\nsolving the problem of performing non-trivial manipulation tasks with the\nanalysis of object poses of varying degrees of transparency and liquid levels,\nrequiring high accuracy and repeatability.\n","authors":["Maria Makarova","Daria Trinitatova","Qian Liu","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2410.07801v3.pdf","comment":"Accepted to the 2024 IEEE International Conference on Robotics and\n Biomimetics (IEEE ROBIO 2024), 6 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.23277v2","updated":"2024-10-31T18:03:51Z","published":"2024-10-30T17:55:52Z","title":"SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video\n Generation","summary":" Human beings are endowed with a complementary learning system, which bridges\nthe slow learning of general world dynamics with fast storage of episodic\nmemory from a new experience. Previous video generation models, however,\nprimarily focus on slow learning by pre-training on vast amounts of data,\noverlooking the fast learning phase crucial for episodic memory storage. This\noversight leads to inconsistencies across temporally distant frames when\ngenerating longer videos, as these frames fall beyond the model's context\nwindow. To this end, we introduce SlowFast-VGen, a novel dual-speed learning\nsystem for action-driven long video generation. Our approach incorporates a\nmasked conditional video diffusion model for the slow learning of world\ndynamics, alongside an inference-time fast learning strategy based on a\ntemporal LoRA module. Specifically, the fast learning process updates its\ntemporal LoRA parameters based on local inputs and outputs, thereby efficiently\nstoring episodic memory in its parameters. We further propose a slow-fast\nlearning loop algorithm that seamlessly integrates the inner fast learning loop\ninto the outer slow learning loop, enabling the recall of prior multi-episode\nexperiences for context-aware skill learning. To facilitate the slow learning\nof an approximate world model, we collect a large-scale dataset of 200k videos\nwith language action annotations, covering a wide range of scenarios. Extensive\nexperiments show that SlowFast-VGen outperforms baselines across various\nmetrics for action-driven video generation, achieving an FVD score of 514\ncompared to 782, and maintaining consistency in longer videos, with an average\nof 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm\nsignificantly enhances performances on long-horizon planning tasks as well.\nProject Website: https://slowfast-vgen.github.io\n","authors":["Yining Hong","Beide Liu","Maxine Wu","Yuanhao Zhai","Kai-Wei Chang","Linjie Li","Kevin Lin","Chung-Ching Lin","Jianfeng Wang","Zhengyuan Yang","Yingnian Wu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24223v1","updated":"2024-10-31T17:59:56Z","published":"2024-10-31T17:59:56Z","title":"URAvatar: Universal Relightable Gaussian Codec Avatars","summary":" We present a new approach to creating photorealistic and relightable head\navatars from a phone scan with unknown illumination. The reconstructed avatars\ncan be animated and relit in real time with the global illumination of diverse\nenvironments. 
Unlike existing approaches that estimate parametric reflectance\nparameters via inverse rendering, our approach directly models learnable\nradiance transfer that incorporates global light transport in an efficient\nmanner for real-time rendering. However, learning such a complex light\ntransport that can generalize across identities is non-trivial. A phone scan in\na single environment lacks sufficient information to infer how the head would\nappear in general environments. To address this, we build a universal\nrelightable avatar model represented by 3D Gaussians. We train on hundreds of\nhigh-quality multi-view human scans with controllable point lights.\nHigh-resolution geometric guidance further enhances the reconstruction accuracy\nand generalization. Once trained, we finetune the pretrained model on a phone\nscan using inverse rendering to obtain a personalized relightable avatar. Our\nexperiments establish the efficacy of our design, outperforming existing\napproaches while retaining real-time rendering capability.\n","authors":["Junxuan Li","Chen Cao","Gabriel Schwartz","Rawal Khirodkar","Christian Richardt","Tomas Simon","Yaser Sheikh","Shunsuke Saito"],"pdf_url":"https://arxiv.org/pdf/2410.24223v1.pdf","comment":"SIGGRAPH Asia 2024. Website:\n https://junxuan-li.github.io/urgca-website/"},{"id":"http://arxiv.org/abs/2410.24221v1","updated":"2024-10-31T17:59:55Z","published":"2024-10-31T17:59:55Z","title":"EgoMimic: Scaling Imitation Learning via Egocentric Video","summary":" The scale and diversity of demonstration data required for imitation learning\nis a significant challenge. We present EgoMimic, a full-stack framework which\nscales manipulation via human embodiment data, specifically egocentric human\nvideos paired with 3D hand tracking. EgoMimic achieves this through: (1) a\nsystem to capture human embodiment data using the ergonomic Project Aria\nglasses, (2) a low-cost bimanual manipulator that minimizes the kinematic gap\nto human data, (3) cross-domain data alignment techniques, and (4) an imitation\nlearning architecture that co-trains on human and robot data. Compared to prior\nworks that only extract high-level intent from human videos, our approach\ntreats human and robot data equally as embodied demonstration data and learns a\nunified policy from both data sources. EgoMimic achieves significant\nimprovement on a diverse set of long-horizon, single-arm and bimanual\nmanipulation tasks over state-of-the-art imitation learning methods and enables\ngeneralization to entirely new scenes. Finally, we show a favorable scaling\ntrend for EgoMimic, where adding 1 hour of additional hand data is\nsignificantly more valuable than 1 hour of additional robot data. Videos and\nadditional information can be found at https://egomimic.github.io/\n","authors":["Simar Kareer","Dhruv Patel","Ryan Punamiya","Pranay Mathur","Shuo Cheng","Chen Wang","Judy Hoffman","Danfei Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24219v1","updated":"2024-10-31T17:59:53Z","published":"2024-10-31T17:59:53Z","title":"Enhancing Motion in Text-to-Video Generation with Decomposed Encoding\n and Conditioning","summary":" Despite advancements in Text-to-Video (T2V) generation, producing videos with\nrealistic motion remains challenging. Current models often yield static or\nminimally dynamic outputs, failing to capture complex motions described by\ntext. 
This issue stems from the internal biases in text encoding, which\noverlooks motions, and inadequate conditioning mechanisms in T2V generation\nmodels. To address this, we propose a novel framework called DEcomposed MOtion\n(DEMO), which enhances motion synthesis in T2V generation by decomposing both\ntext encoding and conditioning into content and motion components. Our method\nincludes a content encoder for static elements and a motion encoder for\ntemporal dynamics, alongside separate content and motion conditioning\nmechanisms. Crucially, we introduce text-motion and video-motion supervision to\nimprove the model's understanding and generation of motion. Evaluations on\nbenchmarks such as MSR-VTT, UCF-101, WebVid-10M, EvalCrafter, and VBench\ndemonstrate DEMO's superior ability to produce videos with enhanced motion\ndynamics while maintaining high visual quality. Our approach significantly\nadvances T2V generation by integrating comprehensive motion understanding\ndirectly from textual descriptions. Project page:\nhttps://PR-Ryan.github.io/DEMO-project/\n","authors":["Penghui Ruan","Pichao Wang","Divya Saxena","Jiannong Cao","Yuhui Shi"],"pdf_url":"https://arxiv.org/pdf/2410.24219v1.pdf","comment":"Accepted at NeurIPS 2024, code available at\n https://github.com/PR-Ryan/DEMO"},{"id":"http://arxiv.org/abs/2410.24218v1","updated":"2024-10-31T17:59:52Z","published":"2024-10-31T17:59:52Z","title":"Teaching Embodied Reinforcement Learning Agents: Informativeness and\n Diversity of Language Use","summary":" In real-world scenarios, it is desirable for embodied agents to have the\nability to leverage human language to gain explicit or implicit knowledge for\nlearning tasks. Despite recent progress, most previous approaches adopt simple\nlow-level instructions as language inputs, which may not reflect natural human\ncommunication. It's not clear how to incorporate rich language use to\nfacilitate task learning. To address this question, this paper studies\ndifferent types of language inputs in facilitating reinforcement learning (RL)\nembodied agents. More specifically, we examine how different levels of language\ninformativeness (i.e., feedback on past behaviors and future guidance) and\ndiversity (i.e., variation of language expressions) impact agent learning and\ninference. Our empirical results based on four RL benchmarks demonstrate that\nagents trained with diverse and informative language feedback can achieve\nenhanced generalization and fast adaptation to new tasks. These findings\nhighlight the pivotal role of language use in teaching embodied agents new\ntasks in an open world. Project website:\nhttps://github.com/sled-group/Teachable_RL\n","authors":["Jiajun Xi","Yinong He","Jianing Yang","Yinpei Dai","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2410.24218v1.pdf","comment":"EMNLP 2024 Main. Project website:\n https://github.com/sled-group/Teachable_RL"},{"id":"http://arxiv.org/abs/2410.24214v1","updated":"2024-10-31T17:59:37Z","published":"2024-10-31T17:59:37Z","title":"ARQ: A Mixed-Precision Quantization Framework for Accurate and\n Certifiably Robust DNNs","summary":" Mixed precision quantization has become an important technique for enabling\nthe execution of deep neural networks (DNNs) on limited resource computing\nplatforms. Traditional quantization methods have primarily concentrated on\nmaintaining neural network accuracy, either ignoring the impact of quantization\non the robustness of the network, or using only empirical techniques for\nimproving robustness. 
In contrast, techniques for robustness certification,\nwhich can provide strong guarantees about the robustness of DNNs have not been\nused during quantization due to their high computation cost.\n This paper introduces ARQ, an innovative mixed-precision quantization method\nthat not only preserves the clean accuracy of the smoothed classifiers but also\nmaintains their certified robustness. ARQ uses reinforcement learning to find\naccurate and robust DNN quantization, while efficiently leveraging randomized\nsmoothing, a popular class of statistical DNN verification algorithms, to guide\nthe search process.\n We compare ARQ with multiple state-of-the-art quantization techniques on\nseveral DNN architectures commonly used in quantization studies: ResNet-20 on\nCIFAR-10, ResNet-50 on ImageNet, and MobileNetV2 on ImageNet. We demonstrate\nthat ARQ consistently performs better than these baselines across all the\nbenchmarks and the input perturbation levels. In many cases, the performance of\nARQ quantized networks can reach that of the original DNN with floating-point\nweights, but with only 1.5% instructions.\n","authors":["Yuchen Yang","Shubham Ugare","Yifan Zhao","Gagandeep Singh","Sasa Misailovic"],"pdf_url":"https://arxiv.org/pdf/2410.24214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24213v1","updated":"2024-10-31T17:59:30Z","published":"2024-10-31T17:59:30Z","title":"Learning Video Representations without Natural Videos","summary":" In this paper, we show that useful video representations can be learned from\nsynthetic videos and natural images, without incorporating natural videos in\nthe training. We propose a progression of video datasets synthesized by simple\ngenerative processes, that model a growing set of natural video properties\n(e.g. motion, acceleration, and shape transformations). The downstream\nperformance of video models pre-trained on these generated datasets gradually\nincreases with the dataset progression. A VideoMAE model pre-trained on our\nsynthetic videos closes 97.2% of the performance gap on UCF101 action\nclassification between training from scratch and self-supervised pre-training\nfrom natural videos, and outperforms the pre-trained model on HMDB51.\nIntroducing crops of static images to the pre-training stage results in similar\nperformance to UCF101 pre-training and outperforms the UCF101 pre-trained model\non 11 out of 14 out-of-distribution datasets of UCF101-P. Analyzing the\nlow-level properties of the datasets, we identify correlations between frame\ndiversity, frame similarity to natural data, and downstream performance. Our\napproach provides a more controllable and transparent alternative to video data\ncuration processes for pre-training.\n","authors":["Xueyang Yu","Xinlei Chen","Yossi Gandelsman"],"pdf_url":"https://arxiv.org/pdf/2410.24213v1.pdf","comment":"Project page: https://unicorn53547.github.io/video_syn_rep/"},{"id":"http://arxiv.org/abs/2406.15349v2","updated":"2024-10-31T17:58:34Z","published":"2024-06-21T17:59:02Z","title":"NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and\n Benchmarking","summary":" Benchmarking vision-based driving policies is challenging. On one hand,\nopen-loop evaluation with real data is easy, but these results do not reflect\nclosed-loop performance. On the other, closed-loop evaluation is possible in\nsimulation, but is hard to scale due to its significant computational demands.\nFurther, the simulators available today exhibit a large domain gap to real\ndata. 
This has resulted in an inability to draw clear conclusions from the\nrapidly growing body of research on end-to-end autonomous driving. In this\npaper, we present NAVSIM, a middle ground between these evaluation paradigms,\nwhere we use large datasets in combination with a non-reactive simulator to\nenable large-scale real-world benchmarking. Specifically, we gather\nsimulation-based metrics, such as progress and time to collision, by unrolling\nbird's eye view abstractions of the test scenes for a short simulation horizon.\nOur simulation is non-reactive, i.e., the evaluated policy and environment do\nnot influence each other. As we demonstrate empirically, this decoupling allows\nopen-loop metric computation while being better aligned with closed-loop\nevaluations than traditional displacement errors. NAVSIM enabled a new\ncompetition held at CVPR 2024, where 143 teams submitted 463 entries, resulting\nin several new insights. On a large set of challenging scenarios, we observe\nthat simple methods with moderate compute requirements such as TransFuser can\nmatch recent large-scale end-to-end driving architectures such as UniAD. Our\nmodular framework can potentially be extended with new datasets, data curation\nstrategies, and metrics, and will be continually maintained to host future\nchallenges. Our code is available at\nhttps://github.com/autonomousvision/navsim.\n","authors":["Daniel Dauner","Marcel Hallgarten","Tianyu Li","Xinshuo Weng","Zhiyu Huang","Zetong Yang","Hongyang Li","Igor Gilitschenski","Boris Ivanovic","Marco Pavone","Andreas Geiger","Kashyap Chitta"],"pdf_url":"https://arxiv.org/pdf/2406.15349v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2410.24207v1","updated":"2024-10-31T17:58:22Z","published":"2024-10-31T17:58:22Z","title":"No Pose, No Problem: Surprisingly Simple 3D Gaussian Splats from Sparse\n Unposed Images","summary":" We introduce NoPoSplat, a feed-forward model capable of reconstructing 3D\nscenes parameterized by 3D Gaussians from \\textit{unposed} sparse multi-view\nimages. Our model, trained exclusively with photometric loss, achieves\nreal-time 3D Gaussian reconstruction during inference. To eliminate the need\nfor accurate pose input during reconstruction, we anchor one input view's local\ncamera coordinates as the canonical space and train the network to predict\nGaussian primitives for all views within this space. This approach obviates the\nneed to transform Gaussian primitives from local coordinates into a global\ncoordinate system, thus avoiding errors associated with per-frame Gaussians and\npose estimation. To resolve scale ambiguity, we design and compare various\nintrinsic embedding methods, ultimately opting to convert camera intrinsics\ninto a token embedding and concatenate it with image tokens as input to the\nmodel, enabling accurate scene scale prediction. We utilize the reconstructed\n3D Gaussians for novel view synthesis and pose estimation tasks and propose a\ntwo-stage coarse-to-fine pipeline for accurate pose estimation. Experimental\nresults demonstrate that our pose-free approach can achieve superior novel view\nsynthesis quality compared to pose-required methods, particularly in scenarios\nwith limited input image overlap. For pose estimation, our method, trained\nwithout ground truth depth or explicit matching loss, significantly outperforms\nthe state-of-the-art methods with substantial improvements. 
This work makes\nsignificant advances in pose-free generalizable 3D reconstruction and\ndemonstrates its applicability to real-world scenarios. Code and trained models\nare available at https://noposplat.github.io/.\n","authors":["Botao Ye","Sifei Liu","Haofei Xu","Xueting Li","Marc Pollefeys","Ming-Hsuan Yang","Songyou Peng"],"pdf_url":"https://arxiv.org/pdf/2410.24207v1.pdf","comment":"Project page: https://noposplat.github.io/"},{"id":"http://arxiv.org/abs/2410.24203v1","updated":"2024-10-31T17:57:02Z","published":"2024-10-31T17:57:02Z","title":"DiffPano: Scalable and Consistent Text to Panorama Generation with\n Spherical Epipolar-Aware Diffusion","summary":" Diffusion-based methods have achieved remarkable achievements in 2D image or\n3D object generation, however, the generation of 3D scenes and even\n$360^{\\circ}$ images remains constrained, due to the limited number of scene\ndatasets, the complexity of 3D scenes themselves, and the difficulty of\ngenerating consistent multi-view images. To address these issues, we first\nestablish a large-scale panoramic video-text dataset containing millions of\nconsecutive panoramic keyframes with corresponding panoramic depths, camera\nposes, and text descriptions. Then, we propose a novel text-driven panoramic\ngeneration framework, termed DiffPano, to achieve scalable, consistent, and\ndiverse panoramic scene generation. Specifically, benefiting from the powerful\ngenerative capabilities of stable diffusion, we fine-tune a single-view\ntext-to-panorama diffusion model with LoRA on the established panoramic\nvideo-text dataset. We further design a spherical epipolar-aware multi-view\ndiffusion model to ensure the multi-view consistency of the generated panoramic\nimages. Extensive experiments demonstrate that DiffPano can generate scalable,\nconsistent, and diverse panoramic images with given unseen text descriptions\nand camera poses.\n","authors":["Weicai Ye","Chenhao Ji","Zheng Chen","Junyao Gao","Xiaoshui Huang","Song-Hai Zhang","Wanli Ouyang","Tong He","Cairong Zhao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.24203v1.pdf","comment":"NeurIPS2024, Project: https://github.com/zju3dv/DiffPano; Code:\n https://github.com/zju3dv/DiffPano"},{"id":"http://arxiv.org/abs/2410.24187v1","updated":"2024-10-31T17:49:44Z","published":"2024-10-31T17:49:44Z","title":"Chasing Better Deep Image Priors between Over- and\n Under-parameterization","summary":" Deep Neural Networks (DNNs) are well-known to act as over-parameterized deep\nimage priors (DIP) that regularize various image inverse problems. Meanwhile,\nresearchers also proposed extremely compact, under-parameterized image priors\n(e.g., deep decoder) that are strikingly competent for image restoration too,\ndespite a loss of accuracy. These two extremes push us to think whether there\nexists a better solution in the middle: between over- and under-parameterized\nimage priors, can one identify \"intermediate\" parameterized image priors that\nachieve better trade-offs between performance, efficiency, and even preserving\nstrong transferability? Drawing inspirations from the lottery ticket hypothesis\n(LTH), we conjecture and study a novel \"lottery image prior\" (LIP) by\nexploiting DNN inherent sparsity, stated as: given an over-parameterized\nDNN-based image prior, it will contain a sparse subnetwork that can be trained\nin isolation, to match the original DNN's performance when being applied as a\nprior to various image inverse problems. 
Our results validate the superiority\nof LIPs: we can successfully locate the LIP subnetworks from over-parameterized\nDIPs at substantial sparsity ranges. Those LIP subnetworks significantly\noutperform deep decoders under comparably compact model sizes (by often fully\npreserving the effectiveness of their over-parameterized counterparts), and\nthey also possess high transferability across different images as well as\nrestoration task types. Besides, we also extend LIP to compressive sensing\nimage reconstruction, where a pre-trained GAN generator is used as the prior\n(in contrast to untrained DIP or deep decoder), and confirm its validity in\nthis setting too. To our best knowledge, this is the first time that LTH is\ndemonstrated to be relevant in the context of inverse problems or image priors.\n","authors":["Qiming Wu","Xiaohan Chen","Yifan Jiang","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.24187v1.pdf","comment":"Codes are available at\n https://github.com/VITA-Group/Chasing-Better-DIPs"},{"id":"http://arxiv.org/abs/2410.24185v1","updated":"2024-10-31T17:48:45Z","published":"2024-10-31T17:48:45Z","title":"DexMimicGen: Automated Data Generation for Bimanual Dexterous\n Manipulation via Imitation Learning","summary":" Imitation learning from human demonstrations is an effective means to teach\nrobots manipulation skills. But data acquisition is a major bottleneck in\napplying this paradigm more broadly, due to the amount of cost and human effort\ninvolved. There has been significant interest in imitation learning for\nbimanual dexterous robots, like humanoids. Unfortunately, data collection is\neven more challenging here due to the challenges of simultaneously controlling\nmultiple arms and multi-fingered hands. Automated data generation in simulation\nis a compelling, scalable alternative to fuel this need for data. To this end,\nwe introduce DexMimicGen, a large-scale automated data generation system that\nsynthesizes trajectories from a handful of human demonstrations for humanoid\nrobots with dexterous hands. We present a collection of simulation environments\nin the setting of bimanual dexterous manipulation, spanning a range of\nmanipulation behaviors and different requirements for coordination among the\ntwo arms. We generate 21K demos across these tasks from just 60 source human\ndemos and study the effect of several data generation and policy learning\ndecisions on agent performance. Finally, we present a real-to-sim-to-real\npipeline and deploy it on a real-world humanoid can sorting task. Videos and\nmore are at https://dexmimicgen.github.io/\n","authors":["Zhenyu Jiang","Yuqi Xie","Kevin Lin","Zhenjia Xu","Weikang Wan","Ajay Mandlekar","Linxi Fan","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.24185v1.pdf","comment":"Project website: https://dexmimicgen.github.io/"},{"id":"http://arxiv.org/abs/2409.06711v2","updated":"2024-10-31T17:48:06Z","published":"2024-08-25T13:14:59Z","title":"Quantized neural network for complex hologram generation","summary":" Computer-generated holography (CGH) is a promising technology for augmented\nreality displays, such as head-mounted or head-up displays. However, its high\ncomputational demand makes it impractical for implementation. Recent efforts to\nintegrate neural networks into CGH have successfully accelerated computing\nspeed, demonstrating the potential to overcome the trade-off between\ncomputational cost and image quality. 
Nevertheless, deploying neural\nnetwork-based CGH algorithms on computationally limited embedded systems\nrequires more efficient models with lower computational cost, memory footprint,\nand power consumption. In this study, we developed a lightweight model for\ncomplex hologram generation by introducing neural network quantization.\nSpecifically, we built a model based on tensor holography and quantized it from\n32-bit floating-point precision (FP32) to 8-bit integer precision (INT8). Our\nperformance evaluation shows that the proposed INT8 model achieves hologram\nquality comparable to that of the FP32 model while reducing the model size by\napproximately 70% and increasing the speed fourfold. Additionally, we\nimplemented the INT8 model on a system-on-module to demonstrate its\ndeployability on embedded platforms and high power efficiency.\n","authors":["Yutaka Endo","Minoru Oikawa","Timothy D. Wilkinson","Tomoyoshi Shimobaba","Tomoyoshi Ito"],"pdf_url":"https://arxiv.org/pdf/2409.06711v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.04312v2","updated":"2024-10-31T17:47:54Z","published":"2024-06-06T17:56:40Z","title":"ReNO: Enhancing One-step Text-to-Image Models through Reward-based Noise\n Optimization","summary":" Text-to-Image (T2I) models have made significant advancements in recent\nyears, but they still struggle to accurately capture intricate details\nspecified in complex compositional prompts. While fine-tuning T2I models with\nreward objectives has shown promise, it suffers from \"reward hacking\" and may\nnot generalize well to unseen prompt distributions. In this work, we propose\nReward-based Noise Optimization (ReNO), a novel approach that enhances T2I\nmodels at inference by optimizing the initial noise based on the signal from\none or multiple human preference reward models. Remarkably, solving this\noptimization problem with gradient ascent for 50 iterations yields impressive\nresults on four different one-step models across two competitive benchmarks,\nT2I-CompBench and GenEval. Within a computational budget of 20-50 seconds,\nReNO-enhanced one-step models consistently surpass the performance of all\ncurrent open-source Text-to-Image models. Extensive user studies demonstrate\nthat our model is preferred nearly twice as often compared to the popular SDXL\nmodel and is on par with the proprietary Stable Diffusion 3 with 8B parameters.\nMoreover, given the same computational resources, a ReNO-optimized one-step\nmodel outperforms widely-used open-source models such as SDXL and\nPixArt-$\\alpha$, highlighting the efficiency and effectiveness of ReNO in\nenhancing T2I model performance at inference time. Code is available at\nhttps://github.com/ExplainableML/ReNO.\n","authors":["Luca Eyring","Shyamgopal Karthik","Karsten Roth","Alexey Dosovitskiy","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2406.04312v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2306.01953v3","updated":"2024-10-31T17:47:21Z","published":"2023-06-02T23:29:28Z","title":"Invisible Image Watermarks Are Provably Removable Using Generative AI","summary":" Invisible watermarks safeguard images' copyrights by embedding hidden\nmessages only detectable by owners. They also prevent people from misusing\nimages, especially those generated by AI models. We propose a family of\nregeneration attacks to remove these invisible watermarks. The proposed attack\nmethod first adds random noise to an image to destroy the watermark and then\nreconstructs the image. 
This approach is flexible and can be instantiated with\nmany existing image-denoising algorithms and pre-trained generative models such\nas diffusion models. Through formal proofs and extensive empirical evaluations,\nwe demonstrate that pixel-level invisible watermarks are vulnerable to this\nregeneration attack. Our results reveal that, across four different pixel-level\nwatermarking schemes, the proposed method consistently achieves superior\nperformance compared to existing attack techniques, with lower detection rates\nand higher image quality. However, watermarks that keep the image semantically\nsimilar can be an alternative defense against our attacks. Our finding\nunderscores the need for a shift in research/industry emphasis from invisible\nwatermarks to semantic-preserving watermarks. Code is available at\nhttps://github.com/XuandongZhao/WatermarkAttacker\n","authors":["Xuandong Zhao","Kexun Zhang","Zihao Su","Saastha Vasan","Ilya Grishchenko","Christopher Kruegel","Giovanni Vigna","Yu-Xiang Wang","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2306.01953v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24183v1","updated":"2024-10-31T17:46:54Z","published":"2024-10-31T17:46:54Z","title":"Extended Object Tracking and Classification based on Linear Splines","summary":" This paper introduces a framework based on linear splines for 2-dimensional\nextended object tracking and classification. Unlike state of the art models,\nlinear splines allow to represent extended objects whose contour is an\narbitrarily complex curve. An exact likelihood is derived for the case in which\nnoisy measurements can be scattered from any point on the contour of the\nextended object, while an approximate Monte Carlo likelihood is provided for\nthe case wherein scattering points can be anywhere, i.e. inside or on the\ncontour, on the object surface. Exploiting such likelihood to measure how well\nthe observed data fit a given shape, a suitable estimator is developed. The\nproposed estimator models the extended object in terms of a kinematic state,\nproviding object position and orientation, along with a shape vector,\ncharacterizing object contour and surface. The kinematic state is estimated via\na nonlinear Kalman filter, while the shape vector is estimated via a Bayesian\nclassifier so that classification is implicitly solved during shape estimation.\nNumerical experiments are provided to assess, compared to state of the art\nextended object estimators, the effectiveness of the proposed one.\n","authors":["Matteo Tesori","Giorgio Battistelli","Luigi Chisci"],"pdf_url":"https://arxiv.org/pdf/2410.24183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24181v1","updated":"2024-10-31T17:45:09Z","published":"2024-10-31T17:45:09Z","title":"Federated Black-Box Adaptation for Semantic Segmentation","summary":" Federated Learning (FL) is a form of distributed learning that allows\nmultiple institutions or clients to collaboratively learn a global model to\nsolve a task. This allows the model to utilize the information from every\ninstitute while preserving data privacy. However, recent studies show that the\npromise of protecting the privacy of data is not upheld by existing methods and\nthat it is possible to recreate the training data from the different\ninstitutions. This is done by utilizing gradients transferred between the\nclients and the global server during training or by knowing the model\narchitecture at the client end. 
In this paper, we propose a federated learning\nframework for semantic segmentation without knowing the model architecture nor\ntransferring gradients between the client and the server, thus enabling better\nprivacy preservation. We propose BlackFed - a black-box adaptation of neural\nnetworks that utilizes zero order optimization (ZOO) to update the client model\nweights and first order optimization (FOO) to update the server weights. We\nevaluate our approach on several computer vision and medical imaging datasets\nto demonstrate its effectiveness. To the best of our knowledge, this work is\none of the first works in employing federated learning for segmentation, devoid\nof gradients or model information exchange. Code:\nhttps://github.com/JayParanjape/blackfed/tree/master\n","authors":["Jay N. Paranjape","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2410.24181v1.pdf","comment":"Accepted at NEURIPS 2024"},{"id":"http://arxiv.org/abs/2404.13046v2","updated":"2024-10-31T17:39:34Z","published":"2024-04-19T17:59:48Z","title":"MoVA: Adapting Mixture of Vision Experts to Multimodal Context","summary":" As the key component in multimodal large language models (MLLMs), the ability\nof the visual encoder greatly affects MLLM's understanding on diverse image\ncontent. Although some large-scale pretrained vision encoders such as vision\nencoders in CLIP and DINOv2 have brought promising performance, we found that\nthere is still no single vision encoder that can dominate various image content\nunderstanding, e.g., the CLIP vision encoder leads to outstanding results on\ngeneral image understanding but poor performance on document or chart content.\nTo alleviate the bias of CLIP vision encoder, we first delve into the inherent\nbehavior of different pre-trained vision encoders and then propose the MoVA, a\npowerful and novel MLLM, adaptively routing and fusing task-specific vision\nexperts with a coarse-to-fine mechanism. In the coarse-grained stage, we design\na context-aware expert routing strategy to dynamically select the most suitable\nvision experts according to the user instruction, input image, and expertise of\nvision experts. This benefits from the powerful model function understanding\nability of the large language model (LLM). In the fine-grained stage, we\nelaborately conduct the mixture-of-vision-expert adapter (MoV-Adapter) to\nextract and fuse task-specific knowledge from various experts. This\ncoarse-to-fine paradigm effectively leverages representations from experts\nbased on multimodal context and model expertise, further enhancing the\ngeneralization ability. We conduct extensive experiments to evaluate the\neffectiveness of the proposed approach. Without any bells and whistles, MoVA\ncan achieve significant performance gains over current state-of-the-art methods\nin a wide range of challenging multimodal benchmarks.\n","authors":["Zhuofan Zong","Bingqi Ma","Dazhong Shen","Guanglu Song","Hao Shao","Dongzhi Jiang","Hongsheng Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13046v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2403.04690v3","updated":"2024-10-31T17:32:26Z","published":"2024-03-07T17:35:58Z","title":"Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self\n Attention at the Threadblock Level","summary":" Neighborhood attention reduces the cost of self attention by restricting each\ntoken's attention span to its nearest neighbors. 
This restriction,\nparameterized by a window size and dilation factor, draws a spectrum of\npossible attention patterns between linear projection and self attention.\nNeighborhood attention, and more generally sliding window attention patterns,\nhave long been bounded by infrastructure, particularly in higher-rank spaces\n(2-D and 3-D), calling for the development of custom kernels, which have been\nlimited in either functionality, or performance, if not both. In this work, we\naim to massively improve upon existing infrastructure by providing two new\nmethods for implementing neighborhood attention. We first show that\nneighborhood attention can be represented as a batched GEMM problem, similar to\nstandard attention, and implement it for 1-D and 2-D neighborhood attention.\nThese kernels on average provide 895% and 272% improvement in full precision\nruntime compared to existing naive CUDA kernels for 1-D and 2-D neighborhood\nattention respectively. We find that aside from being heavily bound by memory\nbandwidth, certain inherent inefficiencies exist in all unfused implementations\nof neighborhood attention, which in most cases undo their theoretical\nefficiency gain. Motivated by the progress made into fused dot-product\nattention kernels, we developed fused neighborhood attention; an adaptation of\nfused dot-product attention kernels that allow fine-grained control over\nattention across different spatial axes. Known for reducing the quadratic time\ncomplexity of self attention to a linear complexity, neighborhood attention can\nnow enjoy a reduced and constant memory footprint, and record-breaking half\nprecision runtime. We observe that our fused implementation successfully\ncircumvents some of the unavoidable inefficiencies in unfused\nimplementations...\n","authors":["Ali Hassani","Wen-Mei Hwu","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2403.04690v3.pdf","comment":"To appear in 38th Conference on Neural Information Processing Systems\n (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.24160v1","updated":"2024-10-31T17:19:03Z","published":"2024-10-31T17:19:03Z","title":"Redefining <Creative> in Dictionary: Towards an Enhanced Semantic\n Understanding of Creative Generation","summary":" Creativity, both in human and diffusion models, remains an inherently\nabstract concept; thus, simply adding \"creative\" to a prompt does not yield\nreliable semantic recognition by the model. In this work, we concretize the\nabstract notion of \"creative\" through the TP2O task, which aims to merge two\nunrelated concepts, and introduce CreTok, redefining \"creative\" as the token\n$\\texttt{<CreTok>}$. This redefinition offers a more concrete and universally\nadaptable representation for concept blending. This redefinition occurs\ncontinuously, involving the repeated random sampling of text pairs with\ndifferent concepts and optimizing cosine similarity between target and constant\nprompts. This approach enables $\\texttt{<CreTok>}$ to learn a method for\ncreative concept fusion. Extensive experiments demonstrate that the creative\ncapability enabled by $\\texttt{<CreTok>}$ substantially surpasses recent SOTA\ndiffusion models and achieves superior creative generation. 
CreTok exhibits\ngreater flexibility and reduced time overhead, as $\\texttt{<CreTok>}$ can\nfunction as a universal token for any concept, facilitating creative generation\nwithout retraining.\n","authors":["Fu Feng","Yucheng Xie","Jing Wang","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2410.24160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00885v2","updated":"2024-10-31T17:12:57Z","published":"2024-06-02T22:40:05Z","title":"Visual place recognition for aerial imagery: A survey","summary":" Aerial imagery and its direct application to visual localization is an\nessential problem for many Robotics and Computer Vision tasks. While Global\nNavigation Satellite Systems (GNSS) are the standard default solution for\nsolving the aerial localization problem, it is subject to a number of\nlimitations, such as, signal instability or solution unreliability that make\nthis option not so desirable. Consequently, visual geolocalization is emerging\nas a viable alternative. However, adapting Visual Place Recognition (VPR) task\nto aerial imagery presents significant challenges, including weather variations\nand repetitive patterns. Current VPR reviews largely neglect the specific\ncontext of aerial data. This paper introduces a methodology tailored for\nevaluating VPR techniques specifically in the domain of aerial imagery,\nproviding a comprehensive assessment of various methods and their performance.\nHowever, we not only compare various VPR methods, but also demonstrate the\nimportance of selecting appropriate zoom and overlap levels when constructing\nmap tiles to achieve maximum efficiency of VPR algorithms in the case of aerial\nimagery. The code is available on our GitHub repository --\nhttps://github.com/prime-slam/aero-vloc.\n","authors":["Ivan Moskalenko","Anastasiia Kornilova","Gonzalo Ferrer"],"pdf_url":"https://arxiv.org/pdf/2406.00885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24151v1","updated":"2024-10-31T17:09:55Z","published":"2024-10-31T17:09:55Z","title":"Scaling Concept With Text-Guided Diffusion Models","summary":" Text-guided diffusion models have revolutionized generative tasks by\nproducing high-fidelity content from text descriptions. They have also enabled\nan editing paradigm where concepts can be replaced through text conditioning\n(e.g., a dog to a tiger). In this work, we explore a novel approach: instead of\nreplacing a concept, can we enhance or suppress the concept itself? Through an\nempirical study, we identify a trend where concepts can be decomposed in\ntext-guided diffusion models. Leveraging this insight, we introduce\nScalingConcept, a simple yet effective method to scale decomposed concepts up\nor down in real input without introducing new elements. To systematically\nevaluate our approach, we present the WeakConcept-10 dataset, where concepts\nare imperfect and need to be enhanced. 
More importantly, ScalingConcept enables\na variety of novel zero-shot applications across image and audio domains,\nincluding tasks such as canonical pose generation and generative sound\nhighlighting or removal.\n","authors":["Chao Huang","Susan Liang","Yunlong Tang","Yapeng Tian","Anurag Kumar","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24151v1.pdf","comment":"Project page: https://wikichao.github.io/ScalingConcept/"},{"id":"http://arxiv.org/abs/2410.24148v1","updated":"2024-10-31T17:09:19Z","published":"2024-10-31T17:09:19Z","title":"Exploring Vision Language Models for Facial Attribute Recognition:\n Emotion, Race, Gender, and Age","summary":" Technologies for recognizing facial attributes like race, gender, age, and\nemotion have several applications, such as surveillance, advertising content,\nsentiment analysis, and the study of demographic trends and social behaviors.\nAnalyzing demographic characteristics based on images and analyzing facial\nexpressions have several challenges due to the complexity of humans' facial\nattributes. Traditional approaches have employed CNNs and various other deep\nlearning techniques, trained on extensive collections of labeled images. While\nthese methods demonstrated effective performance, there remains potential for\nfurther enhancements. In this paper, we propose to utilize vision language\nmodels (VLMs) such as generative pre-trained transformer (GPT), GEMINI, large\nlanguage and vision assistant (LLAVA), PaliGemma, and Microsoft Florence2 to\nrecognize facial attributes such as race, gender, age, and emotion from images\nwith human faces. Various datasets like FairFace, AffectNet, and UTKFace have\nbeen utilized to evaluate the solutions. The results show that VLMs are\ncompetitive if not superior to traditional techniques. Additionally, we propose\n\"FaceScanPaliGemma\"--a fine-tuned PaliGemma model--for race, gender, age, and\nemotion recognition. The results show an accuracy of 81.1%, 95.8%, 80%, and\n59.4% for race, gender, age group, and emotion classification, respectively,\noutperforming pre-trained version of PaliGemma, other VLMs, and SotA methods.\nFinally, we propose \"FaceScanGPT\", which is a GPT-4o model to recognize the\nabove attributes when several individuals are present in the image using a\nprompt engineered for a person with specific facial and/or physical attributes.\nThe results underscore the superior multitasking capability of FaceScanGPT to\ndetect the individual's attributes like hair cut, clothing color, postures,\netc., using only a prompt to drive the detection and recognition tasks.\n","authors":["Nouar AlDahoul","Myles Joshua Toledo Tan","Harishwar Reddy Kasireddy","Yasir Zaki"],"pdf_url":"https://arxiv.org/pdf/2410.24148v1.pdf","comment":"52 pages, 13 figures"},{"id":"http://arxiv.org/abs/2410.24144v1","updated":"2024-10-31T17:05:44Z","published":"2024-10-31T17:05:44Z","title":"HoloChrome: Polychromatic Illumination for Speckle Reduction in\n Holographic Near-Eye Displays","summary":" Holographic displays hold the promise of providing authentic depth cues,\nresulting in enhanced immersive visual experiences for near-eye applications.\nHowever, current holographic displays are hindered by speckle noise, which\nlimits accurate reproduction of color and texture in displayed images. We\npresent HoloChrome, a polychromatic holographic display framework designed to\nmitigate these limitations. 
HoloChrome utilizes an ultrafast,\nwavelength-adjustable laser and a dual-Spatial Light Modulator (SLM)\narchitecture, enabling the multiplexing of a large set of discrete wavelengths\nacross the visible spectrum. By leveraging spatial separation in our dual-SLM\nsetup, we independently manipulate speckle patterns across multiple\nwavelengths. This novel approach effectively reduces speckle noise through\nincoherent averaging achieved by wavelength multiplexing. Our method is\ncomplementary to existing speckle reduction techniques, offering a new pathway\nto address this challenge. Furthermore, the use of polychromatic illumination\nbroadens the achievable color gamut compared to traditional three-color primary\nholographic displays.\n Our simulations and tabletop experiments validate that HoloChrome\nsignificantly reduces speckle noise and expands the color gamut. These\nadvancements enhance the performance of holographic near-eye displays, moving\nus closer to practical, immersive next-generation visual experiences.\n","authors":["Florian Schiffers","Grace Kuo","Nathan Matsuda","Douglas Lanman","Oliver Cossairt"],"pdf_url":"https://arxiv.org/pdf/2410.24144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24139v1","updated":"2024-10-31T17:03:38Z","published":"2024-10-31T17:03:38Z","title":"COSNet: A Novel Semantic Segmentation Network using Enhanced Boundaries\n in Cluttered Scenes","summary":" Automated waste recycling aims to efficiently separate the recyclable objects\nfrom the waste by employing vision-based systems. However, the presence of\nvarying shaped objects having different material types makes it a challenging\nproblem, especially in cluttered environments. Existing segmentation methods\nperform reasonably on many semantic segmentation datasets by employing\nmulti-contextual representations, however, their performance is degraded when\nutilized for waste object segmentation in cluttered scenarios. In addition,\nplastic objects further increase the complexity of the problem due to their\ntranslucent nature. To address these limitations, we introduce an efficacious\nsegmentation network, named COSNet, that uses boundary cues along with\nmulti-contextual information to accurately segment the objects in cluttered\nscenes. COSNet introduces novel components including feature sharpening block\n(FSB) and boundary enhancement module (BEM) for enhancing the features and\nhighlighting the boundary information of irregular waste objects in cluttered\nenvironment. Extensive experiments on three challenging datasets including\nZeroWaste-f, SpectralWaste, and ADE20K demonstrate the effectiveness of the\nproposed method. Our COSNet achieves a significant gain of 1.8% on ZeroWaste-f\nand 2.1% on SpectralWaste datasets respectively in terms of mIoU metric.\n","authors":["Muhammad Ali","Mamoona Javaid","Mubashir Noman","Mustansar Fiaz","Salman Khan"],"pdf_url":"https://arxiv.org/pdf/2410.24139v1.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2311.00371v2","updated":"2024-10-31T17:01:50Z","published":"2023-11-01T08:53:05Z","title":"Learning Cooperative Trajectory Representations for Motion Forecasting","summary":" Motion forecasting is an essential task for autonomous driving, and utilizing\ninformation from infrastructure and other vehicles can enhance forecasting\ncapabilities. 
Existing research mainly focuses on leveraging single-frame\ncooperative information to enhance the limited perception capability of the ego\nvehicle, while underutilizing the motion and interaction context of traffic\nparticipants observed from cooperative devices. In this paper, we propose a\nforecasting-oriented representation paradigm to utilize motion and interaction\nfeatures from cooperative information. Specifically, we present V2X-Graph, a\nrepresentative framework to achieve interpretable and end-to-end trajectory\nfeature fusion for cooperative motion forecasting. V2X-Graph is evaluated on\nV2X-Seq in vehicle-to-infrastructure (V2I) scenarios. To further evaluate on\nvehicle-to-everything (V2X) scenario, we construct the first real-world V2X\nmotion forecasting dataset V2X-Traj, which contains multiple autonomous\nvehicles and infrastructure in every scenario. Experimental results on both\nV2X-Seq and V2X-Traj show the advantage of our method. We hope both V2X-Graph\nand V2X-Traj will benefit the further development of cooperative motion\nforecasting. Find the project at https://github.com/AIR-THU/V2X-Graph.\n","authors":["Hongzhi Ruan","Haibao Yu","Wenxian Yang","Siqi Fan","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2311.00371v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2401.01650v3","updated":"2024-10-31T16:53:49Z","published":"2024-01-03T10:07:11Z","title":"De-Confusing Pseudo-Labels in Source-Free Domain Adaptation","summary":" Source-free domain adaptation aims to adapt a source-trained model to an\nunlabeled target domain without access to the source data. It has attracted\ngrowing attention in recent years, where existing approaches focus on\nself-training that usually includes pseudo-labeling techniques. In this paper,\nwe introduce a novel noise-learning approach tailored to address noise\ndistribution in domain adaptation settings and learn to de-confuse the\npseudo-labels. More specifically, we learn a noise transition matrix of the\npseudo-labels to capture the label corruption of each class and learn the\nunderlying true label distribution. Estimating the noise transition matrix\nenables a better true class-posterior estimation, resulting in better\nprediction accuracy. We demonstrate the effectiveness of our approach when\ncombined with several source-free domain adaptation methods: SHOT, SHOT++, and\nAaD. We obtain state-of-the-art results on three domain adaptation datasets:\nVisDA, DomainNet, and OfficeHome.\n","authors":["Idit Diamant","Amir Rosenfeld","Idan Achituve","Jacob Goldberger","Arnon Netzer"],"pdf_url":"https://arxiv.org/pdf/2401.01650v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12306v2","updated":"2024-10-31T16:49:59Z","published":"2024-09-18T20:33:54Z","title":"Measuring Sound Symbolism in Audio-visual Models","summary":" Audio-visual pre-trained models have gained substantial attention recently\nand demonstrated superior performance on various audio-visual tasks. This study\ninvestigates whether pre-trained audio-visual models demonstrate non-arbitrary\nassociations between sounds and visual representations$\\unicode{x2013}$known as\nsound symbolism$\\unicode{x2013}$which is also observed in humans. We developed\na specialized dataset with synthesized images and audio samples and assessed\nthese models using a non-parametric approach in a zero-shot setting. 
Our\nfindings reveal a significant correlation between the models' outputs and\nestablished patterns of sound symbolism, particularly in models trained on\nspeech data. These results suggest that such models can capture sound-meaning\nconnections akin to human language processing, providing insights into both\ncognitive architectures and machine learning strategies.\n","authors":["Wei-Cheng Tseng","Yi-Jen Shih","David Harwath","Raymond Mooney"],"pdf_url":"https://arxiv.org/pdf/2409.12306v2.pdf","comment":"Errors in the introduction part that might potentially affect the\n integrity of the paper. Withdraw at the point. Will replace with an updated\n version in the future"},{"id":"http://arxiv.org/abs/2407.01903v2","updated":"2024-10-31T16:49:26Z","published":"2024-07-02T03:08:20Z","title":"Text-Aware Diffusion for Policy Learning","summary":" Training an agent to achieve particular goals or perform desired behaviors is\noften accomplished through reinforcement learning, especially in the absence of\nexpert demonstrations. However, supporting novel goals or behaviors through\nreinforcement learning requires the ad-hoc design of appropriate reward\nfunctions, which quickly becomes intractable. To address this challenge, we\npropose Text-Aware Diffusion for Policy Learning (TADPoLe), which uses a\npretrained, frozen text-conditioned diffusion model to compute dense zero-shot\nreward signals for text-aligned policy learning. We hypothesize that\nlarge-scale pretrained generative models encode rich priors that can supervise\na policy to behave not only in a text-aligned manner, but also in alignment\nwith a notion of naturalness summarized from internet-scale training data. In\nour experiments, we demonstrate that TADPoLe is able to learn policies for\nnovel goal-achievement and continuous locomotion behaviors specified by natural\nlanguage, in both Humanoid and Dog environments. The behaviors are learned\nzero-shot without ground-truth rewards or expert demonstrations, and are\nqualitatively more natural according to human evaluation. We further show that\nTADPoLe performs competitively when applied to robotic manipulation tasks in\nthe Meta-World environment, without having access to any in-domain\ndemonstrations.\n","authors":["Calvin Luo","Mandy He","Zilai Zeng","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24116v1","updated":"2024-10-31T16:46:23Z","published":"2024-10-31T16:46:23Z","title":"AIDOVECL: AI-generated Dataset of Outpainted Vehicles for Eye-level\n Classification and Localization","summary":" Image labeling is a critical bottleneck in the development of computer vision\ntechnologies, often constraining the potential of machine learning models due\nto the time-intensive nature of manual annotations. This work introduces a\nnovel approach that leverages outpainting to address the problem of annotated\ndata scarcity by generating artificial contexts and annotations, significantly\nreducing manual labeling efforts. We apply this technique to a particularly\nacute challenge in autonomous driving, urban planning, and environmental\nmonitoring: the lack of diverse, eye-level vehicle images in desired classes.\nOur dataset comprises AI-generated vehicle images obtained by detecting and\ncropping vehicles from manually selected seed images, which are then outpainted\nonto larger canvases to simulate varied real-world conditions. 
The outpainted\nimages include detailed annotations, providing high-quality ground truth data.\nAdvanced outpainting techniques and image quality assessments ensure visual\nfidelity and contextual relevance. Augmentation with outpainted vehicles\nimproves overall performance metrics by up to 8\\% and enhances prediction of\nunderrepresented classes by up to 20\\%. This approach, exemplifying outpainting\nas a self-annotating paradigm, presents a solution that enhances dataset\nversatility across multiple domains of machine learning. The code and links to\ndatasets used in this study are available for further research and replication\nat https://github.com/amir-kazemi/aidovecl.\n","authors":["Amir Kazemi","Qurat ul ain Fatima","Volodymyr Kindratenko","Christopher Tessum"],"pdf_url":"https://arxiv.org/pdf/2410.24116v1.pdf","comment":"19 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2410.24114v1","updated":"2024-10-31T16:44:10Z","published":"2024-10-31T16:44:10Z","title":"Nearest Neighbor Normalization Improves Multimodal Retrieval","summary":" Multimodal models leverage large-scale pre-training to achieve strong but\nstill imperfect performance on tasks such as image captioning, visual question\nanswering, and cross-modal retrieval. In this paper, we present a simple and\nefficient method for correcting errors in trained contrastive image-text\nretrieval models with no additional training, called Nearest Neighbor\nNormalization (NNN). We show an improvement on retrieval metrics in both text\nretrieval and image retrieval for all of the contrastive models that we tested\n(CLIP, BLIP, ALBEF, SigLIP, BEiT) and for both of the datasets that we used\n(MS-COCO and Flickr30k). NNN requires a reference database, but does not\nrequire any training on this database, and can even increase the retrieval\naccuracy of a model after finetuning.\n","authors":["Neil Chowdhury","Franklin Wang","Sumedh Shenoy","Douwe Kiela","Sarah Schwettmann","Tristan Thrush"],"pdf_url":"https://arxiv.org/pdf/2410.24114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14919v2","updated":"2024-10-31T16:36:14Z","published":"2024-10-19T00:33:51Z","title":"Adversarial Score identity Distillation: Rapidly Surpassing the Teacher\n in One Step","summary":" Score identity Distillation (SiD) is a data-free method that has achieved\nstate-of-the-art performance in image generation by leveraging only a\npretrained diffusion model, without requiring any training data. However, the\nultimate performance of SiD is constrained by the accuracy with which the\npretrained model captures the true data scores at different stages of the\ndiffusion process. In this paper, we introduce SiDA (SiD with Adversarial\nLoss), which not only enhances generation quality but also improves\ndistillation efficiency by incorporating real images and adversarial loss. SiDA\nutilizes the encoder from the generator's score network as a discriminator,\nboosting its ability to distinguish between real images and those generated by\nSiD. The adversarial loss is batch-normalized within each GPU and then combined\nwith the original SiD loss. This integration effectively incorporates the\naverage \"fakeness\" per GPU batch into the pixel-based SiD loss, enabling SiDA\nto distill a single-step generator either from scratch or by fine-tuning an\nexisting one. 
SiDA converges significantly faster than its predecessor when\ntrained from scratch, and swiftly improves upon the original model's\nperformance after an initial warmup period during fine-tuning from a\npre-distilled SiD generator. This one-step adversarial distillation method\nestablishes new benchmarks in generation performance when distilling EDM\ndiffusion models pretrained on CIFAR-10 (32x32) and ImageNet (64x64), achieving an\nFID score of 1.110 on ImageNet 64x64. It sets record-low FID scores when\ndistilling EDM2 models trained on ImageNet (512x512), surpassing even the\nlargest teacher model, EDM2-XXL. SiDA records FID scores of 2.156\nfor EDM2-XS, 1.669 for EDM2-S, 1.488 for EDM2-M, and 1.465 for EDM2-L,\ndemonstrating significant improvements across all model sizes. Our open-source\ncode will be integrated into the SiD codebase.\n","authors":["Mingyuan Zhou","Huangjie Zheng","Yi Gu","Zhendong Wang","Hai Huang"],"pdf_url":"https://arxiv.org/pdf/2410.14919v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24098v1","updated":"2024-10-31T16:28:49Z","published":"2024-10-31T16:28:49Z","title":"Parameter choices in HaarPSI for IQA with medical images","summary":" When developing machine learning models, image quality assessment (IQA)\nmeasures are a crucial component for evaluation. However, commonly used IQA\nmeasures have been primarily developed and optimized for natural images. In\nmany specialized settings, such as medical images, this poses an\noften-overlooked problem regarding suitability. In previous studies, the IQA\nmeasure HaarPSI showed promising behavior for natural and medical images.\nHaarPSI is based on Haar wavelet representations and the framework allows\noptimization of two parameters. So far, these parameters have been aligned for\nnatural images. Here, we optimize these parameters for two annotated medical\ndata sets, a photoacoustic and a chest X-Ray data set. We observe that the\nmedical data sets are more sensitive to the parameter choices than the employed\nnatural images; on the other hand, both medical data sets lead to similar\nparameter values when optimized. We denote the optimized setting, which improves the performance for\nthe medical images notably, by HaarPSI$_{MED}$. The results suggest that\nadapting common IQA measures within their frameworks for medical images can\nprovide a valuable, generalizable addition to the employment of more specific\ntask-based measures.\n","authors":["Clemens Karner","Janek Gröhl","Ian Selby","Judith Babar","Jake Beckford","Thomas R Else","Timothy J Sadler","Shahab Shahipasand","Arthikkaa Thavakumar","Michael Roberts","James H. F. Rudd","Carola-Bibiane Schönlieb","Jonathan R Weir-McCall","Anna Breger"],"pdf_url":"https://arxiv.org/pdf/2410.24098v1.pdf","comment":"5 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.08570v2","updated":"2024-10-31T16:20:26Z","published":"2024-08-16T07:12:47Z","title":"EraW-Net: Enhance-Refine-Align W-Net for Scene-Associated Driver\n Attention Estimation","summary":" Associating driver attention with the driving scene across two fields of view\n(FOVs) is a hard cross-domain perception problem, which requires comprehensive\nconsideration of cross-view mapping, dynamic driving scene analysis, and driver\nstatus tracking. Previous methods typically focus on a single view or map\nattention to the scene via estimated gaze, failing to exploit the implicit\nconnection between them. 
Moreover, simple fusion modules are insufficient for\nmodeling the complex relationships between the two views, making information\nintegration challenging. To address these issues, we propose a novel method for\nend-to-end scene-associated driver attention estimation, called EraW-Net. This\nmethod enhances the most discriminative dynamic cues, refines feature\nrepresentations, and facilitates semantically aligned cross-domain integration\nthrough a W-shaped architecture, termed W-Net. Specifically, a Dynamic Adaptive\nFilter Module (DAF-Module) is proposed to address the challenges of frequently\nchanging driving environments by extracting vital regions. It suppresses the\nindiscriminately recorded dynamics and highlights crucial ones by innovative\njoint frequency-spatial analysis, enhancing the model's ability to parse\ncomplex dynamics. Additionally, to track driver states during non-fixed facial\nposes, we propose a Global Context Sharing Module (GCS-Module) to construct\nrefined feature representations by capturing hierarchical features that adapt\nto various scales of head and eye movements. Finally, W-Net achieves systematic\ncross-view information integration through its \"Encoding-Independent Partial\nDecoding-Fusion Decoding\" structure, addressing semantic misalignment in\nheterogeneous data integration. Experiments demonstrate that the proposed\nmethod robustly and accurately estimates the mapping of driver attention in the\nscene on large public datasets.\n","authors":["Jun Zhou","Chunsheng Liu","Faliang Chang","Wenqian Wang","Penghui Hao","Yiming Huang","Zhiqiang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08570v2.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2410.24075v1","updated":"2024-10-31T16:13:55Z","published":"2024-10-31T16:13:55Z","title":"Identifying Spatio-Temporal Drivers of Extreme Events","summary":" The spatio-temporal relations of impacts of extreme events and their drivers\nin climate data are not fully understood and there is a need for machine\nlearning approaches to identify such spatio-temporal relations from data. The\ntask, however, is very challenging since there are time delays between extremes\nand their drivers, and the spatial response of such drivers is inhomogeneous.\nIn this work, we propose a first approach and benchmarks to tackle this\nchallenge. Our approach is trained end-to-end to predict spatio-temporal\nextremes and spatio-temporal drivers in the physical input variables jointly.\nBy forcing the network to predict extremes from spatio-temporal binary masks\nof identified drivers, the network successfully identifies drivers that are\ncorrelated with extremes. We evaluate our approach on three newly created\nsynthetic benchmarks, two of which are based on remote sensing or\nreanalysis climate data, and on two real-world reanalysis datasets. The source\ncode and datasets are publicly available at the project page\nhttps://hakamshams.github.io/IDE.\n","authors":["Mohamad Hakam Shams Eddin","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2410.24075v1.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.22551v2","updated":"2024-10-31T16:04:48Z","published":"2024-10-29T21:37:03Z","title":"FairSkin: Fair Diffusion for Skin Disease Image Generation","summary":" Image generation is a prevailing technique for clinical data augmentation for\nadvancing diagnostic accuracy and reducing healthcare disparities. 
Diffusion\nModel (DM) has become a leading method in generating synthetic medical images,\nbut it suffers from a critical twofold bias: (1) The quality of images\ngenerated for Caucasian individuals is significantly higher, as measured by the\nFrechet Inception Distance (FID). (2) The ability of the downstream-task\nlearner to learn critical features from disease images varies across different\nskin tones. These biases pose significant risks, particularly in skin disease\ndetection, where underrepresentation of certain skin tones can lead to\nmisdiagnosis or neglect of specific conditions. To address these challenges, we\npropose FairSkin, a novel DM framework that mitigates these biases through a\nthree-level resampling mechanism, ensuring fairer representation across racial\nand disease categories. Our approach significantly improves the diversity and\nquality of generated images, contributing to more equitable skin disease\ndetection in clinical settings.\n","authors":["Ruichen Zhang","Yuguang Yao","Zhen Tan","Zhiming Li","Pan Wang","Huan Liu","Jingtong Hu","Sijia Liu","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2410.22551v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24060v1","updated":"2024-10-31T15:57:04Z","published":"2024-10-31T15:57:04Z","title":"Understanding Generalizability of Diffusion Models Requires Rethinking\n the Hidden Gaussian Structure","summary":" In this work, we study the generalizability of diffusion models by looking\ninto the hidden properties of the learned score functions, which are\nessentially a series of deep denoisers trained on various noise levels. We\nobserve that as diffusion models transition from memorization to\ngeneralization, their corresponding nonlinear diffusion denoisers exhibit\nincreasing linearity. This discovery leads us to investigate the linear\ncounterparts of the nonlinear diffusion models, which are a series of linear\nmodels trained to match the function mappings of the nonlinear diffusion\ndenoisers. Surprisingly, these linear denoisers are approximately the optimal\ndenoisers for a multivariate Gaussian distribution characterized by the\nempirical mean and covariance of the training dataset. This finding implies\nthat diffusion models have the inductive bias towards capturing and utilizing\nthe Gaussian structure (covariance information) of the training dataset for\ndata generation. We empirically demonstrate that this inductive bias is a\nunique property of diffusion models in the generalization regime, which becomes\nincreasingly evident when the model's capacity is relatively small compared to\nthe training dataset size. In the case that the model is highly\noverparameterized, this inductive bias emerges during the initial training\nphases before the model fully memorizes its training data. Our study provides\ncrucial insights into understanding the notable strong generalization\nphenomenon recently observed in real-world diffusion models.\n","authors":["Xiang Li","Yixiang Dai","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2410.24060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08557v2","updated":"2024-10-31T15:52:52Z","published":"2023-11-14T21:39:15Z","title":"Low-light Pedestrian Detection in Visible and Infrared Image Feeds:\n Issues and Challenges","summary":" Pedestrian detection has become a cornerstone for several high-level tasks,\nincluding autonomous driving, intelligent transportation, and traffic\nsurveillance. 
There are several works focussed on pedestrian detection using\nvisible images, mainly in the daytime. However, this task is very intriguing\nwhen the environmental conditions change to poor lighting or nighttime.\nRecently, new ideas have been spurred to use alternative sources, such as Far\nInfraRed (FIR) temperature sensor feeds for detecting pedestrians in low-light\nconditions. This study reviews recent developments in low-light pedestrian\ndetection approaches. It systematically categorizes and analyses various\nalgorithms from region-based to non-region-based and graph-based learning\nmethodologies by highlighting their methodologies, implementation issues, and\nchallenges. It also outlines the key benchmark datasets that can be used for\nresearch and development of advanced pedestrian detection algorithms,\nparticularly in low-light situations.\n","authors":["Thangarajah Akilan","Hrishikesh Vachhani"],"pdf_url":"https://arxiv.org/pdf/2311.08557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24055v1","updated":"2024-10-31T15:48:36Z","published":"2024-10-31T15:48:36Z","title":"Advanced Predictive Quality Assessment for Ultrasonic Additive\n Manufacturing with Deep Learning Model","summary":" Ultrasonic Additive Manufacturing (UAM) employs ultrasonic welding to bond\nsimilar or dissimilar metal foils to a substrate, resulting in solid,\nconsolidated metal components. However, certain processing conditions can lead\nto inter-layer defects, affecting the final product's quality. This study\ndevelops a method to monitor in-process quality using deep learning-based\nconvolutional neural networks (CNNs). The CNN models were evaluated on their\nability to classify samples with and without embedded thermocouples across five\npower levels (300W, 600W, 900W, 1200W, 1500W) using thermal images with\nsupervised labeling. Four distinct CNN classification models were created for\ndifferent scenarios including without (baseline) and with thermocouples, only\nwithout thermocouples across power levels, only with thermocouples across power\nlevels, and combined without and with thermocouples across power levels. The\nmodels achieved 98.29% accuracy on combined baseline and thermocouple images,\n97.10% for baseline images across power levels, 97.43% for thermocouple images,\nand 97.27% for both types across power levels. The high accuracy, above 97%,\ndemonstrates the system's effectiveness in identifying and classifying\nconditions within the UAM process, providing a reliable tool for quality\nassurance and process control in manufacturing environments.\n","authors":["Lokendra Poudel","Sushant Jha","Ryan Meeker","Duy-Nhat Phan","Rahul Bhowmik"],"pdf_url":"https://arxiv.org/pdf/2410.24055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16129v2","updated":"2024-10-31T15:46:45Z","published":"2024-06-23T15:03:35Z","title":"UDHF2-Net: Uncertainty-diffusion-model-based High-Frequency TransFormer\n Network for Remotely Sensed Imagery Interpretation","summary":" Remotely sensed imagery interpretation (RSII) faces the three major problems:\n(1) objective representation of spatial distribution patterns; (2) edge\nuncertainty problem caused by downsampling encoder and intrinsic edge noises\n(e.g., mixed pixel and edge occlusion etc.); and (3) false detection problem\ncaused by geometric registration error in change detection. 
To solve the\naforementioned problems, we are the first to propose the\nuncertainty-diffusion-model-based high-Frequency TransFormer network\n(UDHF2-Net), whose advantages are as follows: (1) a spatially-stationary-and-non-stationary\nhigh-frequency connection paradigm (SHCP) is proposed to enhance the\ninteraction of spatially frequency-wise stationary and non-stationary features\nto yield high-fidelity edge extraction results. Inspired by HRFormer, SHCP\nreplaces HRFormer's high-resolution-wise stream with a high-frequency-wise\nstream throughout the encoder-decoder process, using parallel frequency-wise\nhigh-to-low streams, so it improves edge extraction accuracy by\ncontinuously retaining high-frequency information; (2) a\nmask-and-geo-knowledge-based uncertainty diffusion module (MUDM), which is a\nself-supervised learning strategy, is proposed to improve the edge accuracy of\nextraction and change detection by gradually removing the simulated spectrum\nnoises based on geo-knowledge and the generated diffused spectrum noises; (3) a\nfrequency-wise semi-pseudo-Siamese UDHF2-Net is proposed, for the first time, to\nbalance accuracy and complexity for change detection. Besides the\naforementioned spectrum noises in semantic segmentation, MUDM is also a\nself-supervised learning strategy to effectively reduce the edge false change\ndetection from the generated imagery with geometric registration error.\n","authors":["Pengfei Zhang","Chang Li","Yongjun Zhang","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2406.16129v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09168v2","updated":"2024-10-31T15:44:36Z","published":"2024-06-13T14:30:35Z","title":"SR-CACO-2: A Dataset for Confocal Fluorescence Microscopy Image\n Super-Resolution","summary":" Confocal fluorescence microscopy is one of the most accessible and widely\nused imaging techniques for the study of biological processes at the cellular\nand subcellular levels. Scanning confocal microscopy allows the capture of\nhigh-quality images from thick three-dimensional (3D) samples, yet suffers from\nwell-known limitations such as photobleaching and phototoxicity of specimens\ncaused by intense light exposure, limiting its applications. Cellular damage\ncan be alleviated by changing imaging parameters to reduce light exposure,\noften at the expense of image quality. Machine/deep learning methods for\nsingle-image super-resolution (SISR) can be applied to restore image quality by\nupscaling lower-resolution (LR) images to yield high-resolution images (HR).\nThese SISR methods have been successfully applied to photo-realistic images due\npartly to the abundance of publicly available data. In contrast, the lack of\npublicly available data partly limits their application and success in scanning\nconfocal microscopy. In this paper, we introduce a large scanning confocal\nmicroscopy dataset named SR-CACO-2 that comprises low- and\nhigh-resolution image pairs marked for three different fluorescent markers. It\nallows the evaluation of performance of SISR methods on three different\nupscaling levels (X2, X4, X8). SR-CACO-2 contains the human epithelial cell\nline Caco-2 (ATCC HTB-37), and it is composed of 2,200 unique images, captured\nwith four resolutions and three markers, forming 9,937 image patches for SISR\nmethods. We provide benchmarking results for 16 state-of-the-art methods of the\nmain SISR families. Results show that these methods have limited success in\nproducing high-resolution textures. 
The dataset is freely accessible under a\nCreative Commons license (CC BY-NC-SA 4.0). Our dataset, code and pretrained\nweights for SISR methods are available: https://github.com/sbelharbi/sr-caco-2.\n","authors":["Soufiane Belharbi","Mara KM Whitford","Phuong Hoang","Shakeeb Murtaza","Luke McCaffrey","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2406.09168v2.pdf","comment":"27 pages, 15 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24046v1","updated":"2024-10-31T15:42:24Z","published":"2024-10-31T15:42:24Z","title":"Deep Learning with HM-VGG: AI Strategies for Multi-modal Image Analysis","summary":" This study introduces the Hybrid Multi-modal VGG (HM-VGG) model, a\ncutting-edge deep learning approach for the early diagnosis of glaucoma. The\nHM-VGG model utilizes an attention mechanism to process Visual Field (VF) data,\nenabling the extraction of key features that are vital for identifying early\nsigns of glaucoma. Despite the common reliance on large annotated datasets, the\nHM-VGG model excels in scenarios with limited data, achieving remarkable\nresults with small sample sizes. The model's performance is underscored by its\nhigh metrics in Precision, Accuracy, and F1-Score, indicating its potential for\nreal-world application in glaucoma detection. The paper also discusses the\nchallenges associated with ophthalmic image analysis, particularly the\ndifficulty of obtaining large volumes of annotated data. It highlights the\nimportance of moving beyond single-modality data, such as VF or Optical\nCoherence Tomography (OCT) images alone, to a multimodal approach that can\nprovide a richer, more comprehensive dataset. This integration of different\ndata types is shown to significantly enhance diagnostic accuracy. The HM-VGG\nmodel offers a promising tool for doctors, streamlining the diagnostic process\nand improving patient outcomes. Furthermore, its applicability extends to\ntelemedicine and mobile healthcare, making diagnostic services more accessible.\nThe research presented in this paper is a significant step forward in the field\nof medical image processing and has profound implications for clinical\nophthalmology.\n","authors":["Junliang Du","Yiru Cang","Tong Zhou","Jiacheng Hu","Weijie He"],"pdf_url":"https://arxiv.org/pdf/2410.24046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24037v1","updated":"2024-10-31T15:34:49Z","published":"2024-10-31T15:34:49Z","title":"TPC: Test-time Procrustes Calibration for Diffusion-based Human Image\n Animation","summary":" Human image animation aims to generate a human motion video from the inputs\nof a reference human image and a target motion video. Current diffusion-based\nimage animation systems exhibit high precision in transferring human identity\ninto targeted motion, yet they still exhibit irregular quality in their\noutputs. Their optimal precision is achieved only when the physical\ncompositions (i.e., scale and rotation) of the human shapes in the reference\nimage and target pose frame are aligned. In the absence of such alignment,\nthere is a noticeable decline in fidelity and consistency. Especially, in\nreal-world environments, this compositional misalignment commonly occurs,\nposing significant challenges to the practical usage of current systems. 
To\nthis end, we propose Test-time Procrustes Calibration (TPC), which enhances the\nrobustness of diffusion-based image animation systems by maintaining optimal\nperformance even when faced with compositional misalignment, effectively\naddressing real-world scenarios. The TPC provides a calibrated reference image\nfor the diffusion model, enhancing its capability to understand the\ncorrespondence between human shapes in the reference and target images. Our\nmethod is simple and can be applied to any diffusion-based image animation\nsystem in a model-agnostic manner, improving the effectiveness at test time\nwithout additional training.\n","authors":["Sunjae Yoon","Gwanhyeong Koo","Younghwan Lee","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2410.24037v1.pdf","comment":"24 pages, 16 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24034v1","updated":"2024-10-31T15:32:14Z","published":"2024-10-31T15:32:14Z","title":"Handwriting Recognition in Historical Documents with Multimodal LLM","summary":" There is an immense quantity of historical and cultural documentation that\nexists only as handwritten manuscripts. At the same time, performing OCR across\nscripts and different handwriting styles has proven to be an enormously\ndifficult problem relative to the process of digitizing print. While recent\nTransformer-based models have achieved relatively strong performance, they rely\nheavily on manually transcribed training data and have difficulty generalizing\nacross writers. Multimodal LLMs, such as GPT-4v and Gemini, have demonstrated\neffectiveness in performing OCR and computer vision tasks with few-shot\nprompting. In this paper, I evaluate the accuracy of handwritten document\ntranscriptions generated by Gemini against the current state-of-the-art\nTransformer-based methods.\n Keywords: Optical Character Recognition, Multimodal Language Models, Cultural\nPreservation, Mass digitization, Handwriting Recognition\n","authors":["Lucian Li"],"pdf_url":"https://arxiv.org/pdf/2410.24034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24031v1","updated":"2024-10-31T15:29:51Z","published":"2024-10-31T15:29:51Z","title":"A Multi-Modal Approach for Face Anti-Spoofing in Non-Calibrated Systems\n using Disparity Maps","summary":" Face recognition technologies are increasingly used in various applications,\nyet they are vulnerable to face spoofing attacks. These spoofing attacks often\ninvolve unique 3D structures, such as printed papers or mobile device screens.\nAlthough stereo-depth cameras can detect such attacks effectively, their\nhigh cost limits their widespread adoption. Conversely, two-sensor systems\nwithout extrinsic calibration offer a cost-effective alternative but are unable\nto calculate depth using stereo techniques. In this work, we propose a method\nto overcome this challenge by leveraging facial attributes to derive disparity\ninformation and estimate relative depth for anti-spoofing purposes, using\nnon-calibrated systems. We introduce a multi-modal anti-spoofing model, coined\nDisparity Model, that incorporates created disparity maps as a third modality\nalongside the two original sensor modalities. We demonstrate the effectiveness\nof the Disparity Model in countering various spoof attacks using a\ncomprehensive dataset collected from the Intel RealSense ID Solution F455. Our\nmethod outperformed existing methods in the literature, achieving an Equal\nError Rate (EER) of 1.71% and a False Negative Rate (FNR) of 2.77% at a False\nPositive Rate (FPR) of 1%. 
These errors are lower by 2.45% and 7.94% than the\nerrors of the best comparison method, respectively. Additionally, we introduce\na model ensemble that addresses 3D spoof attacks as well, achieving an EER of\n2.04% and an FNR of 3.83% at an FPR of 1%. Overall, our work provides a\nstate-of-the-art solution for the challenging task of anti-spoofing in\nnon-calibrated systems that lack depth information.\n","authors":["Ariel Larey","Eyal Rond","Omer Achrack"],"pdf_url":"https://arxiv.org/pdf/2410.24031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24018v1","updated":"2024-10-31T15:20:43Z","published":"2024-10-31T15:20:43Z","title":"Bayesian-guided Label Mapping for Visual Reprogramming","summary":" Visual reprogramming (VR) leverages the intrinsic capabilities of pretrained\nvision models by adapting their input or output interfaces to solve downstream\ntasks whose labels (i.e., downstream labels) might be totally different from\nthe labels associated with the pretrained models (i.e., pretrained labels).\nWhen adapting the output interface, label mapping methods transform the\npretrained labels to downstream labels by establishing a gradient-free\none-to-one correspondence between the two sets of labels. However, in this\npaper, we reveal that one-to-one mappings may overlook the complex relationship\nbetween pretrained and downstream labels. Motivated by this observation, we\npropose a Bayesian-guided Label Mapping (BLM) method. BLM constructs an\niteratively-updated probabilistic label mapping matrix, with each element\nquantifying a pairwise relationship between pretrained and downstream labels.\nThe assignment of values to the constructed matrix is guided by Bayesian\nconditional probability, considering the joint distribution of the downstream\nlabels and the labels predicted by the pretrained model on downstream samples.\nExperiments conducted on both pretrained vision models (e.g., ResNeXt) and\nvision-language models (e.g., CLIP) demonstrate the superior performance of BLM\nover existing label mapping methods. The success of BLM also offers a\nprobabilistic lens through which to understand and analyze the effectiveness of\nVR. Our code is available at https://github.com/tmlr-group/BayesianLM.\n","authors":["Chengyi Cai","Zesheng Ye","Lei Feng","Jianzhong Qi","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.24018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24015v1","updated":"2024-10-31T15:17:14Z","published":"2024-10-31T15:17:14Z","title":"Unveiling Synthetic Faces: How Synthetic Datasets Can Expose Real\n Identities","summary":" Synthetic data generation is gaining increasing popularity in different\ncomputer vision applications. Existing state-of-the-art face recognition models\nare trained using large-scale face datasets, which are crawled from the\nInternet and raise privacy and ethical concerns. To address such concerns,\nseveral works have proposed generating synthetic face datasets to train face\nrecognition models. However, these methods depend on generative models, which\nare trained on real face images. In this work, we design a simple yet effective\nmembership inference attack to systematically study if any of the existing\nsynthetic face recognition datasets leak any information from the real data\nused to train the generator model. We provide an extensive study on 6\nstate-of-the-art synthetic face recognition datasets, and show that in all\nthese synthetic datasets, several samples from the original real dataset are\nleaked. 
To our knowledge, this paper is the first work which shows the leakage\nfrom training data of generator models into the generated synthetic face\nrecognition datasets. Our study demonstrates privacy pitfalls in synthetic face\nrecognition datasets and paves the way for future studies on generating\nresponsible synthetic face datasets.\n","authors":["Hatef Otroshi Shahreza","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2410.24015v1.pdf","comment":"Accepted in NeurIPS 2024 Workshop on New Frontiers in Adversarial\n Machine Learning"},{"id":"http://arxiv.org/abs/2410.24010v1","updated":"2024-10-31T15:10:38Z","published":"2024-10-31T15:10:38Z","title":"Re-assembling the past: The RePAIR dataset and benchmark for real world\n 2D and 3D puzzle solving","summary":" This paper proposes the RePAIR dataset that represents a challenging\nbenchmark to test modern computational and data driven methods for\npuzzle-solving and reassembly tasks. Our dataset has unique properties that are\nuncommon to current benchmarks for 2D and 3D puzzle solving. The fragments and\nfractures are realistic, caused by a collapse of a fresco during a World War II\nbombing at the Pompeii archaeological park. The fragments are also eroded and\nhave missing pieces with irregular shapes and different dimensions, challenging\nfurther the reassembly algorithms. The dataset is multi-modal providing high\nresolution images with characteristic pictorial elements, detailed 3D scans of\nthe fragments and meta-data annotated by the archaeologists. Ground truth has\nbeen generated through several years of unceasing fieldwork, including the\nexcavation and cleaning of each fragment, followed by manual puzzle solving by\narchaeologists of a subset of approx. 1000 pieces among the 16000 available.\nAfter digitizing all the fragments in 3D, a benchmark was prepared to challenge\ncurrent reassembly and puzzle-solving methods that often solve more simplistic\nsynthetic scenarios. The tested baselines show that there clearly exists a gap\nto fill in solving this computationally complex problem.\n","authors":["Theodore Tsesmelis","Luca Palmieri","Marina Khoroshiltseva","Adeela Islam","Gur Elkin","Ofir Itzhak Shahar","Gianluca Scarpellini","Stefano Fiorini","Yaniv Ohayon","Nadav Alali","Sinem Aslan","Pietro Morerio","Sebastiano Vascon","Elena Gravina","Maria Cristina Napolitano","Giuseppe Scarpati","Gabriel Zuchtriegel","Alexandra Spühler","Michel E. Fuchs","Stuart James","Ohad Ben-Shahar","Marcello Pelillo","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2410.24010v1.pdf","comment":"NeurIPS 2024, Track Datasets and Benchmarks, 10 pages"},{"id":"http://arxiv.org/abs/2410.24006v1","updated":"2024-10-31T15:09:36Z","published":"2024-10-31T15:09:36Z","title":"DiffPAD: Denoising Diffusion-based Adversarial Patch Decontamination","summary":" In the ever-evolving adversarial machine learning landscape, developing\neffective defenses against patch attacks has become a critical challenge,\nnecessitating reliable solutions to safeguard real-world AI systems. Although\ndiffusion models have shown remarkable capacity in image synthesis and have\nbeen recently utilized to counter $\\ell_p$-norm bounded attacks, their\npotential in mitigating localized patch attacks remains largely underexplored.\nIn this work, we propose DiffPAD, a novel framework that harnesses the power of\ndiffusion models for adversarial patch decontamination. 
DiffPAD first performs\nsuper-resolution restoration on downsampled input images, then adopts\nbinarization, dynamic thresholding scheme and sliding window for effective\nlocalization of adversarial patches. Such a design is inspired by the\ntheoretically derived correlation between patch size and diffusion restoration\nerror that is generalized across diverse patch attack scenarios. Finally,\nDiffPAD applies inpainting techniques to the original input images with the\nestimated patch region being masked. By integrating closed-form solutions for\nsuper-resolution restoration and image inpainting into the conditional reverse\nsampling process of a pre-trained diffusion model, DiffPAD obviates the need\nfor text guidance or fine-tuning. Through comprehensive experiments, we\ndemonstrate that DiffPAD not only achieves state-of-the-art adversarial\nrobustness against patch attacks but also excels in recovering naturalistic\nimages without patch remnants.\n","authors":["Jia Fu","Xiao Zhang","Sepideh Pashami","Fatemeh Rahimian","Anders Holst"],"pdf_url":"https://arxiv.org/pdf/2410.24006v1.pdf","comment":"Accepted to 2025 IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV)"},{"id":"http://arxiv.org/abs/2405.10723v2","updated":"2024-10-31T15:05:50Z","published":"2024-05-17T12:11:58Z","title":"Eddeep: Fast eddy-current distortion correction for diffusion MRI with\n deep learning","summary":" Modern diffusion MRI sequences commonly acquire a large number of volumes\nwith diffusion sensitization gradients of differing strengths or directions.\nSuch sequences rely on echo-planar imaging (EPI) to achieve reasonable scan\nduration. However, EPI is vulnerable to off-resonance effects, leading to\ntissue susceptibility and eddy-current induced distortions. The latter is\nparticularly problematic because it causes misalignment between volumes,\ndisrupting downstream modelling and analysis. The essential correction of eddy\ndistortions is typically done post-acquisition, with image registration.\nHowever, this is non-trivial because correspondence between volumes can be\nseverely disrupted due to volume-specific signal attenuations induced by\nvarying directions and strengths of the applied gradients. This challenge has\nbeen successfully addressed by the popular FSL~Eddy tool but at considerable\ncomputational cost. We propose an alternative approach, leveraging recent\nadvances in image processing enabled by deep learning (DL). It consists of two\nconvolutional neural networks: 1) An image translator to restore correspondence\nbetween images; 2) A registration model to align the translated images. Results\ndemonstrate comparable distortion estimates to FSL~Eddy, while requiring only\nmodest training sample sizes. This work, to the best of our knowledge, is the\nfirst to tackle this problem with deep learning. 
Together with recently\ndeveloped DL-based susceptibility correction techniques, they pave the way for\nreal-time preprocessing of diffusion MRI, facilitating its wider uptake in the\nclinic.\n","authors":["Antoine Legouhy","Ross Callaghan","Whitney Stee","Philippe Peigneux","Hojjat Azadbakht","Hui Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.10723v2.pdf","comment":"accepted in MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2410.24002v1","updated":"2024-10-31T15:02:16Z","published":"2024-10-31T15:02:16Z","title":"Assessing the Efficacy of Classical and Deep Neuroimaging Biomarkers in\n Early Alzheimer's Disease Diagnosis","summary":" Alzheimer's disease (AD) is the leading cause of dementia, and its early\ndetection is crucial for effective intervention, yet current diagnostic methods\noften fall short in sensitivity and specificity. This study aims to detect\nsignificant indicators of early AD by extracting and integrating various\nimaging biomarkers, including radiomics, hippocampal texture descriptors,\ncortical thickness measurements, and deep learning features. We analyze\nstructural magnetic resonance imaging (MRI) scans from the Alzheimer's Disease\nNeuroimaging Initiative (ADNI) cohorts, utilizing comprehensive image analysis\nand machine learning techniques. Our results show that combining multiple\nbiomarkers significantly improves detection accuracy. Radiomics and texture\nfeatures emerged as the most effective predictors for early AD, achieving AUCs\nof 0.88 and 0.72 for AD and MCI detection, respectively. Although deep learning\nfeatures proved to be less effective than traditional approaches, incorporating\nage with other biomarkers notably enhanced MCI detection performance.\nAdditionally, our findings emphasize the continued importance of classical\nimaging biomarkers in the face of modern deep-learning approaches, providing a\nrobust framework for early AD diagnosis.\n","authors":["Milla E. Nielsen","Mads Nielsen","Mostafa Mehdipour Ghazi"],"pdf_url":"https://arxiv.org/pdf/2410.24002v1.pdf","comment":"SPIE Medical Imaging (MI25)"},{"id":"http://arxiv.org/abs/2410.24001v1","updated":"2024-10-31T15:02:05Z","published":"2024-10-31T15:02:05Z","title":"ImOV3D: Learning Open-Vocabulary Point Clouds 3D Object Detection from\n Only 2D Images","summary":" Open-vocabulary 3D object detection (OV-3Det) aims to generalize beyond the\nlimited number of base categories labeled during the training phase. The\nbiggest bottleneck is the scarcity of annotated 3D data, whereas 2D image\ndatasets are abundant and richly annotated. Consequently, it is intuitive to\nleverage the wealth of annotations in 2D images to alleviate the inherent data\nscarcity in OV-3Det. In this paper, we push the task setup to its limits by\nexploring the potential of using solely 2D images to learn OV-3Det. The major\nchallenge for this setup is the modality gap between training images and\ntesting point clouds, which prevents effective integration of 2D knowledge into\nOV-3Det. To address this challenge, we propose a novel framework ImOV3D to\nleverage pseudo multimodal representation containing both images and point\nclouds (PC) to close the modality gap. The key of ImOV3D lies in flexible\nmodality conversion where 2D images can be lifted into 3D using monocular depth\nestimation and can also be derived from 3D scenes through rendering. 
This\nallows unifying both training images and testing point clouds into a common\nimage-PC representation, encompassing a wealth of 2D semantic information and\nalso incorporating the depth and structural characteristics of 3D spatial data.\nWe carefully conduct such conversion to minimize the domain gap between\ntraining and test cases. Extensive experiments on two benchmark datasets,\nSUNRGBD and ScanNet, show that ImOV3D significantly outperforms existing\nmethods, even in the absence of ground truth 3D training data. With the\ninclusion of a minimal amount of real 3D data for fine-tuning, the performance\nalso significantly surpasses previous state-of-the-art. Codes and pre-trained\nmodels are released on the https://github.com/yangtiming/ImOV3D.\n","authors":["Timing Yang","Yuanliang Ju","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2410.24001v1.pdf","comment":"Accepted by NeurIPS 2024. Code link\n https://github.com/yangtiming/ImOV3D"},{"id":"http://arxiv.org/abs/2211.15656v3","updated":"2024-10-31T15:01:41Z","published":"2022-11-28T18:59:02Z","title":"SuperFusion: Multilevel LiDAR-Camera Fusion for Long-Range HD Map\n Generation","summary":" High-definition (HD) semantic map generation of the environment is an\nessential component of autonomous driving. Existing methods have achieved good\nperformance in this task by fusing different sensor modalities, such as LiDAR\nand camera. However, current works are based on raw data or network\nfeature-level fusion and only consider short-range HD map generation, limiting\ntheir deployment to realistic autonomous driving applications. In this paper,\nwe focus on the task of building the HD maps in both short ranges, i.e., within\n30 m, and also predicting long-range HD maps up to 90 m, which is required by\ndownstream path planning and control tasks to improve the smoothness and safety\nof autonomous driving. To this end, we propose a novel network named\nSuperFusion, exploiting the fusion of LiDAR and camera data at multiple levels.\nWe use LiDAR depth to improve image depth estimation and use image features to\nguide long-range LiDAR feature prediction. We benchmark our SuperFusion on the\nnuScenes dataset and a self-recorded dataset and show that it outperforms the\nstate-of-the-art baseline methods with large margins on all intervals.\nAdditionally, we apply the generated HD map to a downstream path planning task,\ndemonstrating that the long-range HD maps predicted by our method can lead to\nbetter path planning for autonomous vehicles. Our code has been released at\nhttps://github.com/haomo-ai/SuperFusion.\n","authors":["Hao Dong","Weihao Gu","Xianjing Zhang","Jintao Xu","Rui Ai","Huimin Lu","Juho Kannala","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2211.15656v3.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2407.03550v2","updated":"2024-10-31T14:51:32Z","published":"2024-07-04T00:07:50Z","title":"CoMix: A Comprehensive Benchmark for Multi-Task Comic Understanding","summary":" The comic domain is rapidly advancing with the development of single-page\nanalysis and synthesis models. However, evaluation metrics and datasets lag\nbehind, often limited to small-scale or single-style test sets. We introduce a\nnovel benchmark, CoMix, designed to evaluate the multi-task capabilities of\nmodels in comic analysis. 
Unlike existing benchmarks that focus on isolated\ntasks such as object detection or text recognition, CoMix addresses a broader\nrange of tasks including object detection, speaker identification, character\nre-identification, reading order, and multi-modal reasoning tasks like\ncharacter naming and dialogue generation. Our benchmark comprises three\nexisting datasets with expanded annotations to support multi-task evaluation.\nTo mitigate the over-representation of manga-style data, we have incorporated a\nnew dataset of carefully selected American comic-style books, thereby enriching\nthe diversity of comic styles. CoMix is designed to assess pre-trained models\nin zero-shot and limited fine-tuning settings, probing their transfer\ncapabilities across different comic styles and tasks. The validation split of\nthe benchmark is publicly available for research purposes, and an evaluation\nserver for the held-out test split is also provided. Comparative results\nbetween human performance and state-of-the-art models reveal a significant\nperformance gap, highlighting substantial opportunities for advancements in\ncomic understanding. The dataset, baseline models, and code are accessible at\nhttps://github.com/emanuelevivoli/CoMix-dataset. This initiative sets a new\nstandard for comprehensive comic analysis, providing the community with a\ncommon benchmark for evaluation on a large and varied set.\n","authors":["Emanuele Vivoli","Marco Bertini","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2407.03550v2.pdf","comment":"Accepted at NeurIPS 2024 (D&B)"},{"id":"http://arxiv.org/abs/2410.23991v1","updated":"2024-10-31T14:50:48Z","published":"2024-10-31T14:50:48Z","title":"Localization, balance and affinity: a stronger multifaceted\n collaborative salient object detector in remote sensing images","summary":" Despite significant advancements in salient object detection (SOD) in optical\nremote sensing images (ORSI), challenges persist due to the intricate edge\nstructures of ORSIs and the complexity of their contextual relationships.\nCurrent deep learning approaches encounter difficulties in accurately\nidentifying boundary features and lack efficiency in collaboratively modeling\nthe foreground and background by leveraging contextual features. To address\nthese challenges, we propose a stronger multifaceted collaborative salient\nobject detector in ORSIs, termed LBA-MCNet, which incorporates aspects of\nlocalization, balance, and affinity. The network focuses on accurately locating\ntargets, balancing detailed features, and modeling image-level global context\ninformation. Specifically, we design the Edge Feature Adaptive Balancing and\nAdjusting (EFABA) module for precise edge localization, using edge features to\nguide attention to boundaries and preserve spatial details. Moreover, we design\nthe Global Distributed Affinity Learning (GDAL) module to model global context.\nIt captures global context by generating an affinity map from the encoder's\nfinal layer, ensuring effective modeling of global patterns. Additionally, deep\nsupervision during deconvolution further enhances feature representation.\nFinally, we compare with 28 state-of-the-art approaches on three publicly\navailable datasets. 
The results clearly demonstrate the superiority of our\nmethod.\n","authors":["Yakun Xie","Suning Liu","Hongyu Chen","Shaohan Cao","Huixin Zhang","Dejun Feng","Qian Wan","Jun Zhu","Qing Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.23991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15856v2","updated":"2024-10-31T14:48:23Z","published":"2023-12-26T02:50:42Z","title":"SERF: Fine-Grained Interactive 3D Segmentation and Editing with Radiance\n Fields","summary":" Although significant progress has been made in the field of 2D-based\ninteractive editing, fine-grained 3D-based interactive editing remains\nrelatively unexplored. This limitation can be attributed to two main\nchallenges: the lack of an efficient 3D representation robust to different\nmodifications and the absence of an effective 3D interactive segmentation\nmethod. In this paper, we introduce a novel fine-grained interactive 3D\nsegmentation and editing algorithm with radiance fields, which we refer to as\nSERF. Our method entails creating a neural mesh representation by integrating\nmulti-view algorithms with pre-trained 2D models. Building upon this\nrepresentation, we introduce a novel surface rendering technique that preserves\nlocal information and is robust to deformation. Moreover, this representation\nforms the basis for achieving accurate and interactive 3D segmentation without\nrequiring 3D supervision. Harnessing this representation facilitates a range of\ninteractive 3D editing operations, encompassing tasks such as interactive\ngeometry editing and texture painting. Extensive experiments and visualization\nexamples of editing on both real and synthetic data demonstrate the superiority\nof our method on representation quality and editing ability.\n","authors":["Kaichen Zhou","Lanqing Hong","Enze Xie","Yongxin Yang","Zhenguo Li","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22629v2","updated":"2024-10-31T14:44:44Z","published":"2024-10-30T01:22:37Z","title":"CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable\n Remote Sensing Semantic Segmentation","summary":" The field of Remote Sensing Domain Generalization (RSDG) has emerged as a\ncritical and valuable research frontier, focusing on developing models that\ngeneralize effectively across diverse scenarios. Despite the substantial domain\ngaps in RS images that are characterized by variabilities such as location,\nwavelength, and sensor type, research in this area remains underexplored: (1)\nCurrent cross-domain methods primarily focus on Domain Adaptation (DA), which\nadapts models to predefined domains rather than to unseen ones; (2) Few studies\ntargeting the RSDG issue, especially for semantic segmentation tasks, where\nexisting models are developed for specific unknown domains, struggling with\nissues of underfitting on other unknown scenarios; (3) Existing RS foundation\nmodels tend to prioritize in-domain performance over cross-domain\ngeneralization. To this end, we introduce the first vision foundation model for\nRSDG semantic segmentation, CrossEarth. CrossEarth demonstrates strong\ncross-domain generalization through a specially designed data-level Earth-Style\nInjection pipeline and a model-level Multi-Task Training pipeline. 
In addition,\nfor the semantic segmentation task, we have curated an RSDG benchmark\ncomprising 28 cross-domain settings across various regions, spectral bands,\nplatforms, and climates, providing a comprehensive framework for testing the\ngeneralizability of future RSDG models. Extensive experiments on this benchmark\ndemonstrate the superiority of CrossEarth over existing state-of-the-art\nmethods.\n","authors":["Ziyang Gong","Zhixiang Wei","Di Wang","Xianzheng Ma","Hongruixuan Chen","Yuru Jia","Yupeng Deng","Zhenming Ji","Xiangwei Zhu","Naoto Yokoya","Jing Zhang","Bo Du","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.22629v2.pdf","comment":"The codes and models will be available at\n https://github.com/Cuzyoung/CrossEarth"},{"id":"http://arxiv.org/abs/2410.23988v1","updated":"2024-10-31T14:42:26Z","published":"2024-10-31T14:42:26Z","title":"JEMA: A Joint Embedding Framework for Scalable Co-Learning with\n Multimodal Alignment","summary":" This work introduces JEMA (Joint Embedding with Multimodal Alignment), a\nnovel co-learning framework tailored for laser metal deposition (LMD), a\npivotal process in metal additive manufacturing. As Industry 5.0 gains traction\nin industrial applications, efficient process monitoring becomes increasingly\ncrucial. However, limited data and the opaque nature of AI present challenges\nfor its application in an industrial setting. JEMA addresses these challenges by\nleveraging multimodal data, including multi-view images and metadata such as\nprocess parameters, to learn transferable semantic representations. By applying\na supervised contrastive loss function, JEMA enables robust learning and\nsubsequent process monitoring using only the primary modality, simplifying\nhardware requirements and computational overhead. We investigate the\neffectiveness of JEMA in LMD process monitoring, focusing specifically on its\ngeneralization to downstream tasks such as melt pool geometry prediction,\nachieved without extensive fine-tuning. Our empirical evaluation demonstrates\nthe high scalability and performance of JEMA, particularly when combined with\nVision Transformer models. We report an 8% increase in performance in\nmultimodal settings and a 1% improvement in unimodal settings compared to\nsupervised contrastive learning. Additionally, the learned embedding\nrepresentation enables the prediction of metadata, enhancing interpretability\nand making it possible to assess the added metadata's contributions. Our\nframework lays the foundation for integrating multisensor data with metadata,\nenabling diverse downstream tasks within the LMD domain and beyond.\n","authors":["Joao Sousa","Roya Darabi","Armando Sousa","Frank Brueckner","Luís Paulo Reis","Ana Reis"],"pdf_url":"https://arxiv.org/pdf/2410.23988v1.pdf","comment":"26 pages, 14 figures"},{"id":"http://arxiv.org/abs/2407.12582v2","updated":"2024-10-31T14:37:42Z","published":"2024-07-17T14:09:46Z","title":"Embracing Events and Frames with Hierarchical Feature Refinement Network\n for Object Detection","summary":" In frame-based vision, object detection faces substantial performance\ndegradation under challenging conditions due to the limited sensing capability\nof conventional cameras. Event cameras output sparse and asynchronous events,\nproviding a potential solution to solve these problems. However, effectively\nfusing two heterogeneous modalities remains an open issue. 
In this work, we\npropose a novel hierarchical feature refinement network for event-frame fusion.\nThe core concept is the design of the coarse-to-fine fusion module, denoted as\nthe cross-modality adaptive feature refinement (CAFR) module. In the initial\nphase, the bidirectional cross-modality interaction (BCI) part facilitates\ninformation bridging from two distinct sources. Subsequently, the features are\nfurther refined by aligning the channel-level mean and variance in the two-fold\nadaptive feature refinement (TAFR) part. We conducted extensive experiments on\ntwo benchmarks: the low-resolution PKU-DDD17-Car dataset and the\nhigh-resolution DSEC dataset. Experimental results show that our method\nsurpasses the state-of-the-art by an impressive margin of $\\textbf{8.0}\\%$ on\nthe DSEC dataset. Moreover, our method exhibits significantly better robustness\n(\\textbf{69.5}\\% versus \\textbf{38.7}\\%) when introducing 15 different\ncorruption types to the frame images. The code can be found at the link\n(https://github.com/HuCaoFighting/FRN).\n","authors":["Hu Cao","Zehua Zhang","Yan Xia","Xinyi Li","Jiahao Xia","Guang Chen","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2407.12582v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2410.22637v2","updated":"2024-10-31T14:35:31Z","published":"2024-10-30T02:04:23Z","title":"Consistency Diffusion Bridge Models","summary":" Diffusion models (DMs) have become the dominant paradigm of generative\nmodeling in a variety of domains by learning stochastic processes from noise to\ndata. Recently, diffusion denoising bridge models (DDBMs), a new formulation of\ngenerative modeling that builds stochastic processes between fixed data\nendpoints based on a reference diffusion process, have achieved empirical\nsuccess across tasks with coupled data distribution, such as image-to-image\ntranslation. However, DDBMs' sampling process typically requires hundreds of\nnetwork evaluations to achieve decent performance, which may impede their\npractical deployment due to high computational demands. In this work, inspired\nby the recent advance of consistency models in DMs, we tackle this problem by\nlearning the consistency function of the probability-flow ordinary differential\nequation (PF-ODE) of DDBMs, which directly predicts the solution at a starting\nstep given any point on the ODE trajectory. Based on a dedicated general-form\nODE solver, we propose two paradigms: consistency bridge distillation and\nconsistency bridge training, which are flexible to apply to DDBMs with broad\ndesign choices. 
Experimental results show that our proposed method could sample\n$4\\times$ to $50\\times$ faster than the base DDBM and produce better visual\nquality given the same step in various tasks with pixel resolution ranging from\n$64 \\times 64$ to $256 \\times 256$, as well as supporting downstream tasks such\nas semantic interpolation in the data space.\n","authors":["Guande He","Kaiwen Zheng","Jianfei Chen","Fan Bao","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.22637v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23970v1","updated":"2024-10-31T14:25:55Z","published":"2024-10-31T14:25:55Z","title":"TrAct: Making First-layer Pre-Activations Trainable","summary":" We consider the training of the first layer of vision models and notice the\nclear relationship between pixel values and gradient update magnitudes: the\ngradients arriving at the weights of a first layer are by definition directly\nproportional to (normalized) input pixel values. Thus, an image with low\ncontrast has a smaller impact on learning than an image with higher contrast,\nand a very bright or very dark image has a stronger impact on the weights than\nan image with moderate brightness. In this work, we propose performing gradient\ndescent on the embeddings produced by the first layer of the model. However,\nswitching to discrete inputs with an embedding layer is not a reasonable option\nfor vision models. Thus, we propose the conceptual procedure of (i) a gradient\ndescent step on first layer activations to construct an activation proposal,\nand (ii) finding the optimal weights of the first layer, i.e., those weights\nwhich minimize the squared distance to the activation proposal. We provide a\nclosed form solution of the procedure and adjust it for robust stochastic\ntraining while computing everything efficiently. Empirically, we find that\nTrAct (Training Activations) speeds up training by factors between 1.25x and 4x\nwhile requiring only a small computational overhead. We demonstrate the utility\nof TrAct with different optimizers for a range of different vision models\nincluding convolutional and transformer architectures.\n","authors":["Felix Petersen","Christian Borgelt","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2410.23970v1.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23962v1","updated":"2024-10-31T14:14:30Z","published":"2024-10-31T14:14:30Z","title":"Image Synthesis with Class-Aware Semantic Diffusion Models for Surgical\n Scene Segmentation","summary":" Surgical scene segmentation is essential for enhancing surgical precision,\nyet it is frequently compromised by the scarcity and imbalance of available\ndata. To address these challenges, semantic image synthesis methods based on\ngenerative adversarial networks and diffusion models have been developed.\nHowever, these models often yield non-diverse images and fail to capture small,\ncritical tissue classes, limiting their effectiveness. In response, we propose\nthe Class-Aware Semantic Diffusion Model (CASDM), a novel approach which\nutilizes segmentation maps as conditions for image synthesis to tackle data\nscarcity and imbalance. Novel class-aware mean squared error and class-aware\nself-perceptual loss functions have been defined to prioritize critical, less\nvisible classes, thereby enhancing image quality and relevance. Furthermore, to\nour knowledge, we are the first to generate multi-class segmentation maps using\ntext prompts in a novel fashion to specify their contents. 
These maps are then\nused by CASDM to generate surgical scene images, enhancing datasets for\ntraining and validating segmentation models. Our evaluation, which assesses\nboth image quality and downstream segmentation performance, demonstrates the\nstrong effectiveness and generalisability of CASDM in producing realistic\nimage-map pairs, significantly advancing surgical scene segmentation across\ndiverse and challenging datasets.\n","authors":["Yihang Zhou","Rebecca Towning","Zaid Awad","Stamatia Giannarou"],"pdf_url":"https://arxiv.org/pdf/2410.23962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23946v1","updated":"2024-10-31T14:02:40Z","published":"2024-10-31T14:02:40Z","title":"MV-CC: Mask Enhanced Video Model for Remote Sensing Change Caption","summary":" Remote sensing image change caption (RSICC) aims to provide natural language\ndescriptions for bi-temporal remote sensing images. Since Change Caption (CC)\ntask requires both spatial and temporal features, previous works follow an\nencoder-fusion-decoder architecture. They use an image encoder to extract\nspatial features and the fusion module to integrate spatial features and\nextract temporal features, which leads to increasingly complex manual design of\nthe fusion module. In this paper, we introduce a novel video model-based\nparadigm without design of the fusion module and propose a Mask-enhanced Video\nmodel for Change Caption (MV-CC). Specifically, we use the off-the-shelf video\nencoder to simultaneously extract the temporal and spatial features of\nbi-temporal images. Furthermore, the types of changes in the CC are set based\non specific task requirements, and to enable the model to better focus on the\nregions of interest, we employ masks obtained from the Change Detection (CD)\nmethod to explicitly guide the CC model. Experimental results demonstrate that\nour proposed method can obtain better performance compared with other\nstate-of-the-art RSICC methods. The code is available at\nhttps://github.com/liuruixun/MV-CC.\n","authors":["Ruixun Liu","Kaiyu Li","Jiayi Song","Dongwei Sun","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2410.23946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03438v2","updated":"2024-10-31T13:41:19Z","published":"2024-10-04T13:52:22Z","title":"Dessie: Disentanglement for Articulated 3D Horse Shape and Pose\n Estimation from Images","summary":" In recent years, 3D parametric animal models have been developed to aid in\nestimating 3D shape and pose from images and video. While progress has been\nmade for humans, it's more challenging for animals due to limited annotated\ndata. To address this, we introduce the first method using synthetic data\ngeneration and disentanglement to learn to regress 3D shape and pose. Focusing\non horses, we use text-based texture generation and a synthetic data pipeline\nto create varied shapes, poses, and appearances, learning disentangled spaces.\nOur method, Dessie, surpasses existing 3D horse reconstruction methods and\ngeneralizes to other large animals like zebras, cows, and deer. 
See the project\nwebsite at: \\url{https://celiali.github.io/Dessie/}.\n","authors":["Ci Li","Yi Yang","Zehang Weng","Elin Hernlund","Silvia Zuffi","Hedvig Kjellström"],"pdf_url":"https://arxiv.org/pdf/2410.03438v2.pdf","comment":"ACCV2024"},{"id":"http://arxiv.org/abs/2410.23931v1","updated":"2024-10-31T13:41:16Z","published":"2024-10-31T13:41:16Z","title":"Manipulating Vehicle 3D Shapes through Latent Space Editing","summary":" Although 3D object editing has the potential to significantly influence\nvarious industries, recent research in 3D generation and editing has primarily\nfocused on converting text and images into 3D models, often overlooking the\nneed for fine-grained control over the editing of existing 3D objects. This\npaper introduces a framework that employs a pre-trained regressor, enabling\ncontinuous, precise, attribute-specific modifications to both the stylistic and\ngeometric attributes of vehicle 3D models. Our method not only preserves the\ninherent identity of vehicle 3D objects, but also supports multi-attribute\nediting, allowing for extensive customization without compromising the model's\nstructural integrity. Experimental results demonstrate the efficacy of our\napproach in achieving detailed edits on various vehicle 3D models.\n","authors":["JiangDong Miao","Tatsuya Ikeda","Bisser Raytchev","Ryota Mizoguchi","Takenori Hiraoka","Takuji Nakashima","Keigo Shimizu","Toru Higaki","Kazufumi Kaneda"],"pdf_url":"https://arxiv.org/pdf/2410.23931v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2410.23918v1","updated":"2024-10-31T13:26:11Z","published":"2024-10-31T13:26:11Z","title":"BitStack: Fine-Grained Size Control for Compressed Large Language Models\n in Variable Memory Environments","summary":" Large language models (LLMs) have revolutionized numerous applications, yet\ntheir deployment remains challenged by memory constraints on local devices.\nWhile scaling laws have enhanced LLM capabilities, the primary bottleneck has\nshifted from \\textit{capability} to \\textit{availability}, emphasizing the need\nfor efficient memory management. Traditional compression methods, such as\nquantization, often require predefined compression ratios and separate\ncompression processes for each setting, complicating deployment in variable\nmemory environments. In this paper, we introduce \\textbf{BitStack}, a novel,\ntraining-free weight compression approach that enables megabyte-level\ntrade-offs between memory usage and model performance. By leveraging weight\ndecomposition, BitStack can dynamically adjust the model size with minimal\ntransmission between running memory and storage devices. Our approach\niteratively decomposes weight matrices while considering the significance of\neach parameter, resulting in an approximately 1-bit per parameter residual\nblock in each decomposition iteration. These blocks are sorted and stacked in\nstorage as basic transmission units, with different quantities loaded based on\ncurrent memory availability. Extensive experiments across a wide range of tasks\ndemonstrate that, despite offering fine-grained size control, BitStack\nconsistently matches or surpasses strong quantization baselines, particularly\nat extreme compression ratios. To the best of our knowledge, this is the first\ndecomposition-based method that effectively bridges the gap to practical\ncompression techniques like quantization. 
Code is available at\nhttps://github.com/xinghaow99/BitStack.\n","authors":["Xinghao Wang","Pengyu Wang","Bo Wang","Dong Zhang","Yunhua Zhou","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2410.23918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04230v2","updated":"2024-10-31T13:18:14Z","published":"2024-06-06T16:30:41Z","title":"M3LEO: A Multi-Modal, Multi-Label Earth Observation Dataset Integrating\n Interferometric SAR and Multispectral Data","summary":" Satellite-based remote sensing has revolutionised the way we address global\nchallenges. Huge quantities of Earth Observation (EO) data are generated by\nsatellite sensors daily, but processing these large datasets for use in ML\npipelines is technically and computationally challenging. While some\npreprocessed Earth observation datasets exist, their content is often limited\nto optical or near-optical wavelength data, which is ineffective at night or in\nadverse weather conditions. Synthetic Aperture Radar (SAR), an active sensing\ntechnique based on microwave length radiation, offers a viable alternative.\nHowever, the application of machine learning to SAR has been limited due to a\nlack of ML-ready data and pipelines, particularly for the full diversity of SAR\ndata, including polarimetry, coherence and interferometry. In this work, we\nintroduce M3LEO, a multi-modal, multi-label Earth observation dataset that\nincludes polarimetric, interferometric, and coherence SAR data derived from\nSentinel-1, alongside multispectral Sentinel-2 imagery and auxiliary data\ndescribing terrain properties such as land use. M3LEO spans approximately 17M\n4x4 km data chips from six diverse geographic regions. The dataset is\ncomplemented by a flexible PyTorch Lightning framework configured using Hydra\nto accommodate its use across diverse ML applications in Earth observation. We\nprovide tools to process any dataset available on popular platforms such as\nGoogle Earth Engine for seamless integration with our framework. We show that\nthe distribution shift in self-supervised embeddings is substantial across\ngeographic regions, even when controlling for terrain properties. Data:\nhuggingface.co/M3LEO, Code: github.com/spaceml-org/M3LEO.\n","authors":["Matthew J Allen","Francisco Dorr","Joseph Alejandro Gallego Mejia","Laura Martínez-Ferrer","Anna Jungbluth","Freddie Kalaitzis","Raúl Ramos-Pollán"],"pdf_url":"https://arxiv.org/pdf/2406.04230v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.23910v1","updated":"2024-10-31T13:13:32Z","published":"2024-10-31T13:13:32Z","title":"Uncertainty Estimation for 3D Object Detection via Evidential Learning","summary":" 3D object detection is an essential task for computer vision applications in\nautonomous vehicles and robotics. However, models often struggle to quantify\ndetection reliability, leading to poor performance on unfamiliar scenes. We\nintroduce a framework for quantifying uncertainty in 3D object detection by\nleveraging an evidential learning loss on Bird's Eye View representations in\nthe 3D detector. These uncertainty estimates require minimal computational\noverhead and are generalizable across different architectures. We demonstrate\nboth the efficacy and importance of these uncertainty estimates on identifying\nout-of-distribution scenes, poorly localized objects, and missing (false\nnegative) detections; our framework consistently improves over baselines by\n10-20% on average. 
Finally, we integrate this suite of tasks into a system\nwhere a 3D object detector auto-labels driving scenes and our uncertainty\nestimates verify label correctness before the labels are used to train a second\nmodel. Here, our uncertainty-driven verification results in a 1% improvement in\nmAP and a 1-2% improvement in NDS.\n","authors":["Nikita Durasov","Rafid Mahmood","Jiwoong Choi","Marc T. Law","James Lucas","Pascal Fua","Jose M. Alvarez"],"pdf_url":"https://arxiv.org/pdf/2410.23910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23906v1","updated":"2024-10-31T13:11:09Z","published":"2024-10-31T13:11:09Z","title":"From Web Data to Real Fields: Low-Cost Unsupervised Domain Adaptation\n for Agricultural Robots","summary":" In precision agriculture, vision models often struggle with new, unseen\nfields where crops and weeds have been influenced by external factors,\nresulting in compositions and appearances that differ from the learned\ndistribution. This paper aims to adapt to specific fields at low cost using\nUnsupervised Domain Adaptation (UDA). We explore a novel domain shift from a\ndiverse, large pool of internet-sourced data to a small set of data collected\nby a robot at specific locations, minimizing the need for extensive on-field\ndata collection. Additionally, we introduce a novel module -- the Multi-level\nAttention-based Adversarial Discriminator (MAAD) -- which can be integrated at\nthe feature extractor level of any detection model. In this study, we\nincorporate MAAD with CenterNet to simultaneously detect leaf, stem, and vein\ninstances. Our results show significant performance improvements in the\nunlabeled target domain compared to baseline models, with a 7.5% increase in\nobject detection accuracy and a 5.1% improvement in keypoint detection.\n","authors":["Vasileios Tzouras","Lazaros Nalpantidis","Ronja Güldenring"],"pdf_url":"https://arxiv.org/pdf/2410.23906v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2410.23905v1","updated":"2024-10-31T13:10:50Z","published":"2024-10-31T13:10:50Z","title":"Text-DiFuse: An Interactive Multi-Modal Image Fusion Framework based on\n Text-modulated Diffusion Model","summary":" Existing multi-modal image fusion methods fail to address the compound\ndegradations presented in source images, resulting in fusion images plagued by\nnoise, color bias, improper exposure, \\textit{etc}. Additionally, these methods\noften overlook the specificity of foreground objects, weakening the salience of\nthe objects of interest within the fused images. To address these challenges,\nthis study proposes a novel interactive multi-modal image fusion framework\nbased on the text-modulated diffusion model, called Text-DiFuse. First, this\nframework integrates feature-level information integration into the diffusion\nprocess, allowing adaptive degradation removal and multi-modal information\nfusion. This is the first attempt to deeply and explicitly embed information\nfusion within the diffusion process, effectively addressing compound\ndegradation in image fusion. Second, by embedding the combination of the text\nand zero-shot location model into the diffusion fusion process, a\ntext-controlled fusion re-modulation strategy is developed. This enables\nuser-customized text control to improve fusion performance and highlight\nforeground objects in the fused images. 
Extensive experiments on diverse public\ndatasets show that our Text-DiFuse achieves state-of-the-art fusion performance\nacross various scenarios with complex degradation. Moreover, the semantic\nsegmentation experiment validates the significant enhancement in semantic\nperformance achieved by our text-controlled fusion re-modulation strategy. The\ncode is publicly available at https://github.com/Leiii-Cao/Text-DiFuse.\n","authors":["Hao Zhang","Lei Cao","Jiayi Ma"],"pdf_url":"https://arxiv.org/pdf/2410.23905v1.pdf","comment":"Accepted by the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2303.15124v2","updated":"2024-10-31T13:10:11Z","published":"2023-03-27T11:56:20Z","title":"Blind Inpainting with Object-aware Discrimination for Artificial Marker\n Removal","summary":" Medical images often incorporate doctor-added markers that can hinder\nAI-based diagnosis. This issue highlights the need for inpainting techniques to\nrestore the corrupted visual contents. However, existing methods require manual\nmask annotation as input, limiting the application scenarios. In this paper, we\npropose a novel blind inpainting method that automatically reconstructs visual\ncontents within the corrupted regions without mask input as guidance. Our model\nincludes a blind reconstruction network and an object-aware discriminator for\nadversarial training. The reconstruction network contains two branches that\npredict corrupted regions in images and simultaneously restore the missing\nvisual contents. Leveraging the potent recognition capability of a dense object\ndetector, the object-aware discriminator ensures that markers are undetectable after\ninpainting. Thus, the restored images closely resemble the clean ones. We\nevaluate our method on three datasets of various medical imaging modalities,\nconfirming better performance over other state-of-the-art methods.\n","authors":["Xuechen Guo","Wenhao Hu","Chiming Ni","Wenhao Chai","Shiyan Li","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2303.15124v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23904v1","updated":"2024-10-31T13:06:29Z","published":"2024-10-31T13:06:29Z","title":"EZ-HOI: VLM Adaptation via Guided Prompt Learning for Zero-Shot HOI\n Detection","summary":" Detecting Human-Object Interactions (HOI) in zero-shot settings, where models\nmust handle unseen classes, poses significant challenges. Existing methods that\nrely on aligning visual encoders with large Vision-Language Models (VLMs) to\ntap into the extensive knowledge of VLMs require large, computationally\nexpensive models and encounter training difficulties. Adapting VLMs with prompt\nlearning offers an alternative to direct alignment. However, fine-tuning on\ntask-specific datasets often leads to overfitting to seen classes and\nsuboptimal performance on unseen classes, due to the absence of unseen class\nlabels. To address these challenges, we introduce a novel prompt learning-based\nframework for Efficient Zero-Shot HOI detection (EZ-HOI). First, we introduce\nLarge Language Model (LLM) and VLM guidance for learnable prompts, integrating\ndetailed HOI descriptions and visual semantics to adapt VLMs to HOI tasks.\nHowever, because training datasets contain seen-class labels alone, fine-tuning\nVLMs on such datasets tends to optimize learnable prompts for seen classes\ninstead of unseen ones.
Therefore, we design prompt learning for unseen classes\nusing information from related seen classes, with LLMs utilized to highlight\nthe differences between unseen and related seen classes. Quantitative\nevaluations on benchmark datasets demonstrate that our EZ-HOI achieves\nstate-of-the-art performance across various zero-shot settings with only 10.35%\nto 33.95% of the trainable parameters compared to existing methods. Code is\navailable at https://github.com/ChelsieLei/EZ-HOI.\n","authors":["Qinqian Lei","Bo Wang","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2410.23904v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2402.04857v4","updated":"2024-10-31T13:01:13Z","published":"2024-02-07T13:54:56Z","title":"Advancing Video Anomaly Detection: A Concise Review and a New Dataset","summary":" Video Anomaly Detection (VAD) finds widespread applications in security\nsurveillance, traffic monitoring, industrial monitoring, and healthcare.\nDespite extensive research efforts, there remains a lack of concise reviews\nthat provide insightful guidance for researchers. Such reviews would serve as\nquick references to grasp current challenges, research trends, and future\ndirections. In this paper, we present such a review, examining models and\ndatasets from various perspectives. We emphasize the critical relationship\nbetween model and dataset, where the quality and diversity of datasets\nprofoundly influence model performance, and dataset development adapts to the\nevolving needs of emerging approaches. Our review identifies practical issues,\nincluding the absence of comprehensive datasets with diverse scenarios. To\naddress this, we introduce a new dataset, Multi-Scenario Anomaly Detection\n(MSAD), comprising 14 distinct scenarios captured from various camera views.\nOur dataset has diverse motion patterns and challenging variations, such as\ndifferent lighting and weather conditions, providing a robust foundation for\ntraining superior models. We conduct an in-depth analysis of recent\nrepresentative models using MSAD and highlight its potential in addressing the\nchallenges of detecting anomalies across diverse and evolving surveillance\nscenarios. [Project website: https://msad-dataset.github.io/]\n","authors":["Liyun Zhu","Lei Wang","Arjun Raj","Tom Gedeon","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04857v4.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2407.08447v2","updated":"2024-10-31T12:58:08Z","published":"2024-07-11T12:41:32Z","title":"WildGaussians: 3D Gaussian Splatting in the Wild","summary":" While the field of 3D scene reconstruction is dominated by NeRFs due to their\nphotorealistic quality, 3D Gaussian Splatting (3DGS) has recently emerged,\noffering similar quality with real-time rendering speeds. However, both methods\nprimarily excel with well-controlled 3D scenes, while in-the-wild data -\ncharacterized by occlusions, dynamic objects, and varying illumination -\nremains challenging. NeRFs can adapt to such conditions easily through\nper-image embedding vectors, but 3DGS struggles due to its explicit\nrepresentation and lack of shared parameters. To address this, we introduce\nWildGaussians, a novel approach to handle occlusions and appearance changes\nwith 3DGS. By leveraging robust DINO features and integrating an appearance\nmodeling module within 3DGS, our method achieves state-of-the-art results. 
We\ndemonstrate that WildGaussians matches the real-time rendering speed of 3DGS\nwhile surpassing both 3DGS and NeRF baselines in handling in-the-wild data, all\nwithin a simple architectural framework.\n","authors":["Jonas Kulhanek","Songyou Peng","Zuzana Kukelova","Marc Pollefeys","Torsten Sattler"],"pdf_url":"https://arxiv.org/pdf/2407.08447v2.pdf","comment":"NeurIPS 2024; Project page: https://wild-gaussians.github.io/"},{"id":"http://arxiv.org/abs/2410.23891v1","updated":"2024-10-31T12:52:52Z","published":"2024-10-31T12:52:52Z","title":"AllClear: A Comprehensive Dataset and Benchmark for Cloud Removal in\n Satellite Imagery","summary":" Clouds in satellite imagery pose a significant challenge for downstream\napplications. A major challenge in current cloud removal research is the\nabsence of a comprehensive benchmark and a sufficiently large and diverse\ntraining dataset. To address this problem, we introduce the largest public\ndataset -- $\\textit{AllClear}$ for cloud removal, featuring 23,742 globally\ndistributed regions of interest (ROIs) with diverse land-use patterns,\ncomprising 4 million images in total. Each ROI includes complete temporal\ncaptures from the year 2022, with (1) multi-spectral optical imagery from\nSentinel-2 and Landsat 8/9, (2) synthetic aperture radar (SAR) imagery from\nSentinel-1, and (3) auxiliary remote sensing products such as cloud masks and\nland cover maps. We validate the effectiveness of our dataset by benchmarking\nperformance, demonstrating the scaling law -- the PSNR rises from $28.47$ to\n$33.87$ with $30\\times$ more data, and conducting ablation studies on the\ntemporal length and the importance of individual modalities. This dataset aims\nto provide comprehensive coverage of the Earth's surface and promote better\ncloud removal results.\n","authors":["Hangyu Zhou","Chia-Hsiang Kao","Cheng Perng Phoo","Utkarsh Mall","Bharath Hariharan","Kavita Bala"],"pdf_url":"https://arxiv.org/pdf/2410.23891v1.pdf","comment":"Accepted at NeurIPS 2024 Datasets and Benchmarks Track. Code and data\n available at https://allclear.cs.cornell.edu/"},{"id":"http://arxiv.org/abs/2410.10356v2","updated":"2024-10-31T12:49:09Z","published":"2024-10-14T10:17:24Z","title":"FasterDiT: Towards Faster Diffusion Transformers Training without\n Architecture Modification","summary":" Diffusion Transformers (DiT) have attracted significant attention in\nresearch. However, they suffer from a slow convergence rate. In this paper, we\naim to accelerate DiT training without any architectural modification. We\nidentify the following issues in the training process: firstly, certain\ntraining strategies do not consistently perform well across different data.\nSecondly, the effectiveness of supervision at specific timesteps is limited. In\nresponse, we propose the following contributions: (1) We introduce a new\nperspective for interpreting the failure of the strategies. Specifically, we\nslightly extend the definition of Signal-to-Noise Ratio (SNR) and suggest\nobserving the Probability Density Function (PDF) of SNR to understand the\nessence of the data robustness of the strategy. (2) We conduct numerous\nexperiments and report over one hundred experimental results to empirically\nsummarize a unified accelerating strategy from the perspective of PDF. (3) We\ndevelop a new supervision method that further accelerates the training process\nof DiT. Based on them, we propose FasterDiT, an exceedingly simple and\npracticable design strategy. 
With a few lines of code modifications, it achieves\n2.30 FID on ImageNet 256 resolution at 1000k iterations, which is comparable to\nDiT (2.27 FID) but 7 times faster in training.\n","authors":["Jingfeng Yao","Wang Cheng","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.10356v2.pdf","comment":"NeurIPS 2024 (poster); update to camera-ready version"},{"id":"http://arxiv.org/abs/2407.16430v2","updated":"2024-10-31T12:48:05Z","published":"2024-07-23T12:28:59Z","title":"Rethinking Out-of-Distribution Detection on Imbalanced Data Distribution","summary":" Detecting and rejecting unknown out-of-distribution (OOD) samples is critical\nfor deployed neural networks to avoid unreliable predictions. In real-world\nscenarios, however, the efficacy of existing OOD detection methods is often\nimpeded by the inherent imbalance of in-distribution (ID) data, which causes\nsignificant performance decline. Through statistical observations, we have\nidentified two common challenges faced by different OOD detectors:\nmisidentifying tail class ID samples as OOD, while erroneously predicting OOD\nsamples as head class from ID. To explain this phenomenon, we introduce a\ngeneralized statistical framework, termed ImOOD, to formulate the OOD detection\nproblem on imbalanced data distribution. Consequently, the theoretical analysis\nreveals that there exists a class-aware bias term between balanced and\nimbalanced OOD detection, which contributes to the performance gap. Building\nupon this finding, we present a unified training-time regularization technique\nto mitigate the bias and boost imbalanced OOD detectors across architecture\ndesigns. Our theoretically grounded method translates into consistent\nimprovements on the representative CIFAR10-LT, CIFAR100-LT, and ImageNet-LT\nbenchmarks against several state-of-the-art OOD detection approaches. Code is\navailable at https://github.com/alibaba/imood.\n","authors":["Kai Liu","Zhihang Fu","Sheng Jin","Chao Chen","Ze Chen","Rongxin Jiang","Fan Zhou","Yaowu Chen","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2407.16430v2.pdf","comment":"This paper has been accepted by NeurIPS 2024. Code is available at\n https://github.com/alibaba/imood"},{"id":"http://arxiv.org/abs/2408.12282v2","updated":"2024-10-31T12:30:46Z","published":"2024-08-22T10:34:01Z","title":"Subsurface Scattering for 3D Gaussian Splatting","summary":" 3D reconstruction and relighting of objects made from scattering materials\npresent a significant challenge due to the complex light transport beneath the\nsurface. 3D Gaussian Splatting introduced high-quality novel view synthesis at\nreal-time speeds. While 3D Gaussians efficiently approximate an object's\nsurface, they fail to capture the volumetric properties of subsurface\nscattering. We propose a framework for optimizing an object's shape together\nwith the radiance transfer field given multi-view OLAT (one light at a time)\ndata. Our method decomposes the scene into an explicit surface represented as\n3D Gaussians, with a spatially varying BRDF, and an implicit volumetric\nrepresentation of the scattering component. A learned incident light field\naccounts for shadowing. We optimize all parameters jointly via ray-traced\ndifferentiable rendering. Our approach enables material editing, relighting and\nnovel view synthesis at interactive rates. We show successful application on\nsynthetic data and introduce a newly acquired multi-view multi-light dataset of\nobjects in a light-stage setup.
Compared to previous work we achieve comparable\nor better results at a fraction of optimization and rendering time while\nenabling detailed control over material attributes. Project page\nhttps://sss.jdihlmann.com/\n","authors":["Jan-Niklas Dihlmann","Arjun Majumdar","Andreas Engelhardt","Raphael Braun","Hendrik P. A. Lensch"],"pdf_url":"https://arxiv.org/pdf/2408.12282v2.pdf","comment":"Project page: https://sss.jdihlmann.com/"},{"id":"http://arxiv.org/abs/2405.15688v2","updated":"2024-10-31T12:24:34Z","published":"2024-05-24T16:27:05Z","title":"UNION: Unsupervised 3D Object Detection using Object Appearance-based\n Pseudo-Classes","summary":" Unsupervised 3D object detection methods have emerged to leverage vast\namounts of data without requiring manual labels for training. Recent approaches\nrely on dynamic objects for learning to detect mobile objects but penalize the\ndetections of static instances during training. Multiple rounds of (self)\ntraining are used to add detected static instances to the set of training\ntargets; this procedure to improve performance is computationally expensive. To\naddress this, we propose the method UNION. We use spatial clustering and\nself-supervised scene flow to obtain a set of static and dynamic object\nproposals from LiDAR. Subsequently, object proposals' visual appearances are\nencoded to distinguish static objects in the foreground and background by\nselecting static instances that are visually similar to dynamic objects. As a\nresult, static and dynamic mobile objects are obtained together, and existing\ndetectors can be trained with a single training. In addition, we extend 3D\nobject discovery to detection by using object appearance-based cluster labels\nas pseudo-class labels for training object classification. We conduct extensive\nexperiments on the nuScenes dataset and increase the state-of-the-art\nperformance for unsupervised 3D object discovery, i.e. UNION more than doubles\nthe average precision to 38.4. The code is available at\ngithub.com/TedLentsch/UNION.\n","authors":["Ted Lentsch","Holger Caesar","Dariu M. Gavrila"],"pdf_url":"https://arxiv.org/pdf/2405.15688v2.pdf","comment":"NeurIPS'24"},{"id":"http://arxiv.org/abs/2312.10112v3","updated":"2024-10-31T12:19:37Z","published":"2023-12-15T09:09:25Z","title":"NM-FlowGAN: Modeling sRGB Noise without Paired Images using a Hybrid\n Approach of Normalizing Flows and GAN","summary":" Modeling and synthesizing real sRGB noise is crucial for various low-level\nvision tasks, such as building datasets for training image denoising systems.\nThe distribution of real sRGB noise is highly complex and affected by a\nmultitude of factors, making its accurate modeling extremely challenging.\nTherefore, recent studies have proposed methods that employ data-driven\ngenerative models, such as Generative Adversarial Networks (GAN) and\nNormalizing Flows. These studies achieve more accurate modeling of sRGB noise\ncompared to traditional noise modeling methods. However, there are performance\nlimitations due to the inherent characteristics of each generative model. To\naddress this issue, we propose NM-FlowGAN, a hybrid approach that exploits the\nstrengths of both GAN and Normalizing Flows. We combine pixel-wise noise\nmodeling networks based on Normalizing Flows and spatial correlation modeling\nnetworks based on GAN. 
Specifically, the pixel-wise noise modeling network\nleverages the high training stability of Normalizing Flows to capture noise\ncharacteristics that are affected by a multitude of factors, and the spatial\ncorrelation networks efficiently model pixel-to-pixel relationships. In\nparticular, unlike recent methods that rely on paired noisy images, our method\nsynthesizes noise using clean images and factors that affect noise\ncharacteristics, such as easily obtainable parameters like camera type and ISO\nsettings, making it applicable to various fields where obtaining noisy-clean\nimage pairs is not feasible. In our experiments, our NM-FlowGAN outperforms\nother baselines in the sRGB noise synthesis task. Moreover, the denoising\nneural network trained with synthesized image pairs from our model shows\nsuperior performance compared to other baselines. Our code is available at:\n\\url{https://github.com/YoungJooHan/NM-FlowGAN}.\n","authors":["Young Joo Han","Ha-Jin Yu"],"pdf_url":"https://arxiv.org/pdf/2312.10112v3.pdf","comment":"13 pages, 10 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.16022v2","updated":"2024-10-31T12:17:39Z","published":"2024-04-24T17:55:33Z","title":"PuLID: Pure and Lightning ID Customization via Contrastive Alignment","summary":" We propose Pure and Lightning ID customization (PuLID), a novel tuning-free\nID customization method for text-to-image generation. By incorporating a\nLightning T2I branch with a standard diffusion one, PuLID introduces both\ncontrastive alignment loss and accurate ID loss, minimizing disruption to the\noriginal model and ensuring high ID fidelity. Experiments show that PuLID\nachieves superior performance in both ID fidelity and editability. Another\nattractive property of PuLID is that the image elements (e.g., background,\nlighting, composition, and style) before and after the ID insertion are kept as\nconsistent as possible. Codes and models are available at\nhttps://github.com/ToTheBeginning/PuLID\n","authors":["Zinan Guo","Yanze Wu","Zhuowei Chen","Lang Chen","Peng Zhang","Qian He"],"pdf_url":"https://arxiv.org/pdf/2404.16022v2.pdf","comment":"NeurIPS 2024. Codes and models are available at\n https://github.com/ToTheBeginning/PuLID"},{"id":"http://arxiv.org/abs/2410.23854v1","updated":"2024-10-31T12:04:30Z","published":"2024-10-31T12:04:30Z","title":"Airway Labeling Meets Clinical Applications: Reflecting Topology\n Consistency and Outliers via Learnable Attentions","summary":" Accurate airway anatomical labeling is crucial for clinicians to identify and\nnavigate complex bronchial structures during bronchoscopy. Automatic airway\nanatomical labeling is challenging due to significant individual variability\nand anatomical variations. Previous methods are prone to generate inconsistent\npredictions, which is harmful for preoperative planning and intraoperative\nnavigation. This paper aims to address these challenges by proposing a novel\nmethod that enhances topological consistency and improves the detection of\nabnormal airway branches.\n We propose a novel approach incorporating two modules: the Soft Subtree\nConsistency (SSC) and the Abnormal Branch Saliency (ABS). The SSC module\nconstructs a soft subtree to capture clinically relevant topological\nrelationships, allowing for flexible feature aggregation within and across\nsubtrees. 
The ABS module facilitates the interaction between node features and\nprototypes to distinguish abnormal branches, preventing the erroneous\naggregation of features between normal and abnormal nodes.\n Evaluated on a challenging dataset characterized by severe airway distortion\nand atrophy, our method achieves superior performance compared to\nstate-of-the-art approaches. Specifically, it attains a 91.4% accuracy at the\nsegmental level and an 83.7% accuracy at the subsegmental level, representing a\n1.4% increase in subsegmental accuracy and a 3.1% increase in topological\nconsistency. Notably, the method demonstrates reliable performance in cases\nwith disease-induced airway deformities, ensuring consistent and accurate\nlabeling.\n","authors":["Chenyu Li","Minghui Zhang","Chuyan Zhang","Yun Gu"],"pdf_url":"https://arxiv.org/pdf/2410.23854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19370v2","updated":"2024-10-31T12:00:26Z","published":"2024-09-28T14:50:45Z","title":"MambaEviScrib: Mamba and Evidence-Guided Consistency Enhance CNN\n Robustness for Scribble-Based Weakly Supervised Ultrasound Image Segmentation","summary":" Segmenting anatomical structures and lesions from ultrasound images\ncontributes to disease assessment. Weakly supervised learning (WSL) based on\nsparse annotation has achieved encouraging performance and demonstrated the\npotential to reduce annotation costs. This study attempts to introduce\nscribble-based WSL into ultrasound image segmentation tasks. However,\nultrasound images often suffer from poor contrast and unclear edges, coupled\nwith insufficient supervision signals for edges, posing challenges to edge\nprediction. Uncertainty modeling has been proven to facilitate models in\ndealing with these issues. Nevertheless, existing uncertainty estimation\nparadigms are not robust enough and often filter out predictions near decision\nboundaries, resulting in unstable edge predictions. Therefore, we propose\nleveraging predictions near decision boundaries effectively. Specifically, we\nintroduce Dempster-Shafer Theory (DST) of evidence to design an Evidence-Guided\nConsistency (EGC) strategy. This strategy utilizes high-evidence predictions, which\nare more likely to occur near high-density regions, to guide the optimization\nof low-evidence predictions that may appear near decision boundaries.\nFurthermore, the diverse sizes and locations of lesions in ultrasound images\npose a challenge for CNNs with local receptive fields, as they struggle to\nmodel global information. Therefore, we introduce Visual Mamba based on\nstructured state space sequence models, which achieves long-range dependency\nwith linear computational complexity, and we construct a novel hybrid CNN-Mamba\nframework. During training, the CNN branch and the\nMamba branch in the proposed framework draw inspiration from each other based\non the EGC strategy. Experiments demonstrate the competitiveness of the\nproposed method.
Dataset and code will be available on\nhttps://github.com/GtLinyer/MambaEviScrib.\n","authors":["Xiaoxiang Han","Xinyu Li","Jiang Shang","Yiman Liu","Keyan Chen","Shugong Xu","Qiaohong Liu","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.19370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12490v2","updated":"2024-10-31T11:42:07Z","published":"2024-10-16T12:13:17Z","title":"Stabilize the Latent Space for Image Autoregressive Modeling: A Unified\n Perspective","summary":" Latent-based image generative models, such as Latent Diffusion Models (LDMs)\nand Mask Image Models (MIMs), have achieved notable success in image generation\ntasks. These models typically leverage reconstructive autoencoders like VQGAN\nor VAE to encode pixels into a more compact latent space and learn the data\ndistribution in the latent space instead of directly from pixels. However, this\npractice raises a pertinent question: Is it truly the optimal choice? In\nresponse, we begin with an intriguing observation: despite sharing the same\nlatent space, autoregressive models significantly lag behind LDMs and MIMs in\nimage generation. This finding contrasts sharply with the field of NLP, where\nthe autoregressive model GPT has established a commanding presence. To address\nthis discrepancy, we introduce a unified perspective on the relationship\nbetween latent space and generative models, emphasizing the stability of latent\nspace in image generative modeling. Furthermore, we propose a simple but\neffective discrete image tokenizer to stabilize the latent space for image\ngenerative modeling by applying K-Means on the latent features of\nself-supervised learning models. Experimental results show that image\nautoregressive modeling with our tokenizer (DiGIT) benefits both image\nunderstanding and image generation with the next token prediction principle,\nwhich is inherently straightforward for GPT models but challenging for other\ngenerative models. Remarkably, for the first time, a GPT-style autoregressive\nmodel for images outperforms LDMs, which also exhibits substantial improvement\nakin to GPT when scaling up model size. Our findings underscore the potential\nof an optimized latent space and the integration of discrete tokenization in\nadvancing the capabilities of image generative models. The code is available at\n\\url{https://github.com/DAMO-NLP-SG/DiGIT}.\n","authors":["Yongxin Zhu","Bocheng Li","Hang Zhang","Xin Li","Linli Xu","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2410.12490v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23836v1","updated":"2024-10-31T11:32:33Z","published":"2024-10-31T11:32:33Z","title":"Stereo-Talker: Audio-driven 3D Human Synthesis with Prior-Guided\n Mixture-of-Experts","summary":" This paper introduces Stereo-Talker, a novel one-shot audio-driven human\nvideo synthesis system that generates 3D talking videos with precise lip\nsynchronization, expressive body gestures, temporally consistent\nphoto-realistic quality, and continuous viewpoint control. The process follows\na two-stage approach. In the first stage, the system maps audio input to\nhigh-fidelity motion sequences, encompassing upper-body gestures and facial\nexpressions. To enrich motion diversity and authenticity, large language model\n(LLM) priors are integrated with text-aligned semantic audio features,\nleveraging LLMs' cross-modal generalization power to enhance motion quality. 
In\nthe second stage, we improve diffusion-based video generation models by\nincorporating a prior-guided Mixture-of-Experts (MoE) mechanism: a view-guided\nMoE focuses on view-specific attributes, while a mask-guided MoE enhances\nregion-based rendering stability. Additionally, a mask prediction module is\ndevised to derive human masks from motion data, enhancing the stability and\naccuracy of masks and enabling mask guiding during inference. We also introduce\na comprehensive human video dataset with 2,203 identities, covering diverse\nbody gestures and detailed annotations, facilitating broad generalization. The\ncode, data, and pre-trained models will be released for research purposes.\n","authors":["Xiang Deng","Youxin Pang","Xiaochen Zhao","Chao Xu","Lizhen Wang","Hongjiang Xiao","Shi Yan","Hongwen Zhang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14927v3","updated":"2024-10-31T11:32:19Z","published":"2024-06-21T07:37:17Z","title":"GIC: Gaussian-Informed Continuum for Physical Property Identification\n and Simulation","summary":" This paper studies the problem of estimating physical properties (system\nidentification) through visual observations. To facilitate geometry-aware\nguidance in physical property estimation, we introduce a novel hybrid framework\nthat leverages 3D Gaussian representation to not only capture explicit shapes\nbut also enable the simulated continuum to render object masks as 2D shape\nsurrogates during training. We propose a new dynamic 3D Gaussian framework\nbased on motion factorization to recover the object as 3D Gaussian point sets\nacross different time states. Furthermore, we develop a coarse-to-fine filling\nstrategy to generate the density fields of the object from the Gaussian\nreconstruction, allowing for the extraction of object continuums along with\ntheir surfaces and the integration of Gaussian attributes into these continuums.\nIn addition to the extracted object surfaces, the Gaussian-informed continuum\nalso enables the rendering of object masks during simulations, serving as\n2D-shape guidance for physical property estimation. Extensive experimental\nevaluations demonstrate that our pipeline achieves state-of-the-art performance\nacross multiple benchmarks and metrics. Additionally, we illustrate the\neffectiveness of the proposed method through real-world demonstrations,\nshowcasing its practical utility. Our project page is at\nhttps://jukgei.github.io/project/gic.\n","authors":["Junhao Cai","Yuji Yang","Weihao Yuan","Yisheng He","Zilong Dong","Liefeng Bo","Hui Cheng","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14927v3.pdf","comment":"21 pages, 8 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23835v1","updated":"2024-10-31T11:29:41Z","published":"2024-10-31T11:29:41Z","title":"Counterfactual MRI Data Augmentation using Conditional Denoising\n Diffusion Generative Models","summary":" Deep learning (DL) models in medical imaging face challenges in\ngeneralizability and robustness due to variations in image acquisition\nparameters (IAP). In this work, we introduce a novel method using conditional\ndenoising diffusion generative models (cDDGMs) to generate counterfactual\nmagnetic resonance (MR) images that simulate different IAP without altering\npatient anatomy.
We demonstrate that using these counterfactual images for data\naugmentation can improve segmentation accuracy, particularly in\nout-of-distribution settings, enhancing the overall generalizability and\nrobustness of DL models across diverse imaging conditions. Our approach shows\npromise in addressing domain and covariate shifts in medical imaging. The code\nis publicly available at\nhttps://github.com/pedromorao/Counterfactual-MRI-Data-Augmentation\n","authors":["Pedro Morão","Joao Santinha","Yasna Forghani","Nuno Loução","Pedro Gouveia","Mario A. T. Figueiredo"],"pdf_url":"https://arxiv.org/pdf/2410.23835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07966v3","updated":"2024-10-31T11:25:40Z","published":"2024-09-12T11:53:05Z","title":"ProbTalk3D: Non-Deterministic Emotion Controllable Speech-Driven 3D\n Facial Animation Synthesis Using VQ-VAE","summary":" Audio-driven 3D facial animation synthesis has been an active field of\nresearch with attention from both academia and industry. While there are\npromising results in this area, recent approaches largely focus on lip-sync and\nidentity control, neglecting the role of emotions and emotion control in the\ngenerative process. That is mainly due to the lack of emotionally rich facial\nanimation data and algorithms that can synthesize speech animations with\nemotional expressions at the same time. In addition, the majority of models are\ndeterministic, meaning given the same audio input, they produce the same output\nmotion. We argue that emotions and non-determinism are crucial to generate\ndiverse and emotionally-rich facial animations. In this paper, we propose\nProbTalk3D, a non-deterministic neural network approach for emotion controllable\nspeech-driven 3D facial animation synthesis using a two-stage VQ-VAE model and\nan emotionally rich facial animation dataset 3DMEAD. We provide an extensive\ncomparative analysis of our model against the recent 3D facial animation\nsynthesis approaches, by evaluating the results objectively, qualitatively, and\nwith a perceptual user study. We highlight several objective metrics that are\nmore suitable for evaluating stochastic outputs and use both in-the-wild and\nground truth data for subjective evaluation. To our knowledge, this is the\nfirst non-deterministic 3D facial animation synthesis method incorporating a\nrich emotion dataset and emotion control with emotion labels and intensity\nlevels. Our evaluation demonstrates that the proposed model achieves superior\nperformance compared to state-of-the-art emotion-controlled, deterministic and\nnon-deterministic models. We recommend watching the supplementary video for\nquality judgement. The entire codebase is publicly available\n(https://github.com/uuembodiedsocialai/ProbTalk3D/).\n","authors":["Sichun Wu","Kazi Injamamul Haque","Zerrin Yumak"],"pdf_url":"https://arxiv.org/pdf/2409.07966v3.pdf","comment":"14 pages, 9 figures, 3 tables. Includes code. Accepted at ACM\n SIGGRAPH MIG 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.12036v2","updated":"2024-10-31T23:08:03Z","published":"2024-08-21T23:42:06Z","title":"Reasoning and Tools for Human-Level Forecasting","summary":" Language models (LMs) trained on web-scale datasets are largely successful\ndue to their ability to memorize large amounts of training data, even if only\npresent in a few examples.
These capabilities are often desirable in evaluation\non tasks such as question answering but raise questions about whether these\nmodels can exhibit genuine reasoning or succeed only at mimicking patterns from\nthe training data. This distinction is particularly salient in forecasting\ntasks, where the answer is not present in the training data, and the model must\nreason to make logical deductions. We present Reasoning and Tools for\nForecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can\ndynamically retrieve updated information and run numerical simulation with\nequipped tools. We evaluate our model with questions from competitive\nforecasting platforms and demonstrate that our method is competitive with and\ncan outperform human predictions. This suggests that LMs, with the right tools,\ncan indeed think and adapt like humans, offering valuable insights for\nreal-world decision-making.\n","authors":["Elvis Hsieh","Preston Fu","Jonathan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14094v2","updated":"2024-10-31T21:26:01Z","published":"2024-07-19T07:58:26Z","title":"User-Creator Feature Polarization in Recommender Systems with Dual\n Influence","summary":" Recommender systems serve the dual purpose of presenting relevant content to\nusers and helping content creators reach their target audience. The dual nature\nof these systems naturally influences both users and creators: users'\npreferences are affected by the items they are recommended, while creators may\nbe incentivized to alter their content to attract more users. We define a\nmodel, called user-creator feature dynamics, to capture the dual influence of\nrecommender systems. We prove that a recommender system with dual influence is\nguaranteed to polarize, causing diversity loss in the system. We then\ninvestigate, both theoretically and empirically, approaches for mitigating\npolarization and promoting diversity in recommender systems. Unexpectedly, we\nfind that common diversity-promoting approaches do not work in the presence of\ndual influence, while relevancy-optimizing methods like top-$k$ truncation can\nprevent polarization and improve diversity of the system.\n","authors":["Tao Lin","Kun Jin","Andrew Estornell","Xiaoying Zhang","Yiling Chen","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.14094v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.08571v2","updated":"2024-10-31T20:30:51Z","published":"2024-07-11T14:59:17Z","title":"Multi-Group Proportional Representation in Retrieval","summary":" Image search and retrieval tasks can perpetuate harmful stereotypes, erase\ncultural identities, and amplify social disparities. Current approaches to\nmitigate these representational harms balance the number of retrieved items\nacross population groups defined by a small number of (often binary)\nattributes. However, most existing methods overlook intersectional groups\ndetermined by combinations of group attributes, such as gender, race, and\nethnicity. We introduce Multi-Group Proportional Representation (MPR), a novel\nmetric that measures representation across intersectional groups. We develop\npractical methods for estimating MPR, provide theoretical guarantees, and\npropose optimization algorithms to ensure MPR in retrieval. We demonstrate that\nexisting methods optimizing for equal and proportional representation metrics\nmay fail to promote MPR. 
Crucially, our work shows that optimizing MPR yields\nmore proportional representation across multiple intersectional groups\nspecified by a rich function class, often with minimal compromise in retrieval\naccuracy.\n","authors":["Alex Oesterling","Claudio Mayrink Verdun","Carol Xuan Long","Alexander Glynn","Lucas Monteiro Paes","Sajani Vithana","Martina Cardone","Flavio P. Calmon"],"pdf_url":"https://arxiv.org/pdf/2407.08571v2.pdf","comment":"48 pages, 33 figures. Accepted as poster at NeurIPS 2024. Code can be\n found at\n https://github.com/alex-oesterling/multigroup-proportional-representation"},{"id":"http://arxiv.org/abs/2403.19302v3","updated":"2024-10-31T19:37:07Z","published":"2024-03-28T10:40:22Z","title":"Generating Multi-Aspect Queries for Conversational Search","summary":" Conversational information seeking (CIS) systems aim to model the user's\ninformation need within the conversational context and retrieve the relevant\ninformation. One major approach to modeling the conversational context aims to\nrewrite the user utterance in the conversation to represent the information\nneed independently. Recent work has shown the benefit of expanding the\nrewritten utterance with relevant terms. In this work, we hypothesize that\nbreaking down the information of an utterance into multi-aspect rewritten\nqueries can lead to more effective retrieval performance. This is more evident\nin more complex utterances that require gathering evidence from various\ninformation sources, where a single query rewrite or query representation\ncannot capture the complexity of the utterance. To test this hypothesis, we\nconduct extensive experiments on five widely used CIS datasets where we\nleverage LLMs to generate multi-aspect queries to represent the information\nneed for each utterance in multiple query rewrites. We show that, for most of\nthe utterances, the same retrieval model would perform better with more than\none rewritten query by 85% in terms of nDCG@3. We further propose a\nmulti-aspect query generation and retrieval framework, called MQ4CS. Our\nextensive experiments show that MQ4CS outperforms the state-of-the-art query\nrewriting methods. We make our code and our new dataset of generated\nmulti-aspect queries publicly available.\n","authors":["Zahra Abbasiantaeb","Simon Lupart","Mohammad Aliannejadi"],"pdf_url":"https://arxiv.org/pdf/2403.19302v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13959v2","updated":"2024-10-31T18:38:37Z","published":"2024-10-17T18:34:43Z","title":"FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven\n Question Answering Pipeline","summary":" Financial decision-making hinges on the analysis of relevant information\nembedded in the enormous volume of documents in the financial domain. To\naddress this challenge, we developed FinQAPT, an end-to-end pipeline that\nstreamlines the identification of relevant financial reports based on a query,\nextracts pertinent context, and leverages Large Language Models (LLMs) to\nperform downstream tasks. To evaluate the pipeline, we experimented with\nvarious techniques to optimize the performance of each module using the FinQA\ndataset. We introduced a novel clustering-based negative sampling technique to\nenhance context extraction and a novel prompting method called Dynamic N-shot\nPrompting to boost the numerical question-answering capabilities of LLMs. At\nthe module level, we achieved state-of-the-art accuracy on FinQA, attaining an\naccuracy of 80.6%. 
However, at the pipeline level, we observed decreased\nperformance due to challenges in extracting relevant context from financial\nreports. We conducted a detailed error analysis of each module and the\nend-to-end pipeline, pinpointing specific challenges that must be addressed to\ndevelop a robust solution for handling complex financial tasks.\n","authors":["Kuldeep Singh","Simerjot Kaur","Charese Smiley"],"pdf_url":"https://arxiv.org/pdf/2410.13959v2.pdf","comment":"Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2410.24200v1","updated":"2024-10-31T17:55:36Z","published":"2024-10-31T17:55:36Z","title":"Length-Induced Embedding Collapse in Transformer-based Models","summary":" Text embeddings enable various applications, but their performance\ndeteriorates on longer texts. In this paper, we find that the performance\ndegradation is due to a phenomenon called Length Collapse, where longer text\nembeddings collapse into a narrow space. This collapse results in a\ndistributional inconsistency between embeddings of different text lengths,\nultimately hurting the performance of downstream tasks. Theoretically, by\nconsidering that the self-attention mechanism inherently functions as a low-pass\nfilter, we prove that long sequences increase the attenuation rate of the\nlow-pass filter effect of the self-attention mechanism. With layers going\ndeeper, excessive low-pass filtering causes the token signals to retain only\ntheir Direct-Current (DC) component, which means the input token feature maps\nwill collapse into a narrow space, especially in long texts. Based on the above\nanalysis, we propose to mitigate the undesirable length collapse limitation by\nintroducing a temperature in softmax(), which achieves a higher low-pass filter\nattenuation rate. The tuning-free method, called TempScale, can be plugged into\nmultiple transformer-based embedding models. Empirically, we demonstrate that\nTempScale can improve existing embedding models, especially on long text\ninputs, bringing up to 0.53% performance gains on 40 datasets from the Massive Text\nEmbedding Benchmark (MTEB) and 0.82% performance gains on 4 datasets from\nLongEmbed, which specifically focuses on long-context retrieval.\n","authors":["Yuqi Zhou","Sunhao Dai","Zhanshuo Cao","Xiao Zhang","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19073v2","updated":"2024-10-31T15:58:23Z","published":"2024-05-29T13:31:12Z","title":"An engine not a camera: Measuring performative power of online search","summary":" The power of digital platforms is at the center of major ongoing policy and\nregulatory efforts. To advance existing debates, we designed and executed an\nexperiment to measure the performative power of online search providers.\nInstantiated in our setting, performative power quantifies the ability of a\nsearch engine to steer web traffic by rearranging results. To operationalize\nthis definition, we developed a browser extension that performs unassuming\nrandomized experiments in the background. These randomized experiments emulate\nupdates to the search algorithm and identify the causal effect of different\ncontent arrangements on clicks. Analyzing tens of thousands of clicks, we\ndiscuss what our robust quantitative findings say about the power of online\nsearch engines, using the Google Shopping antitrust investigation as a case\nstudy.
More broadly, we envision our work to serve as a blueprint for how the\nrecent definition of performative power can help integrate quantitative\ninsights from online experiments with future investigations into the economic\npower of digital platforms.\n","authors":["Celestine Mendler-Dünner","Gabriele Carovano","Moritz Hardt"],"pdf_url":"https://arxiv.org/pdf/2405.19073v2.pdf","comment":"to appear at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23879v1","updated":"2024-10-31T12:40:38Z","published":"2024-10-31T12:40:38Z","title":"Investigating Bias in Political Search Query Suggestions by Relative\n Comparison with LLMs","summary":" Search query suggestions affect users' interactions with search engines,\nwhich then influences the information they encounter. Thus, bias in search\nquery suggestions can lead to exposure to biased search results and can impact\nopinion formation. This is especially critical in the political domain.\nDetecting and quantifying bias in web search engines is difficult due to its\ntopic dependency, complexity, and subjectivity. The lack of context and\nphrasality of query suggestions emphasizes this problem. In a multi-step\napproach, we combine the benefits of large language models, pairwise\ncomparison, and Elo-based scoring to identify and quantify bias in English\nsearch query suggestions. We apply our approach to the U.S. political news\ndomain and compare bias in Google and Bing.\n","authors":["Fabian Haak","Björn Engelmann","Christin Katharina Kreutz","Philipp Schaer"],"pdf_url":"https://arxiv.org/pdf/2410.23879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23851v1","updated":"2024-10-31T12:01:51Z","published":"2024-10-31T12:01:51Z","title":"Leveraging Large Language Models for Medical Information Extraction and\n Query Generation","summary":" This paper introduces a system that integrates large language models (LLMs)\ninto the clinical trial retrieval process, enhancing the effectiveness of\nmatching patients with eligible trials while maintaining information privacy\nand allowing expert oversight. We evaluate six LLMs for query generation,\nfocusing on open-source and relatively small models that require minimal\ncomputational resources. Our evaluation includes two closed-source and four\nopen-source models, with one specifically trained in the medical field and five\ngeneral-purpose models. We compare the retrieval effectiveness achieved by\nLLM-generated queries against those created by medical experts and\nstate-of-the-art methods from the literature. Our findings indicate that the\nevaluated models reach retrieval effectiveness on par with or greater than\nexpert-created queries. The LLMs consistently outperform standard baselines and\nother approaches in the literature. The best performing LLMs exhibit fast\nresponse times, ranging from 1.7 to 8 seconds, and generate a manageable number\nof query terms (15-63 on average), making them suitable for practical\nimplementation. 
Our overall findings suggest that leveraging small, open-source\nLLMs for clinical trials retrieval can balance performance, computational\nefficiency, and real-world applicability in medical settings.\n","authors":["Georgios Peikos","Pranav Kasela","Gabriella Pasi"],"pdf_url":"https://arxiv.org/pdf/2410.23851v1.pdf","comment":"Accepted in WI-IAT '24"},{"id":"http://arxiv.org/abs/2410.23842v1","updated":"2024-10-31T11:49:16Z","published":"2024-10-31T11:49:16Z","title":"Auditing Google's Search Algorithm: Measuring News Diversity Across\n Brazil, the UK, and the US","summary":" This study examines the influence of Google's search algorithm on news\ndiversity by analyzing search results in Brazil, the UK, and the US. It\nexplores how Google's system preferentially favors a limited number of news\noutlets. Utilizing algorithm auditing techniques, the research measures source\nconcentration with the Herfindahl-Hirschman Index (HHI) and Gini coefficient,\nrevealing significant concentration trends. The study underscores the\nimportance of conducting horizontal analyses across multiple search queries, as\nfocusing solely on individual results pages may obscure these patterns. Factors\nsuch as popularity, political bias, and recency were evaluated for their impact\non news rankings. Findings indicate a slight leftward bias in search outcomes\nand a preference for popular, often national outlets. This bias, combined with\na tendency to prioritize recent content, suggests that Google's algorithm may\nreinforce existing media inequalities. By analyzing the largest dataset to date\n-- 221,863 search results -- this research provides comprehensive, longitudinal\ninsights into how algorithms shape public access to diverse news sources.\n","authors":["Raphael Hernandes","Giulio Corsi"],"pdf_url":"https://arxiv.org/pdf/2410.23842v1.pdf","comment":"21 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2410.23841v1","updated":"2024-10-31T11:47:21Z","published":"2024-10-31T11:47:21Z","title":"Beyond Content Relevance: Evaluating Instruction Following in Retrieval\n Models","summary":" Instruction-following capabilities in large language models (LLMs) have\nsignificantly progressed, enabling more complex user interactions through\ndetailed prompts. However, retrieval systems have not matched these advances,\nwith most still relying on traditional lexical and semantic matching\ntechniques that fail to fully capture user intent. Recent efforts have\nintroduced instruction-aware retrieval models, but these primarily focus on\nintrinsic content relevance, which neglects the importance of customized\npreferences for broader document-level attributes. This study evaluates the\ninstruction-following capabilities of various retrieval models beyond content\nrelevance, including LLM-based dense retrieval and reranking models. We develop\nInfoSearch, a novel retrieval evaluation benchmark spanning six document-level\nattributes: Audience, Keyword, Format, Language, Length, and Source, and\nintroduce novel metrics -- Strict Instruction Compliance Ratio (SICR) and\nWeighted Instruction Sensitivity Evaluation (WISE) to accurately assess the\nmodels' responsiveness to instructions. Our findings reveal that while\nreranking models generally surpass retrieval models in instruction following,\nthey still face challenges in handling certain attributes. 
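The news-diversity audit above quantifies source concentration with the Herfindahl-Hirschman Index and the Gini coefficient; both have standard closed forms, sketched here with hypothetical per-outlet result tallies.

```python
import numpy as np

def hhi(counts):
    """HHI: sum of squared shares; 1/n for an even split, 1.0 for a monopoly."""
    shares = np.asarray(counts, dtype=float)
    shares /= shares.sum()
    return float((shares ** 2).sum())

def gini(counts):
    """Gini coefficient: 0 for perfect equality, approaching 1 for full concentration."""
    x = np.sort(np.asarray(counts, dtype=float))
    n = x.size
    cum = np.cumsum(x)
    return float((n + 1 - 2 * cum.sum() / cum[-1]) / n)

results_per_outlet = [120, 80, 30, 10, 5]  # hypothetical counts aggregated across queries
print(f"HHI={hhi(results_per_outlet):.3f}  Gini={gini(results_per_outlet):.3f}")
```

Computing these over tallies pooled across many queries, rather than per results page, is exactly the "horizontal analysis" the entry argues for.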
Moreover, although\ninstruction fine-tuning and increased model size lead to better performance,\nmost models fall short of achieving comprehensive instruction compliance as\nassessed by our benchmark.\n","authors":["Jianqun Zhou","Yuanlei Zheng","Wei Chen","Qianqian Zheng","Zeyuan Shang","Wei Zhang","Rui Meng","Xiaoyu Shen"],"pdf_url":"https://arxiv.org/pdf/2410.23841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21488v2","updated":"2024-10-31T11:45:00Z","published":"2024-07-31T09:52:53Z","title":"Breaking the Hourglass Phenomenon of Residual Quantization: Enhancing\n the Upper Bound of Generative Retrieval","summary":" Generative retrieval (GR) has emerged as a transformative paradigm in search\nand recommender systems, leveraging numeric-based identifier representations to\nenhance efficiency and generalization. Notably, methods like TIGER, employing\nResidual Quantization-based Semantic Identifiers (RQ-SID), have shown\nsignificant promise in e-commerce scenarios by effectively managing item IDs.\nHowever, a critical issue, termed the \"\\textbf{Hourglass}\" phenomenon, occurs in\nRQ-SID, where intermediate codebook tokens become overly concentrated,\nhindering the full utilization of generative retrieval methods. This paper\nanalyses and addresses this problem by identifying data sparsity and\nlong-tailed distribution as the primary causes. Through comprehensive\nexperiments and detailed ablation studies, we analyze the impact of these\nfactors on codebook utilization and data distribution. Our findings reveal that\nthe \"Hourglass\" phenomenon substantially impacts the performance of RQ-SID in\ngenerative retrieval. We propose effective solutions to mitigate this issue,\nthereby significantly enhancing the effectiveness of generative retrieval in\nreal-world E-commerce applications.\n","authors":["Zhirui Kuai","Zuxu Chen","Huimu Wang","Mingming Li","Dadong Miao","Binbin Wang","Xusong Chen","Li Kuang","Yuxing Han","Jiaxing Wang","Guoyu Tang","Lin Liu","Songlin Wang","Jingwei Zhuo"],"pdf_url":"https://arxiv.org/pdf/2407.21488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15187v2","updated":"2024-10-31T10:10:28Z","published":"2024-06-21T14:29:39Z","title":"UDA: A Benchmark Suite for Retrieval Augmented Generation in Real-world\n Document Analysis","summary":" The use of Retrieval-Augmented Generation (RAG) has improved Large Language\nModels (LLMs) in collaborating with external data, yet significant challenges\nexist in real-world scenarios. In areas such as academic literature and finance\nquestion answering, data are often found in raw text and tables in HTML or PDF\nformats, which can be lengthy and highly unstructured. In this paper, we\nintroduce a benchmark suite, namely Unstructured Document Analysis (UDA), that\ninvolves 2,965 real-world documents and 29,590 expert-annotated Q&A pairs. We\nrevisit popular LLM- and RAG-based solutions for document analysis and evaluate\nthe design choices and answer qualities across multiple document domains and\ndiverse query types. Our evaluation yields interesting findings and highlights\nthe importance of data parsing and retrieval. We hope our benchmark can shed\nlight on and better serve real-world document analysis applications. 
The benchmark\nsuite and code can be found at https://github.com/qinchuanhui/UDA-Benchmark.\n","authors":["Yulong Hui","Yao Lu","Huanchen Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.15187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23757v1","updated":"2024-10-31T09:24:22Z","published":"2024-10-31T09:24:22Z","title":"Identify Then Recommend: Towards Unsupervised Group Recommendation","summary":" Group Recommendation (GR), which aims to recommend items to groups of users,\nhas become a promising and practical direction for recommendation systems. This\npaper points out two issues of the state-of-the-art GR models. (1) The\npre-defined and fixed number of user groups is inadequate for real-time\nindustrial recommendation systems, where the group distribution can shift\ndynamically. (2) The training schema of existing GR methods is supervised,\nnecessitating expensive user-group and group-item labels, leading to\nsignificant annotation costs. To this end, we present a novel unsupervised\ngroup recommendation framework named \\underline{I}dentify \\underline{T}hen\n\\underline{R}ecommend (\\underline{ITR}), where it first identifies the user\ngroups in an unsupervised manner even without the pre-defined number of groups,\nand then two pre-text tasks are designed to conduct self-supervised group\nrecommendation. Concretely, at the group identification stage, we first\nestimate the adaptive density of each user point, where areas with higher\ndensities are more likely to be recognized as group centers. Then, a heuristic\nmerge-and-split strategy is designed to discover the user groups and decision\nboundaries. Subsequently, at the self-supervised learning stage, the\npull-and-repulsion pre-text task is proposed to optimize the user-group\ndistribution. Besides, the pseudo group recommendation pre-text task is\ndesigned to assist the recommendations. Extensive experiments demonstrate the\nsuperiority and effectiveness of ITR on both user recommendation (e.g., 22.22\\%\nNDCG@5 $\\uparrow$) and group recommendation (e.g., 22.95\\% NDCG@5 $\\uparrow$).\nFurthermore, we deploy ITR on the industrial recommender and achieve promising\nresults.\n","authors":["Yue Liu","Shihao Zhu","Tianyuan Yang","Jian Ma","Wenliang Zhong"],"pdf_url":"https://arxiv.org/pdf/2410.23757v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2401.05975v4","updated":"2024-10-31T09:14:56Z","published":"2024-01-11T15:22:55Z","title":"End-to-end Learnable Clustering for Intent Learning in Recommendation","summary":" Intent learning, which aims to learn users' intents for user understanding\nand item recommendation, has become a hot research spot in recent years.\nHowever, existing methods suffer from complex and cumbersome alternating\noptimization, limiting performance and scalability. To this end, we propose a\nnovel intent learning method termed \\underline{ELCRec}, by unifying behavior\nrepresentation learning into an \\underline{E}nd-to-end \\underline{L}earnable\n\\underline{C}lustering framework, for effective and efficient\n\\underline{Rec}ommendation. Concretely, we encode user behavior sequences and\ninitialize the cluster centers (latent intents) as learnable neurons. Then, we\ndesign a novel learnable clustering module to separate different cluster\ncenters, thus decoupling users' complex intents. Meanwhile, it guides the\nnetwork to learn intents from behaviors by forcing behavior embeddings close to\ncluster centers. 
This allows simultaneous optimization of recommendation and\nclustering via mini-batch data. Moreover, we propose intent-assisted\ncontrastive learning by using cluster centers as self-supervision signals,\nfurther enhancing mutual promotion. Both experimental results and theoretical\nanalyses demonstrate the superiority of ELCRec from six perspectives. Compared\nto the runner-up, ELCRec improves NDCG@5 by 8.9\\% and reduces computational\ncosts by 22.5\\% on the Beauty dataset. Furthermore, due to the scalability and\nuniversal applicability, we deploy this method on the industrial recommendation\nsystem with 130 million page views and achieve promising results. The codes are\navailable on GitHub (https://github.com/yueliu1999/ELCRec). A collection\n(papers, codes, datasets) of deep group recommendation/intent learning methods\nis available on GitHub\n(https://github.com/yueliu1999/Awesome-Deep-Group-Recommendation).\n","authors":["Yue Liu","Shihao Zhu","Jun Xia","Yingwei Ma","Jian Ma","Xinwang Liu","Shengju Yu","Kejun Zhang","Wenliang Zhong"],"pdf_url":"https://arxiv.org/pdf/2401.05975v4.pdf","comment":"37 pages"},{"id":"http://arxiv.org/abs/2410.23736v1","updated":"2024-10-31T08:49:05Z","published":"2024-10-31T08:49:05Z","title":"MoTaDual: Modality-Task Dual Alignment for Enhanced Zero-shot Composed\n Image Retrieval","summary":" Composed Image Retrieval (CIR) is a challenging vision-language task,\nutilizing bi-modal (image+text) queries to retrieve target images. Despite the\nimpressive performance of supervised CIR, the dependence on costly,\nmanually-labeled triplets limits its scalability and zero-shot capability. To\naddress this issue, zero-shot composed image retrieval (ZS-CIR) is presented\nalong with projection-based approaches. However, such methods face two major\nproblems, i.e., task discrepancy between pre-training (image $\\leftrightarrow$\ntext) and inference (image+text $\\rightarrow$ image), and modality discrepancy.\nThe latter pertains to approaches based on text-only projection training due to\nthe necessity of feature extraction from the reference image during inference.\nIn this paper, we propose a two-stage framework to tackle both discrepancies.\nFirst, to ensure efficiency and scalability, a textual inversion network is\npre-trained on large-scale caption datasets. Subsequently, we put forward\nModality-Task Dual Alignment (MoTaDual) as the second stage, where\nlarge-language models (LLMs) generate triplet data for fine-tuning, and\nadditionally, prompt learning is introduced in a multi-modal context to\neffectively alleviate both modality and task discrepancies. The experimental\nresults show that our MoTaDual achieves the state-of-the-art performance across\nfour widely used ZS-CIR benchmarks, while maintaining low training time and\ncomputational cost. The code will be released soon.\n","authors":["Haiwen Li","Fei Su","Zhicheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.23736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00072v5","updated":"2024-10-31T08:35:42Z","published":"2024-06-21T08:52:11Z","title":"Pistis-RAG: Enhancing Retrieval-Augmented Generation with Human Feedback","summary":" RAG systems face limitations when semantic relevance alone does not guarantee\nimproved generation quality. This issue becomes particularly evident due to the\nsensitivity of large language models (LLMs) to the ordering of few-shot\nprompts, which can affect model performance. 
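The ELCRec entry above initializes cluster centers "as learnable neurons" so that clustering and recommendation train jointly over mini-batches; a small PyTorch module can make that concrete. The specific pull and separation losses below are illustrative readings of the abstract, not the paper's exact objectives.

```python
import torch
import torch.nn.functional as F

class LearnableIntentClusters(torch.nn.Module):
    """Cluster centers as ordinary parameters, so the clustering objective
    can be optimized end-to-end alongside a recommendation loss."""

    def __init__(self, num_intents, dim):
        super().__init__()
        self.centers = torch.nn.Parameter(torch.randn(num_intents, dim))

    def forward(self, behavior_emb):
        z = F.normalize(behavior_emb, dim=-1)
        c = F.normalize(self.centers, dim=-1)
        sim = z @ c.t()                                  # (batch, num_intents)
        pull = (1.0 - sim.max(dim=-1).values).mean()     # pull embeddings to nearest center
        cc = c @ c.t()
        off_diag = cc - torch.diag_embed(torch.diagonal(cc))
        separation = off_diag.mean()                     # push distinct centers apart
        return pull + separation

clusters = LearnableIntentClusters(num_intents=8, dim=64)
loss = clusters(torch.randn(32, 64))                     # added to the recommendation loss
loss.backward()
```

Because the centers are parameters rather than the output of a separate clustering pass, there is no alternating optimization, which is the bottleneck the entry says ELCRec removes.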
To address this challenge,\naligning LLM outputs with human preferences using structured feedback, such as\noptions to copy, regenerate, or dislike, offers a promising method for\nimprovement. This feedback is applied to the entire list of inputs rather than\ngiving specific ratings for individual documents, making it a Listwide Labels\nLearning-to-Rank task.\n To address this task, we propose Pistis-RAG, a new RAG framework designed\nwith a content-centric approach to better align LLMs with human preferences.\nPistis-RAG effectively utilizes human feedback, enhancing content ranking and\ngeneration quality. To validate our framework, we use public datasets to\nsimulate human feedback, allowing us to evaluate and refine our method\neffectively. Experimental results indicate that Pistis-RAG improves alignment\nwith human preferences relative to the baseline RAG system, showing a 6.06%\nincrease in MMLU (English) and a 7.08% increase in C-EVAL (Chinese) accuracy\nmetrics. These results highlight Pistis-RAG's effectiveness in overcoming the\nlimitations associated with traditional RAG approaches.\n","authors":["Yu Bai","Yukai Miao","Li Chen","Dawei Wang","Dan Li","Yanyu Ren","Hongtao Xie","Ce Yang","Xuhui Cai"],"pdf_url":"https://arxiv.org/pdf/2407.00072v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23715v1","updated":"2024-10-31T08:03:13Z","published":"2024-10-31T08:03:13Z","title":"Towards Cross-Modal Text-Molecule Retrieval with Better Modality\n Alignment","summary":" Cross-modal text-molecule retrieval models aim to learn a shared feature\nspace of the text and molecule modalities for accurate similarity calculation,\nwhich facilitates the rapid screening of molecules with specific properties and\nactivities in drug design. However, previous works have two main defects.\nFirst, they are inadequate in capturing modality-shared features considering\nthe significant gap between text sequences and molecule graphs. Second, they\nmainly rely on contrastive learning and adversarial training for cross-modality\nalignment, both of which mainly focus on the first-order similarity, ignoring\nthe second-order similarity that can capture more structural information in the\nembedding space. To address these issues, we propose a novel cross-modal\ntext-molecule retrieval model with two-fold improvements. Specifically, on top\nof two modality-specific encoders, we stack a memory bank-based feature\nprojector that contains learnable memory vectors to extract modality-shared\nfeatures better. More importantly, during the model training, we calculate four\nkinds of similarity distributions (text-to-text, text-to-molecule,\nmolecule-to-molecule, and molecule-to-text similarity distributions) for each\ninstance, and then minimize the distance between these similarity distributions\n(namely second-order similarity losses) to enhance cross-modal alignment.\nExperimental results and analysis strongly demonstrate the effectiveness of our\nmodel. 
Particularly, our model achieves SOTA performance, outperforming the\npreviously-reported best result by 6.4%.\n","authors":["Jia Song","Wanru Zhuang","Yujie Lin","Liang Zhang","Chunyan Li","Jinsong Su","Song He","Xiaochen Bo"],"pdf_url":"https://arxiv.org/pdf/2410.23715v1.pdf","comment":"BIBM 2024 regular paper"},{"id":"http://arxiv.org/abs/2408.11611v3","updated":"2024-10-31T07:09:38Z","published":"2024-08-21T13:39:21Z","title":"DTN: Deep Multiple Task-specific Feature Interactions Network for\n Multi-Task Recommendation","summary":" Neural-based multi-task learning (MTL) has been successfully applied to many\nrecommendation applications. However, these MTL models (e.g., MMoE, PLE) did\nnot consider feature interaction during the optimization, which is crucial for\ncapturing complex high-order features and has been widely used in ranking\nmodels for real-world recommender systems. Moreover, through feature importance\nanalysis across various tasks in MTL, we have observed an interesting\ndivergence phenomenon that the same feature can have significantly different\nimportance across different tasks in MTL. To address these issues, we propose\nDeep Multiple Task-specific Feature Interactions Network (DTN) with a novel\nmodel structure design. DTN introduces multiple diversified task-specific\nfeature interaction methods and a task-sensitive network in MTL networks,\nenabling the model to learn task-specific diversified feature interaction\nrepresentations, which improves the efficiency of joint representation learning\nin a general setup. We applied DTN to our company's real-world E-commerce\nrecommendation dataset, which consisted of over 6.3 billion samples; the\nresults demonstrated that DTN significantly outperformed state-of-the-art MTL\nmodels. Moreover, during online evaluation of DTN in a large-scale E-commerce\nrecommender system, we observed a 3.28% increase in clicks, a 3.10% increase in orders,\nand a 2.70% increase in GMV (Gross Merchandise Value) compared to the\nstate-of-the-art MTL models. Finally, extensive offline experiments conducted\non public benchmark datasets demonstrate that DTN can be applied to various\nscenarios beyond recommendations, enhancing the performance of ranking models.\n","authors":["Yaowen Bi","Yuteng Lian","Jie Cui","Jun Liu","Peijian Wang","Guanghui Li","Xuejun Chen","Jinglin Zhao","Hao Wen","Jing Zhang","Zhaoqi Zhang","Wenzhuo Song","Yang Sun","Weiwei Zhang","Mingchen Cai","Jian Dong","Guanxing Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11611v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11821v3","updated":"2024-10-31T05:19:58Z","published":"2024-02-19T04:29:45Z","title":"Microstructures and Accuracy of Graph Recall by Large Language Models","summary":" Graph data is crucial for many applications, and much of it exists in the\nrelations described in textual format. As a result, being able to accurately\nrecall and encode a graph described in earlier text is a basic yet pivotal\nability that LLMs need to demonstrate if they are to perform reasoning tasks\nthat involve graph-structured information. Human performance at graph recall\nhas been studied by cognitive scientists for decades, and has been found to\noften exhibit certain structural patterns of bias that align with human\nhandling of social relationships. To date, however, we know little about how\nLLMs behave in analogous graph recall tasks: do their recalled graphs also\nexhibit certain biased patterns, and if so, how do they compare with humans and\naffect other graph reasoning tasks? 
In this work, we perform the first\nsystematic study of graph recall by LLMs, investigating the accuracy and\nbiased microstructures (local structural patterns) in their recall. We find\nthat LLMs not only often underperform in graph recall, but also tend to favor\nmore triangles and alternating 2-paths. Moreover, we find that more advanced\nLLMs have a striking dependence on the domain that a real-world graph comes\nfrom -- by yielding the best recall accuracy when the graph is narrated in a\nlanguage style consistent with its original domain.\n","authors":["Yanbang Wang","Hejie Cui","Jon Kleinberg"],"pdf_url":"https://arxiv.org/pdf/2402.11821v3.pdf","comment":"Accepted at NeurIPS 2024; Code available at:\n https://github.com/Abel0828/llm-graph-recall"},{"id":"http://arxiv.org/abs/2307.02147v4","updated":"2024-10-31T02:54:38Z","published":"2023-07-05T09:42:51Z","title":"Recommendation Unlearning via Influence Function","summary":" Recommendation unlearning is an emerging task that serves users by erasing\nunusable data (e.g., some historical behaviors) from a well-trained recommender\nmodel. Existing methods process unlearning requests by fully or partially\nretraining the model after removing the unusable data. However, these methods\nare impractical due to the high computation cost of full retraining and the\nlikely performance damage of partial retraining. In this light, a\ndesired recommendation unlearning method should obtain a similar model as full\nretraining in a more efficient manner, i.e., achieving complete, efficient and\nharmless unlearning.\n In this work, we propose a new Influence Function-based Recommendation\nUnlearning (IFRU) framework, which efficiently updates the model without\nretraining by estimating the influence of the unusable data on the model via\nthe influence function. Given that recent recommender models use\nhistorical data for both the construction of the optimization loss and the\ncomputational graph (e.g., neighborhood aggregation), IFRU jointly estimates\nthe direct influence of unusable data on optimization loss and the spillover\ninfluence on the computational graph to pursue complete unlearning.\nFurthermore, we propose an importance-based pruning algorithm to reduce the\ncost of the influence function. IFRU is harmless and applicable to mainstream\ndifferentiable models. Extensive experiments demonstrate that IFRU achieves\nmore than 250 times acceleration compared to retraining-based methods with\nrecommendation performance comparable to full retraining. 
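The graph-recall entry above measures both accuracy and microstructural bias (e.g., too many triangles). A sketch of that comparison with networkx; parsing the LLM's recalled edge list is assumed to happen upstream, and the toy edge lists are hypothetical.

```python
import networkx as nx

def recall_microstructure(true_edges, recalled_edges):
    """Compare an LLM-recalled graph against the ground truth: edge-level F1
    plus the triangle counts whose inflation the paper reports as a bias."""
    G_true, G_rec = nx.Graph(true_edges), nx.Graph(recalled_edges)
    t = set(map(frozenset, true_edges))          # undirected edges as frozensets
    r = set(map(frozenset, recalled_edges))
    tp = len(t & r)
    precision = tp / max(len(r), 1)
    recall = tp / max(len(t), 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-9)
    tri = lambda G: sum(nx.triangles(G).values()) // 3
    return {"f1": f1, "triangles_true": tri(G_true), "triangles_recalled": tri(G_rec)}

true_edges = [("a", "b"), ("b", "c"), ("c", "d")]
recalled = [("a", "b"), ("b", "c"), ("a", "c")]   # hallucinated edge closes a triangle
print(recall_microstructure(true_edges, recalled))
```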
Code is available at\nhttps://github.com/baiyimeng/IFRU.\n","authors":["Yang Zhang","Zhiyu Hu","Yimeng Bai","Jiancan Wu","Qifan Wang","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2307.02147v4.pdf","comment":"Accepted by ACM TORS"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2405.00065v3","updated":"2024-10-31T23:57:04Z","published":"2024-04-27T06:19:30Z","title":"From Linear to Linearizable Optimization: A Novel Framework with\n Applications to Stationary and Non-stationary DR-submodular Optimization","summary":" This paper introduces the notion of upper-linearizable/quadratizable\nfunctions, a class that extends concavity and DR-submodularity in various\nsettings, including monotone and non-monotone cases over different convex sets.\nA general meta-algorithm is devised to convert algorithms for linear/quadratic\nmaximization into ones that optimize upper-linearizable/quadratizable\nfunctions, offering a unified approach to tackling concave and DR-submodular\noptimization problems. The paper extends these results to multiple feedback\nsettings, facilitating conversions between semi-bandit/first-order feedback and\nbandit/zeroth-order feedback, as well as between first/zeroth-order feedback\nand semi-bandit/bandit feedback. Leveraging this framework, new algorithms are\nderived using existing results as base algorithms for convex optimization,\nimproving upon state-of-the-art results in various cases. Dynamic and adaptive\nregret guarantees are obtained for DR-submodular maximization, marking the\nfirst algorithms to achieve such guarantees in these settings. Notably, the\npaper achieves these advancements with fewer assumptions compared to existing\nstate-of-the-art results, underscoring its broad applicability and theoretical\ncontributions to non-convex optimization.\n","authors":["Mohammad Pedramfar","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2405.00065v3.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2409.07510v3","updated":"2024-10-31T23:50:54Z","published":"2024-09-11T17:58:39Z","title":"Still More Shades of Null: An Evaluation Suite for Responsible Missing\n Value Imputation","summary":" Data missingness is a practical challenge of sustained interest to the\nscientific community. In this paper, we present Shades-of-Null, an evaluation\nsuite for responsible missing value imputation. Our work is novel in two ways:\n(i) we model realistic and socially-salient missingness scenarios that go\nbeyond Rubin's classic Missing Completely at Random (MCAR), Missing At Random\n(MAR) and Missing Not At Random (MNAR) settings, to include multi-mechanism\nmissingness (when different missingness patterns co-exist in the data) and\nmissingness shift (when the missingness mechanism changes between training and\ntest), and (ii) we evaluate imputers holistically, based on imputation quality, as\nwell as on the predictive performance, fairness and stability of the models\nthat are trained and tested on the data post-imputation.\n We use Shades-of-Null to conduct a large-scale empirical study involving\n23,940 experimental pipelines, and find that while there is no single\nbest-performing imputation approach for all missingness types, interesting\ntrade-offs arise between predictive performance, fairness and stability, based\non the combination of missingness scenario, imputer choice, and the\narchitecture of the predictive model. 
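The IFRU entry above updates a trained model without retraining via the influence function. The classical one-shot form is theta <- theta + H^{-1} g, with g the gradient on the data to forget and H the Hessian of the full training loss; a toy sketch follows. The explicit Hessian limits this to small parameter vectors, sign and scaling conventions vary across formulations, and the paper's spillover estimation through the computational graph and its importance-based pruning are omitted.

```python
import torch

def influence_unlearn(theta, loss_all_fn, loss_forget_fn, damping=1e-3):
    """One-shot influence-function update: remove the estimated influence
    of the forgotten data via a damped Newton step."""
    g = torch.autograd.grad(loss_forget_fn(theta), theta)[0]
    H = torch.autograd.functional.hessian(loss_all_fn, theta)
    H = H + damping * torch.eye(theta.numel())   # damping keeps the solve stable
    return (theta + torch.linalg.solve(H, g)).detach()

# toy quadratic "training" and "forget" losses over 3 parameters
theta = torch.randn(3, requires_grad=True)
loss_all = lambda t: (t ** 2).sum()
loss_forget = lambda t: ((t - 1.0) ** 2).sum()
theta_unlearned = influence_unlearn(theta, loss_all, loss_forget)
```

At scale, the Hessian solve is typically replaced by Hessian-vector products with conjugate gradients or stochastic estimation, which is where pruning strategies like IFRU's become important.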
We make Shades-of-Null publicly\navailable, to enable researchers to rigorously evaluate missing value\nimputation methods on a wide range of metrics in plausible and socially\nmeaningful scenarios.\n","authors":["Falaah Arif Khan","Denys Herasymuk","Nazar Protsiv","Julia Stoyanovich"],"pdf_url":"https://arxiv.org/pdf/2409.07510v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17708v2","updated":"2024-10-31T23:40:16Z","published":"2024-05-27T23:51:20Z","title":"OPERA: Automatic Offline Policy Evaluation with Re-weighted Aggregates\n of Multiple Estimators","summary":" Offline policy evaluation (OPE) allows us to evaluate and estimate a new\nsequential decision-making policy's performance by leveraging historical\ninteraction data collected from other policies. Evaluating a new policy online\nwithout a confident estimate of its performance can lead to costly, unsafe, or\nhazardous outcomes, especially in education and healthcare. Several OPE\nestimators have been proposed in the last decade, many of which have\nhyperparameters and require training. Unfortunately, it remains unclear how to choose the best OPE\nalgorithm for each task and domain. In this paper, we propose\na new algorithm that adaptively blends a set of OPE estimators given a dataset\nwithout relying on an explicit selection using a statistical procedure. We\nprove that our estimator is consistent and satisfies several desirable\nproperties for policy evaluation. Additionally, we demonstrate that when\ncompared to alternative approaches, our estimator can be used to select\nhigher-performing policies in healthcare and robotics. Our work contributes to\nimproving ease of use for a general-purpose, estimator-agnostic, off-policy\nevaluation framework for offline RL.\n","authors":["Allen Nie","Yash Chandak","Christina J. Yuan","Anirudhan Badrinath","Yannis Flet-Berliac","Emma Brunskill"],"pdf_url":"https://arxiv.org/pdf/2405.17708v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2402.00347v2","updated":"2024-10-31T23:37:11Z","published":"2024-02-01T05:28:28Z","title":"Diverse Explanations From Data-Driven and Domain-Driven Perspectives in\n the Physical Sciences","summary":" Machine learning methods have been remarkably successful in material science,\nproviding novel scientific insights, guiding future laboratory experiments, and\naccelerating materials discovery. Despite the promising performance of these\nmodels, understanding the decisions they make is also essential to ensure the\nscientific value of their outcomes. However, there is a recent and ongoing\ndebate about the diversity of explanations, which potentially leads to\nscientific inconsistency. This Perspective explores the sources and\nimplications of these diverse explanations in ML applications for physical\nsciences. Through three case studies in materials science and molecular\nproperty prediction, we examine how different models, explanation methods,\nlevels of feature attribution, and stakeholder needs can result in varying\ninterpretations of ML outputs. Our analysis underscores the importance of\nconsidering multiple perspectives when interpreting ML models in scientific\ncontexts and highlights the critical need for scientists to maintain control\nover the interpretation process, balancing data-driven insights with domain\nexpertise to meet specific scientific needs. 
By fostering a comprehensive\nunderstanding of these inconsistencies, we aim to contribute to the responsible\nintegration of eXplainable Artificial Intelligence (XAI) into physical sciences\nand improve the trustworthiness of ML applications in scientific discovery.\n","authors":["Sichao Li","Xin Wang","Amanda Barnard"],"pdf_url":"https://arxiv.org/pdf/2402.00347v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08199v3","updated":"2024-10-31T23:34:57Z","published":"2024-03-13T02:53:52Z","title":"Deep Submodular Peripteral Networks","summary":" Submodular functions, crucial for various applications, often lack practical\nlearning methods for their acquisition. Seemingly unrelated, learning a scaling\nfrom oracles offering graded pairwise preferences (GPC) is underexplored,\ndespite a rich history in psychometrics. In this paper, we introduce deep\nsubmodular peripteral networks (DSPNs), a novel parametric family of submodular\nfunctions, and methods for their training using a GPC-based strategy to connect\nand then tackle both of the above challenges. We introduce a newly devised\nGPC-style ``peripteral'' loss which leverages numerically graded relationships\nbetween pairs of objects (sets in our case). Unlike traditional contrastive\nlearning, or RLHF preference ranking, our method utilizes graded comparisons,\nextracting more nuanced information than just binary-outcome comparisons, and\ncontrasts sets of any size (not just two). We also define a novel suite of\nautomatic sampling strategies for training, including active-learning inspired\nsubmodular feedback. We demonstrate DSPNs' efficacy in learning submodularity\nfrom a costly target submodular function and demonstrate its superiority both\nfor experimental design and online streaming applications.\n","authors":["Gantavya Bhatt","Arnav Das","Jeff Bilmes"],"pdf_url":"https://arxiv.org/pdf/2403.08199v3.pdf","comment":"Accepted at NeurIPS 2024 as spotlight presentation"},{"id":"http://arxiv.org/abs/2402.05274v2","updated":"2024-10-31T23:14:00Z","published":"2024-02-07T21:43:57Z","title":"Convergence for Natural Policy Gradient on Infinite-State Queueing MDPs","summary":" A wide variety of queueing systems can be naturally modeled as infinite-state\nMarkov Decision Processes (MDPs). In the reinforcement learning (RL) context, a\nvariety of algorithms have been developed to learn and optimize these MDPs. At\nthe heart of many popular policy-gradient based learning algorithms, such as\nnatural actor-critic, TRPO, and PPO, lies the Natural Policy Gradient (NPG)\npolicy optimization algorithm. Convergence results for these RL algorithms rest\non convergence results for the NPG algorithm. However, all existing results on\nthe convergence of the NPG algorithm are limited to finite-state settings.\n We study a general class of queueing MDPs, and prove an $O(1/\\sqrt{T})$\nconvergence rate for the NPG algorithm, if the NPG algorithm is initialized\nwith the MaxWeight policy. This is the first convergence rate bound for the NPG\nalgorithm for a general class of infinite-state average-reward MDPs. Moreover,\nour result applies beyond the queueing setting to any countably-infinite\nMDP satisfying certain mild structural assumptions, given a sufficiently good\ninitial policy. Key to our result are state-dependent bounds on the relative\nvalue function achieved by the iterate policies of the NPG algorithm.\n","authors":["Isaac Grosof","Siva Theja Maguluri","R. 
Srikant"],"pdf_url":"https://arxiv.org/pdf/2402.05274v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2404.13344v2","updated":"2024-10-31T23:12:29Z","published":"2024-04-20T10:44:13Z","title":"GRANOLA: Adaptive Normalization for Graph Neural Networks","summary":" In recent years, significant efforts have been made to refine the design of\nGraph Neural Network (GNN) layers, aiming to overcome diverse challenges, such\nas limited expressive power and oversmoothing. Despite their widespread\nadoption, the incorporation of off-the-shelf normalization layers like\nBatchNorm or InstanceNorm within a GNN architecture may not effectively capture\nthe unique characteristics of graph-structured data, potentially reducing the\nexpressive power of the overall architecture. Moreover, existing graph-specific\nnormalization layers often struggle to offer substantial and consistent\nbenefits. In this paper, we propose GRANOLA, a novel graph-adaptive\nnormalization layer. Unlike existing normalization layers, GRANOLA normalizes\nnode features by adapting to the specific characteristics of the graph,\nparticularly by generating expressive representations of its neighborhood\nstructure, obtained by leveraging the propagation of Random Node Features (RNF)\nin the graph. We present theoretical results that support our design choices.\nOur extensive empirical evaluation of various graph benchmarks underscores the\nsuperior performance of GRANOLA over existing normalization techniques.\nFurthermore, GRANOLA emerges as the top-performing method among all baselines\nwithin the same time complexity of Message Passing Neural Networks (MPNNs).\n","authors":["Moshe Eliasof","Beatrice Bevilacqua","Carola-Bibiane Schönlieb","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2404.13344v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.12036v2","updated":"2024-10-31T23:08:03Z","published":"2024-08-21T23:42:06Z","title":"Reasoning and Tools for Human-Level Forecasting","summary":" Language models (LMs) trained on web-scale datasets are largely successful\ndue to their ability to memorize large amounts of training data, even if only\npresent in a few examples. These capabilities are often desirable in evaluation\non tasks such as question answering but raise questions about whether these\nmodels can exhibit genuine reasoning or succeed only at mimicking patterns from\nthe training data. This distinction is particularly salient in forecasting\ntasks, where the answer is not present in the training data, and the model must\nreason to make logical deductions. We present Reasoning and Tools for\nForecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can\ndynamically retrieve updated information and run numerical simulation with\nequipped tools. We evaluate our model with questions from competitive\nforecasting platforms and demonstrate that our method is competitive with and\ncan outperform human predictions. 
This suggests that LMs, with the right tools,\ncan indeed think and adapt like humans, offering valuable insights for\nreal-world decision-making.\n","authors":["Elvis Hsieh","Preston Fu","Jonathan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11697v2","updated":"2024-10-31T22:55:21Z","published":"2024-09-18T04:36:05Z","title":"Monomial Matrix Group Equivariant Neural Functional Networks","summary":" Neural functional networks (NFNs) have recently gained significant attention\ndue to their diverse applications, ranging from predicting network\ngeneralization and network editing to classifying implicit neural\nrepresentation. Previous NFN designs often depend on permutation symmetries in\nneural networks' weights, which traditionally arise from the unordered\narrangement of neurons in hidden layers. However, these designs do not take\ninto account the weight scaling symmetries of $\\ReLU$ networks, and the weight\nsign flipping symmetries of $\\sin$ or $\\Tanh$ networks. In this paper, we\nextend the study of the group action on the network weights from the group of\npermutation matrices to the group of monomial matrices by incorporating\nscaling/sign-flipping symmetries. Particularly, we encode these\nscaling/sign-flipping symmetries by designing our corresponding equivariant and\ninvariant layers. We name our new family of NFNs the Monomial Matrix Group\nEquivariant Neural Functional Networks (Monomial-NFN). Because of the expansion\nof the symmetries, Monomial-NFN has much fewer independent trainable parameters\ncompared to the baseline NFNs in the literature, thus enhancing the model's\nefficiency. Moreover, for fully connected and convolutional neural networks, we\ntheoretically prove that all groups that leave these networks invariant while\nacting on their weight spaces are some subgroups of the monomial matrix group.\nWe provide empirical evidence to demonstrate the advantages of our model over\nexisting baselines, achieving competitive performance and efficiency.\n","authors":["Hoang V. Tran","Thieu N. Vo","Tho H. Tran","An T. Nguyen","Tan M. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.11697v2.pdf","comment":"10 pages in the main text. Published at NeurIPS 2024. The code is\n available at https://github.com/MathematicalAI-NUS/Monomial-NFN"},{"id":"http://arxiv.org/abs/2405.15282v2","updated":"2024-10-31T22:29:59Z","published":"2024-05-24T07:11:42Z","title":"Prompt Tuning Strikes Back: Customizing Foundation Models with Low-Rank\n Prompt Adaptation","summary":" Parameter-Efficient Fine-Tuning (PEFT) has become the standard for\ncustomising Foundation Models (FMs) to user-specific downstream tasks. However,\ntypical PEFT methods require storing multiple task-specific adapters, creating\nscalability issues as these adapters must be housed and run at the FM server.\nTraditional prompt tuning offers a potential solution by customising them\nthrough task-specific input prefixes, but it under-performs compared to other\nPEFT methods like LoRA. To address this gap, we propose Low-Rank Prompt\nAdaptation (LoPA), a prompt-tuning-based approach that performs on par with\nstate-of-the-art PEFT methods and full fine-tuning while being more\nparameter-efficient and not requiring a server-based adapter. LoPA generates\nsoft prompts by balancing between sharing task-specific information across\ninstances and customization for each instance. 
It uses a low-rank decomposition\nof the soft-prompt component encoded for each instance to achieve parameter\nefficiency. We provide a comprehensive evaluation on multiple natural language\nunderstanding and code generation and understanding tasks across a wide range\nof foundation models with varying sizes.\n","authors":["Abhinav Jain","Swarat Chaudhuri","Thomas Reps","Chris Jermaine"],"pdf_url":"https://arxiv.org/pdf/2405.15282v2.pdf","comment":"14 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.01318v5","updated":"2024-10-31T22:26:40Z","published":"2024-03-28T02:44:02Z","title":"JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large\n Language Models","summary":" Jailbreak attacks cause large language models (LLMs) to generate harmful,\nunethical, or otherwise objectionable content. Evaluating these attacks\npresents a number of challenges, which the current collection of benchmarks and\nevaluation techniques do not adequately address. First, there is no clear\nstandard of practice regarding jailbreaking evaluation. Second, existing works\ncompute costs and success rates in incomparable ways. And third, numerous works\nare not reproducible, as they withhold adversarial prompts, involve\nclosed-source code, or rely on evolving proprietary APIs. To address these\nchallenges, we introduce JailbreakBench, an open-sourced benchmark with the\nfollowing components: (1) an evolving repository of state-of-the-art\nadversarial prompts, which we refer to as jailbreak artifacts; (2) a\njailbreaking dataset comprising 100 behaviors -- both original and sourced from\nprior work (Zou et al., 2023; Mazeika et al., 2023, 2024) -- which align with\nOpenAI's usage policies; (3) a standardized evaluation framework at\nhttps://github.com/JailbreakBench/jailbreakbench that includes a clearly\ndefined threat model, system prompts, chat templates, and scoring functions;\nand (4) a leaderboard at https://jailbreakbench.github.io/ that tracks the\nperformance of attacks and defenses for various LLMs. We have carefully\nconsidered the potential ethical implications of releasing this benchmark, and\nbelieve that it will be a net positive for the community.\n","authors":["Patrick Chao","Edoardo Debenedetti","Alexander Robey","Maksym Andriushchenko","Francesco Croce","Vikash Sehwag","Edgar Dobriban","Nicolas Flammarion","George J. Pappas","Florian Tramer","Hamed Hassani","Eric Wong"],"pdf_url":"https://arxiv.org/pdf/2404.01318v5.pdf","comment":"The camera-ready version of JailbreakBench v1.0 (accepted at NeurIPS\n 2024 Datasets and Benchmarks Track): more attack artifacts, more test-time\n defenses, a more accurate jailbreak judge (Llama-3-70B with a custom prompt),\n a larger dataset of human preferences for selecting a jailbreak judge (300\n examples), an over-refusal evaluation dataset, a semantic refusal judge based\n on Llama-3-8B"},{"id":"http://arxiv.org/abs/2403.00177v3","updated":"2024-10-31T21:56:09Z","published":"2024-02-29T23:04:42Z","title":"Med-Real2Sim: Non-Invasive Medical Digital Twins using Physics-Informed\n Self-Supervised Learning","summary":" A digital twin is a virtual replica of a real-world physical phenomenon that\nuses mathematical modeling to characterize and simulate its defining features.\nBy constructing digital twins for disease processes, we can perform in-silico\nsimulations that mimic patients' health conditions and counterfactual outcomes\nunder hypothetical interventions in a virtual setting. 
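The LoPA entry above builds each instance's soft prompt from a shared component plus a low-rank, instance-encoded component. A sketch of that decomposition; the gating combination, shapes, and hyperparameters are assumptions for illustration, not the paper's exact design.

```python
import torch

class LowRankSoftPrompt(torch.nn.Module):
    """Soft prompt = shared part composed with a per-instance part whose
    (m x d) matrix is the low-rank product U @ V, keeping the per-instance
    parameter count small."""

    def __init__(self, d_model=768, m_tokens=10, rank=4, d_inst=768):
        super().__init__()
        self.shared = torch.nn.Parameter(0.02 * torch.randn(m_tokens, d_model))
        self.to_u = torch.nn.Linear(d_inst, m_tokens * rank)
        self.to_v = torch.nn.Linear(d_inst, rank * d_model)
        self.m, self.r, self.d = m_tokens, rank, d_model

    def forward(self, inst_emb):                     # inst_emb: (batch, d_inst)
        b = inst_emb.shape[0]
        U = self.to_u(inst_emb).view(b, self.m, self.r)
        V = self.to_v(inst_emb).view(b, self.r, self.d)
        gate = torch.sigmoid(U @ V)                  # low-rank, instance-specific
        return self.shared.unsqueeze(0) * gate       # (batch, m_tokens, d_model)

prompt = LowRankSoftPrompt()
soft_prompt = prompt(torch.randn(2, 768))            # prepended to the FM's input embeddings
```

Because the prompt is computed on the client side from the instance, nothing adapter-like needs to be stored or run at the foundation-model server, which is the deployment advantage the entry emphasizes.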
This eliminates the need\nfor invasive procedures or uncertain treatment decisions. In this paper, we\npropose a method to identify digital twin model parameters using only\nnoninvasive patient health data. We approach the digital twin modeling as a\ncomposite inverse problem, and observe that its structure resembles pretraining\nand finetuning in self-supervised learning (SSL). Leveraging this, we introduce\na physics-informed SSL algorithm that initially pretrains a neural network on\nthe pretext task of learning a differentiable simulator of a physiological\nprocess. Subsequently, the model is trained to reconstruct physiological\nmeasurements from noninvasive modalities while being constrained by the\nphysical equations learned in pretraining. We apply our method to identify\ndigital twins of cardiac hemodynamics using noninvasive echocardiogram videos,\nand demonstrate its utility in unsupervised disease detection and in-silico\nclinical trials.\n","authors":["Keying Kuang","Frances Dean","Jack B. Jedlicki","David Ouyang","Anthony Philippakis","David Sontag","Ahmed M. Alaa"],"pdf_url":"https://arxiv.org/pdf/2403.00177v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09721v4","updated":"2024-10-31T21:48:37Z","published":"2024-02-15T05:30:47Z","title":"Generalized Principal-Agent Problem with a Learning Agent","summary":" Classic principal-agent problems such as Stackelberg games, contract design,\nand Bayesian persuasion, often assume that the agent is able to best respond to\nthe principal's committed strategy. We study repeated generalized\nprincipal-agent problems under the assumption that the principal does not have\ncommitment power and the agent uses algorithms to learn to respond to the\nprincipal. We reduce this problem to a one-shot generalized principal-agent\nproblem where the agent approximately best responds. Using this reduction, we\nshow that: (1) If the agent uses contextual no-regret learning algorithms with\nregret $\\mathrm{Reg}(T)$, then the principal can guarantee utility at least\n$U^* - \\Theta\\big(\\sqrt{\\tfrac{\\mathrm{Reg}(T)}{T}}\\big)$, where $U^*$ is the\nprincipal's optimal utility in the classic model with a best-responding agent.\n(2) If the agent uses contextual no-swap-regret learning algorithms with\nswap-regret $\\mathrm{SReg}(T)$, then the principal cannot obtain utility more\nthan $U^* + O(\\frac{\\mathrm{SReg(T)}}{T})$. But (3) if the agent uses\nmean-based learning algorithms (which can be no-regret but not no-swap-regret),\nthen the principal can sometimes do significantly better than $U^*$. These\nresults not only refine previous results in Stackelberg games and contract\ndesign, but also lead to new results for Bayesian persuasion with a learning\nagent and all generalized principal-agent problems where the agent does not\nhave private information.\n","authors":["Tao Lin","Yiling Chen"],"pdf_url":"https://arxiv.org/pdf/2402.09721v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22296v2","updated":"2024-10-31T21:46:13Z","published":"2024-10-29T17:45:57Z","title":"LLMs are Highly-Constrained Biophysical Sequence Optimizers","summary":" Large language models (LLMs) have recently shown significant potential in\nvarious biological tasks such as protein engineering and molecule design. These\ntasks typically involve black-box discrete sequence optimization, where the\nchallenge lies in generating sequences that are not only biologically feasible\nbut also adhere to hard fine-grained constraints. 
However, LLMs often struggle\nwith such constraints, especially in biological contexts where verifying\ncandidate solutions is costly and time-consuming. In this study, we explore the\npossibility of employing LLMs as highly-constrained bilevel optimizers through\na methodology we refer to as Language Model Optimization with Margin\nExpectation (LLOME). This approach combines both offline and online\noptimization, utilizing limited oracle evaluations to iteratively enhance the\nsequences generated by the LLM. We additionally propose a novel training\nobjective -- Margin-Aligned Expectation (MargE) -- that trains the LLM to\nsmoothly interpolate between the reward and reference distributions. Lastly, we\nintroduce a synthetic test suite that bears strong geometric similarity to real\nbiophysical problems and enables rapid evaluation of LLM optimizers without\ntime-consuming lab validation. Our findings reveal that, in comparison to\ngenetic algorithm baselines, LLMs achieve significantly lower regret solutions\nwhile requiring fewer test function evaluations. However, we also observe that\nLLMs exhibit moderate miscalibration, are susceptible to generator collapse,\nand have difficulty finding the optimal solution when no explicit ground truth\nrewards are available.\n","authors":["Angelica Chen","Samuel D. Stanton","Robert G. Alberstein","Andrew M. Watkins","Richard Bonneau","Vladimir Gligorijević","Kyunghyun Cho","Nathan C. Frey"],"pdf_url":"https://arxiv.org/pdf/2410.22296v2.pdf","comment":"Supercedes arXiv:2407.00236v1"},{"id":"http://arxiv.org/abs/2410.09614v2","updated":"2024-10-31T21:43:04Z","published":"2024-10-12T18:28:56Z","title":"Exploring Behavior-Relevant and Disentangled Neural Dynamics with\n Generative Diffusion Models","summary":" Understanding the neural basis of behavior is a fundamental goal in\nneuroscience. Current research in large-scale neuro-behavioral data analysis\noften relies on decoding models, which quantify behavioral information in\nneural data but lack details on behavior encoding. This raises an intriguing\nscientific question: ``how can we enable in-depth exploration of neural\nrepresentations in behavioral tasks, revealing interpretable neural dynamics\nassociated with behaviors''. However, addressing this issue is challenging due\nto the varied behavioral encoding across different brain regions and mixed\nselectivity at the population level. To tackle this limitation, our approach,\nnamed ``BeNeDiff'', first identifies a fine-grained and disentangled neural\nsubspace using a behavior-informed latent variable model. It then employs\nstate-of-the-art generative diffusion models to synthesize behavior videos that\ninterpret the neural dynamics of each latent factor. We validate the method on\nmulti-session datasets containing widefield calcium imaging recordings across\nthe dorsal cortex. Through guiding the diffusion model to activate individual\nlatent factors, we verify that the neural dynamics of latent factors in the\ndisentangled neural subspace provide interpretable quantifications of the\nbehaviors of interest. 
At the same time, the neural subspace in BeNeDiff\ndemonstrates high disentanglement and neural reconstruction quality.\n","authors":["Yule Wang","Chengrui Li","Weihan Li","Anqi Wu"],"pdf_url":"https://arxiv.org/pdf/2410.09614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14755v3","updated":"2024-10-31T21:33:37Z","published":"2024-05-23T16:21:57Z","title":"Large language models can be zero-shot anomaly detectors for time\n series?","summary":" Recent studies have shown the ability of large language models to perform a\nvariety of tasks, including time series forecasting. The flexible nature of\nthese models allows them to be used for many applications. In this paper, we\npresent a novel study of large language models used for the challenging task of\ntime series anomaly detection. This problem entails two aspects novel for LLMs:\nthe need for the model to identify part of the input sequence (or multiple\nparts) as anomalous; and the need for it to work with time series data rather\nthan the traditional text input. We introduce sigllm, a framework for time\nseries anomaly detection using large language models. Our framework includes a\ntime-series-to-text conversion module, as well as end-to-end pipelines that\nprompt language models to perform time series anomaly detection. We investigate\ntwo paradigms for testing the abilities of large language models to perform the\ndetection task. First, we present a prompt-based detection method that directly\nasks a language model to indicate which elements of the input are anomalies.\nSecond, we leverage the forecasting capability of a large language model to\nguide the anomaly detection process. We evaluated our framework on 11 datasets\nspanning various sources and 10 pipelines. We show that the forecasting method\nsignificantly outperformed the prompting method in all 11 datasets with respect\nto the F1 score. Moreover, while large language models are capable of finding\nanomalies, state-of-the-art deep learning models are still superior in\nperformance, achieving results 30% better than large language models.\n","authors":["Sarah Alnegheimish","Linh Nguyen","Laure Berti-Equille","Kalyan Veeramachaneni"],"pdf_url":"https://arxiv.org/pdf/2405.14755v3.pdf","comment":"This work is accepted by IEEE International Conference on Data\n Science and Advanced Analytics (DSAA 2024)"},{"id":"http://arxiv.org/abs/2403.14597v3","updated":"2024-10-31T21:33:32Z","published":"2024-03-21T17:50:22Z","title":"Extended Reality for Enhanced Human-Robot Collaboration: a\n Human-in-the-Loop Approach","summary":" The rise of automation has provided an opportunity to achieve higher\nefficiency in manufacturing processes, yet it often compromises the flexibility\nrequired to promptly respond to evolving market needs and meet the demand for\ncustomization. Human-robot collaboration attempts to tackle these challenges by\ncombining the strength and precision of machines with human ingenuity and\nperceptual understanding. In this paper, we conceptualize and propose an\nimplementation framework for an autonomous, machine learning-based manipulator\nthat incorporates human-in-the-loop principles and leverages Extended Reality\n(XR) to facilitate intuitive communication and programming between humans and\nrobots. Furthermore, the conceptual framework foresees human involvement\ndirectly in the robot learning process, resulting in higher adaptability and\ntask generalization. 
The paper highlights key technologies enabling the\nproposed framework, emphasizing the importance of developing the digital\necosystem as a whole. Additionally, we review the existent implementation\napproaches of XR in human-robot collaboration, showcasing diverse perspectives\nand methodologies. The challenges and future outlooks are discussed, delving\ninto the major obstacles and potential research avenues of XR for more natural\nhuman-robot interaction and integration in the industrial landscape.\n","authors":["Yehor Karpichev","Todd Charter","Jayden Hong","Amir M. Soufi Enayati","Homayoun Honari","Mehran Ghafarian Tamizi","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2403.14597v3.pdf","comment":"Published in IEEE International Conference on Robot and Human\n Interactive Communication (RO-MAN) 2024"},{"id":"http://arxiv.org/abs/2407.14094v2","updated":"2024-10-31T21:26:01Z","published":"2024-07-19T07:58:26Z","title":"User-Creator Feature Polarization in Recommender Systems with Dual\n Influence","summary":" Recommender systems serve the dual purpose of presenting relevant content to\nusers and helping content creators reach their target audience. The dual nature\nof these systems naturally influences both users and creators: users'\npreferences are affected by the items they are recommended, while creators may\nbe incentivized to alter their content to attract more users. We define a\nmodel, called user-creator feature dynamics, to capture the dual influence of\nrecommender systems. We prove that a recommender system with dual influence is\nguaranteed to polarize, causing diversity loss in the system. We then\ninvestigate, both theoretically and empirically, approaches for mitigating\npolarization and promoting diversity in recommender systems. Unexpectedly, we\nfind that common diversity-promoting approaches do not work in the presence of\ndual influence, while relevancy-optimizing methods like top-$k$ truncation can\nprevent polarization and improve diversity of the system.\n","authors":["Tao Lin","Kun Jin","Andrew Estornell","Xiaoying Zhang","Yiling Chen","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.14094v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.13770v2","updated":"2024-10-31T21:21:26Z","published":"2024-06-19T18:38:11Z","title":"Elliptical Attention","summary":" Pairwise dot-product self-attention is key to the success of transformers\nthat achieve state-of-the-art performance across a variety of applications in\nlanguage and vision. This dot-product self-attention computes attention weights\namong the input tokens using Euclidean distance, which makes the model prone to\nrepresentation collapse and vulnerable to contaminated samples. In this paper,\nwe propose using a Mahalanobis distance metric for computing the attention\nweights to stretch the underlying feature space in directions of high\ncontextual relevance. In particular, we define a hyper-ellipsoidal neighborhood\naround each query to increase the attention weights of the tokens lying in the\ncontextually important directions. We term this novel class of attention\nElliptical Attention. Our Elliptical Attention provides two benefits: 1)\nreducing representation collapse and 2) enhancing the model's robustness as\nElliptical Attention pays more attention to contextually relevant information\nrather than focusing on some small subset of informative features. 
We\nempirically demonstrate the advantages of Elliptical Attention over the\nbaseline dot-product attention and state-of-the-art attention methods on\nvarious practical tasks, including object classification, image segmentation,\nand language modeling across different data modalities.\n","authors":["Stefan K. Nielsen","Laziz U. Abdullaev","Rachel S. Y. Teo","Tan M. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.13770v2.pdf","comment":"10 pages in the main text. Published at NeurIPS 2024. The code is\n available at https://github.com/stefvk/Elliptical-Attention"},{"id":"http://arxiv.org/abs/2407.19320v3","updated":"2024-10-31T21:15:11Z","published":"2024-07-27T18:33:10Z","title":"WindsorML: High-Fidelity Computational Fluid Dynamics Dataset For\n Automotive Aerodynamics","summary":" This paper presents a new open-source high-fidelity dataset for Machine\nLearning (ML) containing 355 geometric variants of the Windsor body, to help\nthe development and testing of ML surrogate models for external automotive\naerodynamics. Each Computational Fluid Dynamics (CFD) simulation was run with\nGPU-native high-fidelity Wall-Modeled Large-Eddy Simulation (WMLES) using a\nCartesian immersed-boundary method with more than 280M cells to ensure the\ngreatest possible accuracy. The dataset contains geometry variants that\nexhibit a wide range of flow characteristics that are representative of those\nobserved on road-cars. The dataset itself contains the 3D time-averaged volume\n& boundary data as well as the geometry and force & moment coefficients. This\npaper discusses the validation of the underlying CFD methods as well as the\ncontents and structure of the dataset. To the authors' knowledge, this\nrepresents the first large-scale, high-fidelity CFD dataset for the Windsor\nbody with a permissive open-source license (CC-BY-SA).\n","authors":["Neil Ashton","Jordan B. Angel","Aditya S. Ghate","Gaetan K. W. Kenway","Man Long Wong","Cetin Kiris","Astrid Walle","Danielle C. Maddix","Gary Page"],"pdf_url":"https://arxiv.org/pdf/2407.19320v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23131v2","updated":"2024-10-31T20:52:35Z","published":"2024-10-30T15:41:35Z","title":"Federated Learning under Periodic Client Participation and Heterogeneous\n Data: A New Communication-Efficient Algorithm and Analysis","summary":" In federated learning, it is common to assume that clients are always\navailable to participate in training, which may not be feasible with user\ndevices in practice. Recent works analyze federated learning under more\nrealistic participation patterns, such as cyclic client availability or\narbitrary participation. However, all such works either require strong\nassumptions (e.g., all clients participate almost surely within a bounded\nwindow), do not achieve linear speedup and reduced communication rounds, or are\nnot applicable in the general non-convex setting. In this work, we focus on\nnonconvex optimization and consider participation patterns in which the chance\nof participation over a fixed window of rounds is equal among all clients,\nwhich includes cyclic client availability as a special case. Under this\nsetting, we propose a new algorithm, named Amplified SCAFFOLD, and prove that\nit achieves linear speedup, reduced communication, and resilience to data\nheterogeneity simultaneously. 
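For the Elliptical Attention entry above, replacing the Euclidean geometry of dot-product attention with a (here diagonal) Mahalanobis metric amounts to stretching coordinates before the usual attention computation. How the per-dimension weights are estimated in the paper is omitted; the weight vector below is a hypothetical input, and the diagonal form is a simplifying assumption.

```python
import numpy as np

def elliptical_attention(Q, K, V, m):
    """Attention under a diagonal Mahalanobis metric: since
    x^T diag(m) y == (x * sqrt(m)) . (y * sqrt(m)), stretching Q and K by
    sqrt(m) upweights contextually relevant directions in the logits."""
    scale = np.sqrt(m)
    Qs, Ks = Q * scale, K * scale
    logits = Qs @ Ks.T / np.sqrt(Q.shape[-1])
    logits -= logits.max(axis=-1, keepdims=True)   # softmax, numerically stable
    W = np.exp(logits)
    W /= W.sum(axis=-1, keepdims=True)
    return W @ V

rng = np.random.default_rng(1)
Q = K = V = rng.normal(size=(6, 8))
m = np.ones(8)
m[:2] = 4.0   # hypothetical: first two feature directions deemed more relevant
out = elliptical_attention(Q, K, V, m)
```

With m set to all ones this reduces exactly to standard dot-product attention, which makes the metric a drop-in generalization rather than a new mechanism.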
In particular, for cyclic participation, our\nalgorithm provably enjoys $\\mathcal{O}(\\epsilon^{-2})$ communication rounds\nto find an $\\epsilon$-stationary point in the non-convex stochastic setting. In\ncontrast, the prior work under the same setting requires $\\mathcal{O}(\\kappa^2\n\\epsilon^{-4})$ communication rounds, where $\\kappa$ denotes the data\nheterogeneity. Therefore, our algorithm significantly reduces communication\nrounds owing to its better dependence on $\\epsilon$ and $\\kappa$. Our\nanalysis relies on a fine-grained treatment of the nested dependence between\nclient participation and errors in the control variates, which results in\ntighter guarantees than previous work. We also provide experimental results\nwith (1) synthetic data and (2) real-world data with a large number of clients\n$(N = 250)$, demonstrating the effectiveness of our algorithm under periodic\nclient participation.\n","authors":["Michael Crawshaw","Mingrui Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23131v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.09277v2","updated":"2024-10-31T20:45:16Z","published":"2024-06-13T16:15:53Z","title":"End-to-end streaming model for low-latency speech anonymization","summary":" Speaker anonymization aims to conceal cues to speaker identity while\npreserving linguistic content. Current machine-learning-based approaches\nrequire substantial computational resources, hindering real-time streaming\napplications. To address these concerns, we propose a streaming model that\nachieves speaker anonymization with low latency. The system is trained in an\nend-to-end autoencoder fashion using a lightweight content encoder that\nextracts HuBERT-like information, a pretrained speaker encoder that extracts\nspeaker identity, and a variance encoder that injects pitch and energy\ninformation. These three disentangled representations are fed to a decoder that\nre-synthesizes the speech signal. We present evaluation results from two\nimplementations of our system, a full model that achieves a latency of 230ms,\nand a lite version (0.1x in size) that further reduces latency to 66ms while\nmaintaining state-of-the-art performance in naturalness, intelligibility, and\nprivacy preservation.\n","authors":["Waris Quamer","Ricardo Gutierrez-Osuna"],"pdf_url":"https://arxiv.org/pdf/2406.09277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08571v2","updated":"2024-10-31T20:30:51Z","published":"2024-07-11T14:59:17Z","title":"Multi-Group Proportional Representation in Retrieval","summary":" Image search and retrieval tasks can perpetuate harmful stereotypes, erase\ncultural identities, and amplify social disparities. Current approaches to\nmitigate these representational harms balance the number of retrieved items\nacross population groups defined by a small number of (often binary)\nattributes. However, most existing methods overlook intersectional groups\ndetermined by combinations of group attributes, such as gender, race, and\nethnicity. We introduce Multi-Group Proportional Representation (MPR), a novel\nmetric that measures representation across intersectional groups. We develop\npractical methods for estimating MPR, provide theoretical guarantees, and\npropose optimization algorithms to ensure MPR in retrieval. We demonstrate that\nexisting methods optimizing for equal and proportional representation metrics\nmay fail to promote MPR. 
Crucially, our work shows that optimizing MPR yields\nmore proportional representation across multiple intersectional groups\nspecified by a rich function class, often with minimal compromise in retrieval\naccuracy.\n","authors":["Alex Oesterling","Claudio Mayrink Verdun","Carol Xuan Long","Alexander Glynn","Lucas Monteiro Paes","Sajani Vithana","Martina Cardone","Flavio P. Calmon"],"pdf_url":"https://arxiv.org/pdf/2407.08571v2.pdf","comment":"48 pages, 33 figures. Accepted as poster at NeurIPS 2024. Code can be\n found at\n https://github.com/alex-oesterling/multigroup-proportional-representation"},{"id":"http://arxiv.org/abs/2409.06077v2","updated":"2024-10-31T19:52:26Z","published":"2024-09-09T21:20:36Z","title":"MTLSO: A Multi-Task Learning Approach for Logic Synthesis Optimization","summary":" Electronic Design Automation (EDA) is essential for IC design and has\nrecently benefited from AI-based techniques to improve efficiency. Logic\nsynthesis, a key EDA stage, transforms high-level hardware descriptions into\noptimized netlists. Recent research has employed machine learning to predict\nQuality of Results (QoR) for pairs of And-Inverter Graphs (AIGs) and synthesis\nrecipes. However, the severe scarcity of data due to a very limited number of\navailable AIGs results in overfitting, significantly hindering performance.\nAdditionally, the complexity and large number of nodes in AIGs make plain GNNs\nless effective for learning expressive graph-level representations. To tackle\nthese challenges, we propose MTLSO - a Multi-Task Learning approach for Logic\nSynthesis Optimization. On one hand, it maximizes the use of limited data by\ntraining the model across different tasks. This includes introducing an\nauxiliary task of binary multi-label graph classification alongside the primary\nregression task, allowing the model to benefit from diverse supervision\nsources. On the other hand, we employ a hierarchical graph representation\nlearning strategy to improve the model's capacity for learning expressive\ngraph-level representations of large AIGs, surpassing traditional plain GNNs.\nExtensive experiments across multiple datasets and against state-of-the-art\nbaselines demonstrate the superiority of our method, achieving an average\nperformance gain of 8.22\\% for delay and 5.95\\% for area.\n","authors":["Faezeh Faez","Raika Karimi","Yingxue Zhang","Xing Li","Lei Chen","Mingxuan Yuan","Mahdi Biparva"],"pdf_url":"https://arxiv.org/pdf/2409.06077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21335v2","updated":"2024-10-31T19:49:29Z","published":"2024-10-27T19:59:09Z","title":"E(3)-invariant diffusion model for pocket-aware peptide generation","summary":" Biologists frequently desire protein inhibitors for a variety of reasons,\nincluding use as research tools for understanding biological processes and\napplication to societal problems in agriculture, healthcare, etc.\nImmunotherapy, for instance, relies on immune checkpoint inhibitors to block\ncheckpoint proteins, preventing their binding with partner proteins and\nboosting immune cell function against abnormal cells. Inhibitor discovery has\nlong been a tedious process, which in recent years has been accelerated by\ncomputational approaches. Advances in artificial intelligence now provide an\nopportunity to make inhibitor discovery smarter than ever before. 
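The MTLSO entry above pairs a primary regression task with an auxiliary binary multi-label graph classification task over a shared encoder. A minimal sketch of that multi-task pattern, assuming a plain MLP encoder and illustrative dimensions (the paper uses hierarchical graph representations of AIGs):

```python
import torch
import torch.nn as nn

class MultiTaskHead(nn.Module):
    """Shared encoder with a primary regression head and an auxiliary
    multi-label classification head, in the spirit of the MTLSO setup."""
    def __init__(self, in_dim=64, hidden=128, n_labels=10):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU())
        self.regress = nn.Linear(hidden, 1)          # primary QoR regression
        self.classify = nn.Linear(hidden, n_labels)  # auxiliary multi-label task

    def forward(self, x):
        h = self.encoder(x)
        return self.regress(h).squeeze(-1), self.classify(h)

model = MultiTaskHead()
x = torch.randn(32, 64)
y_reg = torch.randn(32)
y_cls = torch.randint(0, 2, (32, 10)).float()
pred_reg, pred_cls = model(x)
# Joint objective: auxiliary supervision regularizes the shared encoder,
# squeezing more signal out of scarce data.
loss = nn.functional.mse_loss(pred_reg, y_reg) \
     + 0.5 * nn.functional.binary_cross_entropy_with_logits(pred_cls, y_cls)
loss.backward()
```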
While\nextensive research has been conducted on computer-aided inhibitor discovery, it\nhas mainly focused on either sequence-to-structure mapping, reverse mapping, or\nbio-activity prediction, making it unrealistic for biologists to utilize such\ntools. Instead, our work proposes a new method of computer-assisted inhibitor\ndiscovery: de novo pocket-aware peptide structure and sequence generation\nnetwork. Our approach consists of two sequential diffusion models for\nend-to-end structure generation and sequence prediction. By leveraging angle\nand dihedral relationships between backbone atoms, we ensure an E(3)-invariant\nrepresentation of peptide structures. Our results demonstrate that our method\nachieves comparable performance to state-of-the-art models, highlighting its\npotential in pocket-aware peptide design. This work offers a new approach for\nprecise drug discovery using receptor-specific peptide generation.\n","authors":["Po-Yu Liang","Jun Bai"],"pdf_url":"https://arxiv.org/pdf/2410.21335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21302v2","updated":"2024-10-31T19:44:26Z","published":"2024-10-21T22:52:25Z","title":"Domain-Adaptive Pre-training of Self-Supervised Foundation Models for\n Medical Image Classification in Gastrointestinal Endoscopy","summary":" Video capsule endoscopy has transformed gastrointestinal endoscopy (GIE)\ndiagnostics by offering a non-invasive method for capturing detailed images of\nthe gastrointestinal tract, enabling early disease detection. However, its\npotential is limited by the sheer volume of images generated during the imaging\nprocedure, which can take anywhere from 6-8 hours and often produce up to 1\nmillion images, necessitating automated analysis. Additionally, the variability\nof these images, combined with the need for expert annotations and the scarcity\nof large, high-quality labeled datasets, constrains the effectiveness of\ncurrent medical image analysis models. To address this, we introduce a novel\nlarge gastrointestinal endoscopy dataset, called EndoExtend24, created by\nmerging and re-stratifying the train/test splits of ten existing public and\nprivate datasets, ensuring no overlap of patient data across splits.\nEndoExtend24 includes over 226,000 labeled images, as well as dynamic class\nmappings, which allow unified training across datasets with differing labeling\ngranularity, supporting up to 123 distinct pathological findings. Further, we\npropose to leverage domain adaptive pre-training of foundation models in\ncomputer vision trained with self-supervision on generic image data, to adapt\nthem to the task of GIE medical diagnosis. Specifically, the EVA-02 model,\nwhich is based on the vision transformer architecture and was trained on\nImageNet-22k with masked image modeling (using EVA-CLIP as a MIM teacher), is\npre-trained on the novel EndoExtend24 dataset to achieve domain adaptation, and\nfinally trained on the Capsule Endoscopy 2024 Challenge dataset. Experimental\nresults demonstrate strong performance with an F1 score of 0.88, an improvement\nof about 39% over the baseline model's F1 score of 0.49. Additionally, the\nmodel achieved a macro AUC score of 0.993 and a balanced accuracy of 89.3%.\n","authors":["Marcel Roth","Micha V. 
Nowak","Adrian Krenzer","Frank Puppe"],"pdf_url":"https://arxiv.org/pdf/2410.21302v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06484v3","updated":"2024-10-31T19:35:47Z","published":"2024-06-10T17:24:42Z","title":"Parallelizing Linear Transformers with the Delta Rule over Sequence\n Length","summary":" Transformers with linear attention (i.e., linear transformers) and\nstate-space models have recently been suggested as a viable linear-time\nalternative to transformers with softmax attention. However, these models still\nunderperform transformers especially on tasks that require in-context\nretrieval. While more expressive variants of linear transformers which replace\nthe additive update in linear transformers with the delta rule (DeltaNet) have\nbeen found to be more effective at associative recall, existing algorithms for\ntraining such models do not parallelize over sequence length and are thus\ninefficient to train on modern hardware. This work describes a\nhardware-efficient algorithm for training linear transformers with the delta\nrule, which exploits a memory-efficient representation for computing products\nof Householder matrices. This algorithm allows us to scale up DeltaNet to\nstandard language modeling settings. We train a 1.3B model for 100B tokens and\nfind that it outperforms recent linear-time baselines such as Mamba and GLA in\nterms of perplexity and zero-shot performance on downstream tasks. We also\nexperiment with two hybrid models which combine DeltaNet layers with (1)\nsliding-window attention layers every other layer or (2) two global attention\nlayers, and find that these hybrids outperform strong transformer baselines.\n","authors":["Songlin Yang","Bailin Wang","Yu Zhang","Yikang Shen","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2406.06484v3.pdf","comment":"NeurIPS 2024 camera ready"},{"id":"http://arxiv.org/abs/2406.12649v3","updated":"2024-10-31T19:30:46Z","published":"2024-06-18T14:17:57Z","title":"Probabilistic Conceptual Explainers: Trustworthy Conceptual Explanations\n for Vision Foundation Models","summary":" Vision transformers (ViTs) have emerged as a significant area of focus,\nparticularly for their capacity to be jointly trained with large language\nmodels and to serve as robust vision foundation models. Yet, the development of\ntrustworthy explanation methods for ViTs has lagged, particularly in the\ncontext of post-hoc interpretations of ViT predictions. Existing sub-image\nselection approaches, such as feature-attribution and conceptual models, fall\nshort in this regard. This paper proposes five desiderata for explaining ViTs\n-- faithfulness, stability, sparsity, multi-level structure, and parsimony --\nand demonstrates the inadequacy of current methods in meeting these criteria\ncomprehensively. We introduce a variational Bayesian explanation framework,\ndubbed ProbAbilistic Concept Explainers (PACE), which models the distributions\nof patch embeddings to provide trustworthy post-hoc conceptual explanations.\nOur qualitative analysis reveals the distributions of patch-level concepts,\nelucidating the effectiveness of ViTs by modeling the joint distribution of\npatch embeddings and ViT's predictions. Moreover, these patch-level\nexplanations bridge the gap between image-level and dataset-level explanations,\nthus completing the multi-level structure of PACE. 
Through extensive\nexperiments on both synthetic and real-world datasets, we demonstrate that PACE\nsurpasses state-of-the-art methods in terms of the defined desiderata.\n","authors":["Hengyi Wang","Shiwei Tan","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.12649v3.pdf","comment":"Proceedings of the 41st International Conference on Machine Learning\n (ICML 2024)"},{"id":"http://arxiv.org/abs/2410.10648v3","updated":"2024-10-31T19:26:43Z","published":"2024-10-14T15:59:16Z","title":"A Simple Baseline for Predicting Events with Auto-Regressive Tabular\n Transformers","summary":" Many real-world applications of tabular data involve using historic events to\npredict properties of new ones, for example whether a credit card transaction\nis fraudulent or what rating a customer will assign a product on a retail\nplatform. Existing approaches to event prediction include costly, brittle, and\napplication-dependent techniques such as time-aware positional embeddings,\nlearned row and field encodings, and oversampling methods for addressing class\nimbalance. Moreover, these approaches often assume specific use-cases, for\nexample that we know the labels of all historic events or that we only predict\na pre-specified label and not the data's features themselves. In this work, we\npropose a simple but flexible baseline using standard autoregressive LLM-style\ntransformers with elementary positional embeddings and a causal language\nmodeling objective. Our baseline outperforms existing approaches across popular\ndatasets and can be employed for various use-cases. We demonstrate that the\nsame model can predict labels, impute missing values, or model event sequences.\n","authors":["Alex Stein","Samuel Sharpe","Doron Bergman","Senthil Kumar","C. Bayan Bruss","John Dickerson","Tom Goldstein","Micah Goldblum"],"pdf_url":"https://arxiv.org/pdf/2410.10648v3.pdf","comment":"10 pages, 6 pages of references+appendix"},{"id":"http://arxiv.org/abs/2310.02396v4","updated":"2024-10-31T19:22:01Z","published":"2023-10-03T19:39:30Z","title":"Inductive biases of multi-task learning and finetuning: multiple regimes\n of feature reuse","summary":" Neural networks are often trained on multiple tasks, either simultaneously\n(multi-task learning, MTL) or sequentially (pretraining and subsequent\nfinetuning, PT+FT). In particular, it is common practice to pretrain neural\nnetworks on a large auxiliary task before finetuning on a downstream task with\nfewer samples. Despite the prevalence of this approach, the inductive biases\nthat arise from learning multiple tasks are poorly characterized. In this work,\nwe address this gap. We describe novel implicit regularization penalties\nassociated with MTL and PT+FT in diagonal linear networks and\nsingle-hidden-layer ReLU networks. These penalties indicate that MTL and PT+FT\ninduce the network to reuse features in different ways. 1) Both MTL and PT+FT\nexhibit biases towards feature reuse between tasks, and towards sparsity in the\nset of learned features. We show a \"conservation law\" that implies a direct\ntradeoff between these two biases. 2) PT+FT exhibits a novel \"nested feature\nselection\" regime, not described by either the \"lazy\" or \"rich\" regimes\nidentified in prior work, which biases it to rely on a sparse subset of the\nfeatures learned during pretraining. This regime is much narrower for MTL. 3)\nPT+FT (but not MTL) in ReLU networks benefits from features that are correlated\nbetween the auxiliary and main task. 
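The autoregressive tabular baseline above amounts to serializing each event's fields into one token stream and training with a causal language-modeling objective; the same model can then predict labels or impute fields, since both are just next-token prediction. A toy sketch, with field binning and vocabulary construction as assumptions:

```python
import torch
import torch.nn as nn

# Serialize tabular events into one token stream; train next-token prediction.
events = [{"amount": "high", "country": "US", "label": "fraud"},
          {"amount": "low",  "country": "DE", "label": "ok"}]
vocab = {tok: i for i, tok in enumerate(
    sorted({f"{k}={v}" for e in events for k, v in e.items()}))}
ids = torch.tensor([vocab[f"{k}={v}"] for e in events for k, v in e.items()])

emb = nn.Embedding(len(vocab), 32)
layer = nn.TransformerEncoderLayer(d_model=32, nhead=4, batch_first=True)
lm_head = nn.Linear(32, len(vocab))

x = emb(ids[:-1]).unsqueeze(0)                              # input tokens
causal = nn.Transformer.generate_square_subsequent_mask(x.shape[1])
h = layer(x, src_mask=causal)                               # causal attention
loss = nn.functional.cross_entropy(lm_head(h)[0], ids[1:])  # predict next field
```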
We confirm these findings empirically with\nteacher-student models, and introduce a technique -- weight rescaling following\npretraining -- that can elicit the nested feature selection regime. Finally, we\nvalidate our theory in deep neural networks trained on image classification. We\nfind that weight rescaling improves performance when it causes models to\ndisplay signatures of nested feature selection. Our results suggest that nested\nfeature selection may be an important inductive bias for finetuning neural\nnetworks.\n","authors":["Samuel Lippl","Jack W. Lindsey"],"pdf_url":"https://arxiv.org/pdf/2310.02396v4.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.07457v3","updated":"2024-10-31T19:10:41Z","published":"2024-06-11T17:01:52Z","title":"Estimating the Hallucination Rate of Generative AI","summary":" This paper presents a method for estimating the hallucination rate for\nin-context learning (ICL) with generative AI. In ICL, a conditional generative\nmodel (CGM) is prompted with a dataset and a prediction question and asked to\ngenerate a response. One interpretation of ICL assumes that the CGM computes\nthe posterior predictive of an unknown Bayesian model, which implicitly defines\na joint distribution over observable datasets and latent mechanisms. This joint\ndistribution factorizes into two components: the model prior over mechanisms\nand the model likelihood of datasets given a mechanism. With this perspective,\nwe define a hallucination as a generated response to the prediction question\nwith low model likelihood given the mechanism. We develop a new method that\ntakes an ICL problem and estimates the probability that a CGM will generate a\nhallucination. Our method only requires generating prediction questions and\nresponses from the CGM and evaluating its response log probability. We\nempirically evaluate our method using large language models for synthetic\nregression and natural language ICL tasks.\n","authors":["Andrew Jesson","Nicolas Beltran-Velez","Quentin Chu","Sweta Karlekar","Jannik Kossen","Yarin Gal","John P. Cunningham","David Blei"],"pdf_url":"https://arxiv.org/pdf/2406.07457v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09359v2","updated":"2024-10-31T19:02:17Z","published":"2024-09-14T08:17:30Z","title":"Symbolic Regression with a Learned Concept Library","summary":" We present a novel method for symbolic regression (SR), the task of searching\nfor compact programmatic hypotheses that best explain a dataset. The problem is\ncommonly solved using genetic algorithms; we show that we can enhance such\nmethods by inducing a library of abstract textual concepts. Our algorithm,\ncalled LaSR, uses zero-shot queries to a large language model (LLM) to discover\nand evolve concepts occurring in known high-performing hypotheses. We discover\nnew hypotheses using a mix of standard evolutionary steps and LLM-guided steps\n(obtained through zero-shot LLM queries) conditioned on discovered concepts.\nOnce discovered, hypotheses are used in a new round of concept abstraction and\nevolution. We validate LaSR on the Feynman equations, a popular SR benchmark,\nas well as a set of synthetic tasks. On these benchmarks, LaSR substantially\noutperforms a variety of state-of-the-art SR approaches based on deep learning\nand evolutionary algorithms. 
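The hallucination-rate entry above only needs samples from the model and their response log-probabilities. A Monte Carlo sketch of that ingredient, with a fixed log-probability threshold standing in for the paper's likelihood-given-mechanism criterion (a deliberate simplification):

```python
import math, random

def estimate_hallucination_rate(sample_response, n=100, log_prob_floor=-10.0):
    """Sample n responses, flag those whose response log-probability falls
    below a threshold, and return the flagged fraction. The fixed threshold
    is an assumption; the paper derives its criterion from the model
    likelihood under the inferred latent mechanism."""
    flags = []
    for _ in range(n):
        _, logp = sample_response()  # caller returns (text, log-probability)
        flags.append(logp < log_prob_floor)
    return sum(flags) / n

# Toy stand-in for an LLM sampler, for illustration only.
sampler = lambda: ("42", math.log(random.uniform(1e-9, 1.0)))
rate = estimate_hallucination_rate(sampler)
```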
Moreover, we show that LaSR can be used to\ndiscover a novel and powerful scaling law for LLMs.\n","authors":["Arya Grayeli","Atharva Sehgal","Omar Costilla-Reyes","Miles Cranmer","Swarat Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2409.09359v2.pdf","comment":"NeurIPS version; 10 pages; no checklist"},{"id":"http://arxiv.org/abs/2407.19389v2","updated":"2024-10-31T18:54:34Z","published":"2024-07-28T04:10:11Z","title":"FIARSE: Model-Heterogeneous Federated Learning via Importance-Aware\n Submodel Extraction","summary":" In federated learning (FL), accommodating clients' varied computational\ncapacities poses a challenge, often limiting the participation of those with\nconstrained resources in global model training. To address this issue, the\nconcept of model heterogeneity through submodel extraction has emerged,\noffering a tailored solution that aligns the model's complexity with each\nclient's computational capacity. In this work, we propose Federated\nImportance-Aware Submodel Extraction (FIARSE), a novel approach that\ndynamically adjusts submodels based on the importance of model parameters,\nthereby overcoming the limitations of previous static and dynamic submodel\nextraction methods. Compared to existing works, the proposed method offers a\ntheoretical foundation for submodel extraction and eliminates the need for\nadditional information beyond the model parameters themselves to determine\nparameter importance, significantly reducing the overhead on clients. Extensive\nexperiments are conducted on various datasets to showcase the superior\nperformance of the proposed FIARSE.\n","authors":["Feijie Wu","Xingchen Wang","Yaqing Wang","Tianci Liu","Lu Su","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2407.19389v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2404.03139v2","updated":"2024-10-31T18:48:43Z","published":"2024-04-04T01:24:27Z","title":"Theoretical and Empirical Insights into the Origins of Degree Bias in\n Graph Neural Networks","summary":" Graph Neural Networks (GNNs) often perform better for high-degree nodes than\nlow-degree nodes on node classification tasks. This degree bias can reinforce\nsocial marginalization by, e.g., privileging celebrities and other high-degree\nactors in social networks during social and content recommendation. While\nresearchers have proposed numerous hypotheses for why GNN degree bias occurs,\nwe find via a survey of 38 degree bias papers that these hypotheses are often\nnot rigorously validated, and can even be contradictory. Thus, we provide an\nanalysis of the origins of degree bias in message-passing GNNs with different\ngraph filters. We prove that high-degree test nodes tend to have a lower\nprobability of misclassification regardless of how GNNs are trained. Moreover,\nwe show that degree bias arises from a variety of factors that are associated\nwith a node's degree (e.g., homophily of neighbors, diversity of neighbors).\nFurthermore, we show that during training, some GNNs may adjust their loss on\nlow-degree nodes more slowly than on high-degree nodes; however, with\nsufficiently many epochs of training, message-passing GNNs can achieve their\nmaximum possible training accuracy, which is not significantly limited by their\nexpressive power. Throughout our analysis, we connect our findings to\npreviously-proposed hypotheses for the origins of degree bias, supporting and\nunifying some while casting doubt on others. 
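The FIARSE entry above determines parameter importance from the model parameters themselves, with no extra bookkeeping on clients. A sketch consistent with that constraint, using global weight magnitude as the importance proxy (the proxy and fixed keep-ratio are assumptions) to carve a capacity-matched submodel:

```python
import torch

def extract_submodel(state_dict, keep_ratio):
    """Keep the globally largest-magnitude parameters and zero out the rest,
    yielding a submodel sized to a client's compute budget. Magnitude as the
    importance signal is an illustrative choice consistent with 'no
    information beyond the parameters themselves'."""
    flat = torch.cat([p.abs().flatten() for p in state_dict.values()])
    k = max(1, int(keep_ratio * flat.numel()))
    threshold = flat.topk(k).values.min()
    return {name: p * (p.abs() >= threshold) for name, p in state_dict.items()}

full = {"w1": torch.randn(8, 8), "w2": torch.randn(8)}
quarter = extract_submodel(full, keep_ratio=0.25)  # submodel for a weak client
```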
We validate our theoretical\nfindings on 8 common real-world networks, and based on our theoretical and\nempirical insights, describe a roadmap to alleviate degree bias.\n","authors":["Arjun Subramonian","Jian Kang","Yizhou Sun"],"pdf_url":"https://arxiv.org/pdf/2404.03139v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.06407v2","updated":"2024-10-31T18:47:52Z","published":"2024-06-10T15:59:08Z","title":"A Taxonomy of Challenges to Curating Fair Datasets","summary":" Despite extensive efforts to create fairer machine learning (ML) datasets,\nthere remains a limited understanding of the practical aspects of dataset\ncuration. Drawing from interviews with 30 ML dataset curators, we present a\ncomprehensive taxonomy of the challenges and trade-offs encountered throughout\nthe dataset curation lifecycle. Our findings underscore overarching issues\nwithin the broader fairness landscape that impact data curation. We conclude\nwith recommendations aimed at fostering systemic changes to better facilitate\nfair dataset curation practices.\n","authors":["Dora Zhao","Morgan Klaus Scheuerman","Pooja Chitre","Jerone T. A. Andrews","Georgia Panagiotidou","Shawn Walker","Kathleen H. Pine","Alice Xiang"],"pdf_url":"https://arxiv.org/pdf/2406.06407v2.pdf","comment":"NeurIPS Datasets & Benchmarks 2024 (Oral)"},{"id":"http://arxiv.org/abs/2410.13959v2","updated":"2024-10-31T18:38:37Z","published":"2024-10-17T18:34:43Z","title":"FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven\n Question Answering Pipeline","summary":" Financial decision-making hinges on the analysis of relevant information\nembedded in the enormous volume of documents in the financial domain. To\naddress this challenge, we developed FinQAPT, an end-to-end pipeline that\nstreamlines the identification of relevant financial reports based on a query,\nextracts pertinent context, and leverages Large Language Models (LLMs) to\nperform downstream tasks. To evaluate the pipeline, we experimented with\nvarious techniques to optimize the performance of each module using the FinQA\ndataset. We introduced a novel clustering-based negative sampling technique to\nenhance context extraction and a novel prompting method called Dynamic N-shot\nPrompting to boost the numerical question-answering capabilities of LLMs. At\nthe module level, we achieved state-of-the-art accuracy on FinQA, attaining an\naccuracy of 80.6%. However, at the pipeline level, we observed decreased\nperformance due to challenges in extracting relevant context from financial\nreports. We conducted a detailed error analysis of each module and the\nend-to-end pipeline, pinpointing specific challenges that must be addressed to\ndevelop a robust solution for handling complex financial tasks.\n","authors":["Kuldeep Singh","Simerjot Kaur","Charese Smiley"],"pdf_url":"https://arxiv.org/pdf/2410.13959v2.pdf","comment":"Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2402.02827v4","updated":"2024-10-31T18:23:40Z","published":"2024-02-05T09:24:52Z","title":"PowerGraph: A power grid benchmark dataset for graph neural networks","summary":" Power grids are critical infrastructures of paramount importance to modern\nsociety and, therefore, engineered to operate under diverse conditions and\nfailures. The ongoing energy transition poses new challenges for the\ndecision-makers and system operators. Therefore, developing grid analysis\nalgorithms is important for supporting reliable operations. 
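Dynamic N-shot Prompting from the FinQAPT entry above can be pictured as ranking a pool of worked examples against the current question and splicing the top N into the prompt, instead of using a fixed exemplar list. A sketch, with the similarity scorer left to the caller (the concrete scoring function is an assumption):

```python
def dynamic_n_shot_prompt(query, exemplars, score, n=3):
    """Build an N-shot prompt from the n exemplars most relevant to `query`.
    `score(query, exemplar_question)` is any similarity function the caller
    supplies; the prompt template below is illustrative."""
    shots = sorted(exemplars, key=lambda ex: score(query, ex["q"]), reverse=True)[:n]
    demos = "\n\n".join(f"Q: {ex['q']}\nA: {ex['a']}" for ex in shots)
    return f"{demos}\n\nQ: {query}\nA:"

pool = [{"q": "What is 2% of 450?", "a": "9"},
        {"q": "Revenue grew from 10 to 12; growth rate?", "a": "20%"}]
# Toy relevance score: word overlap between questions.
overlap = lambda a, b: len(set(a.lower().split()) & set(b.lower().split()))
prompt = dynamic_n_shot_prompt("Net income rose from 5 to 6; growth rate?",
                               pool, overlap, n=1)
```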
These key tools\ninclude power flow analysis and system security analysis, both needed for\neffective operational and strategic planning. The literature review shows a\ngrowing trend of machine learning (ML) models that perform these analyses\neffectively. In particular, Graph Neural Networks (GNNs) stand out in such\napplications because of the graph-based structure of power grids. However,\nthere is a lack of publicly available graph datasets for training and\nbenchmarking ML models in electrical power grid applications. First, we present\nPowerGraph, which comprises GNN-tailored datasets for i) power flows, ii)\noptimal power flows, and iii) cascading failure analyses of power grids.\nSecond, we provide ground-truth explanations for the cascading failure\nanalysis. Finally, we perform a complete benchmarking of GNN methods for\nnode-level and graph-level tasks and explainability. Overall, PowerGraph is a\nmultifaceted GNN dataset for diverse tasks that includes power flow and fault\nscenarios with real-world explanations, providing a valuable resource for\ndeveloping improved GNN models for node-level, graph-level tasks and\nexplainability methods in power system modeling. The dataset is available at\nhttps://figshare.com/articles/dataset/PowerGraph/22820534 and the code at\nhttps://github.com/PowerGraph-Datasets.\n","authors":["Anna Varbella","Kenza Amara","Blazhe Gjorgiev","Mennatallah El-Assady","Giovanni Sansavini"],"pdf_url":"https://arxiv.org/pdf/2402.02827v4.pdf","comment":"21 pages, 8 figures, conference paper"},{"id":"http://arxiv.org/abs/2405.19690v3","updated":"2024-10-31T18:09:38Z","published":"2024-05-30T05:04:33Z","title":"Diffusion Policies creating a Trust Region for Offline Reinforcement\n Learning","summary":" Offline reinforcement learning (RL) leverages pre-collected datasets to train\noptimal policies. Diffusion Q-Learning (DQL), introducing diffusion models as a\npowerful and expressive policy class, significantly boosts the performance of\noffline RL. However, its reliance on iterative denoising sampling to generate\nactions slows down both training and inference. While several recent attempts\nhave tried to accelerate diffusion-QL, the improvement in training and/or\ninference speed often results in degraded performance. In this paper, we\nintroduce a dual policy approach, Diffusion Trusted Q-Learning (DTQL), which\ncomprises a diffusion policy for pure behavior cloning and a practical one-step\npolicy. We bridge the two policies with a newly introduced diffusion trust region\nloss. The diffusion policy maintains expressiveness, while the trust region\nloss directs the one-step policy to explore freely and seek modes within the\nregion defined by the diffusion policy. DTQL eliminates the need for iterative\ndenoising sampling during both training and inference, making it remarkably\ncomputationally efficient. We evaluate its effectiveness and algorithmic\ncharacteristics against popular Kullback--Leibler divergence-based distillation\nmethods in 2D bandit scenarios and gym tasks. We then show that DTQL not\nonly outperforms other methods on the majority of the D4RL benchmark tasks but\nalso demonstrates efficiency in training and inference speeds. 
The PyTorch\nimplementation is available at\nhttps://github.com/TianyuCodings/Diffusion_Trusted_Q_Learning.\n","authors":["Tianyu Chen","Zhendong Wang","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.19690v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.16441v2","updated":"2024-10-31T18:04:11Z","published":"2024-05-26T05:50:39Z","title":"Categorical Flow Matching on Statistical Manifolds","summary":" We introduce Statistical Flow Matching (SFM), a novel and mathematically\nrigorous flow-matching framework on the manifold of parameterized probability\nmeasures inspired by the results from information geometry. We demonstrate the\neffectiveness of our method on the discrete generation problem by instantiating\nSFM on the manifold of categorical distributions whose geometric properties\nremain unexplored in previous discrete generative models. Utilizing the Fisher\ninformation metric, we equip the manifold with a Riemannian structure whose\nintrinsic geometries are effectively leveraged by following the shortest paths\nof geodesics. We develop an efficient training and sampling algorithm that\novercomes numerical stability issues with a diffeomorphism between manifolds.\nOur distinctive geometric perspective of statistical manifolds allows us to\napply optimal transport during training and interpret SFM as following the\nsteepest direction of the natural gradient. Unlike previous models that rely on\nvariational bounds for likelihood estimation, SFM enjoys the exact likelihood\ncalculation for arbitrary probability measures. We show that SFM can learn\nmore complex patterns on the statistical manifold where existing models often\nfail due to strong prior assumptions. Comprehensive experiments on real-world\ngenerative tasks ranging from image and text to biological domains further\ndemonstrate that SFM achieves higher sampling quality and likelihood than other\ndiscrete diffusion or flow-based models.\n","authors":["Chaoran Cheng","Jiahan Li","Jian Peng","Ge Liu"],"pdf_url":"https://arxiv.org/pdf/2405.16441v2.pdf","comment":"Accepted to NeurIPS 2024 as a conference paper"},{"id":"http://arxiv.org/abs/2410.23277v2","updated":"2024-10-31T18:03:51Z","published":"2024-10-30T17:55:52Z","title":"SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video\n Generation","summary":" Human beings are endowed with a complementary learning system, which bridges\nthe slow learning of general world dynamics with fast storage of episodic\nmemory from a new experience. Previous video generation models, however,\nprimarily focus on slow learning by pre-training on vast amounts of data,\noverlooking the fast learning phase crucial for episodic memory storage. This\noversight leads to inconsistencies across temporally distant frames when\ngenerating longer videos, as these frames fall beyond the model's context\nwindow. To this end, we introduce SlowFast-VGen, a novel dual-speed learning\nsystem for action-driven long video generation. Our approach incorporates a\nmasked conditional video diffusion model for the slow learning of world\ndynamics, alongside an inference-time fast learning strategy based on a\ntemporal LoRA module. Specifically, the fast learning process updates its\ntemporal LoRA parameters based on local inputs and outputs, thereby efficiently\nstoring episodic memory in its parameters. 
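The fast-learning half of SlowFast-VGen just described stores episodic memory by updating low-rank LoRA factors at inference time while the base weights stay frozen. A sketch of that pattern on a single linear layer, with sizes, the inner-loop count, and the regression loss as illustrative assumptions:

```python
import torch
import torch.nn as nn

class TemporalLoRA(nn.Module):
    """Frozen base layer (slow-learned dynamics) plus trainable low-rank
    A/B factors that are updated online to store episodic memory (sketch)."""
    def __init__(self, base: nn.Linear, rank=4):
        super().__init__()
        self.base = base.requires_grad_(False)  # slow weights: frozen
        self.A = nn.Parameter(torch.zeros(rank, base.in_features))
        self.B = nn.Parameter(torch.randn(base.out_features, rank) * 0.01)

    def forward(self, x):
        # A starts at zero, so the adapter initially contributes nothing.
        return self.base(x) + x @ self.A.t() @ self.B.t()

lora = TemporalLoRA(nn.Linear(16, 16))
opt = torch.optim.SGD([lora.A, lora.B], lr=1e-2)
x, target = torch.randn(8, 16), torch.randn(8, 16)
for _ in range(3):  # fast inner loop on local inputs/outputs
    opt.zero_grad()
    nn.functional.mse_loss(lora(x), target).backward()
    opt.step()
```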
We further propose a slow-fast\nlearning loop algorithm that seamlessly integrates the inner fast learning loop\ninto the outer slow learning loop, enabling the recall of prior multi-episode\nexperiences for context-aware skill learning. To facilitate the slow learning\nof an approximate world model, we collect a large-scale dataset of 200k videos\nwith language action annotations, covering a wide range of scenarios. Extensive\nexperiments show that SlowFast-VGen outperforms baselines across various\nmetrics for action-driven video generation, achieving an FVD score of 514\ncompared to 782, and maintaining consistency in longer videos, with an average\nof 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm\nsignificantly enhances performances on long-horizon planning tasks as well.\nProject Website: https://slowfast-vgen.github.io\n","authors":["Yining Hong","Beide Liu","Maxine Wu","Yuanhao Zhai","Kai-Wei Chang","Linjie Li","Kevin Lin","Chung-Ching Lin","Jianfeng Wang","Zhengyuan Yang","Yingnian Wu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24222v1","updated":"2024-10-31T17:59:56Z","published":"2024-10-31T17:59:56Z","title":"Robust Gaussian Processes via Relevance Pursuit","summary":" Gaussian processes (GPs) are non-parametric probabilistic regression models\nthat are popular due to their flexibility, data efficiency, and well-calibrated\nuncertainty estimates. However, standard GP models assume homoskedastic\nGaussian noise, while many real-world applications are subject to non-Gaussian\ncorruptions. Variants of GPs that are more robust to alternative noise models\nhave been proposed, and entail significant trade-offs between accuracy and\nrobustness, and between computational requirements and theoretical guarantees.\nIn this work, we propose and study a GP model that achieves robustness against\nsparse outliers by inferring data-point-specific noise levels with a sequential\nselection procedure maximizing the log marginal likelihood that we refer to as\nrelevance pursuit. We show, surprisingly, that the model can be parameterized\nsuch that the associated log marginal likelihood is strongly concave in the\ndata-point-specific noise variances, a property rarely found in either robust\nregression objectives or GP marginal likelihoods. This in turn implies the weak\nsubmodularity of the corresponding subset selection problem, and thereby proves\napproximation guarantees for the proposed algorithm. We compare the model's\nperformance relative to other approaches on diverse regression and Bayesian\noptimization tasks, including the challenging but common setting of sparse\ncorruptions of the labels within or close to the function range.\n","authors":["Sebastian Ament","Elizabeth Santorella","David Eriksson","Ben Letham","Maximilian Balandat","Eytan Bakshy"],"pdf_url":"https://arxiv.org/pdf/2410.24222v1.pdf","comment":"NeurIPS 2024 Article"},{"id":"http://arxiv.org/abs/2410.24220v1","updated":"2024-10-31T17:59:53Z","published":"2024-10-31T17:59:53Z","title":"Bridging Geometric States via Geometric Diffusion Bridge","summary":" The accurate prediction of geometric state evolution in complex systems is\ncritical for advancing scientific domains such as quantum chemistry and\nmaterial modeling. 
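The relevance-pursuit GP entry above greedily infers data-point-specific noise levels by maximizing the log marginal likelihood. A sketch using scikit-learn's per-sample `alpha`, with a fixed noise inflation standing in for the paper's optimized per-point variances (an intentional simplification):

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

def relevance_pursuit(X, y, steps=3, inflated=1e2, base=1e-6):
    """Greedily pick the point whose individually inflated noise variance
    most improves the log marginal likelihood, marking it as an outlier."""
    alpha = np.full(len(y), base)
    for _ in range(steps):
        best_gain, best_i = -np.inf, None
        for i in np.where(alpha == base)[0]:
            trial = alpha.copy()
            trial[i] = inflated
            gp = GaussianProcessRegressor(alpha=trial).fit(X, y)
            if gp.log_marginal_likelihood_value_ > best_gain:
                best_gain, best_i = gp.log_marginal_likelihood_value_, i
        alpha[best_i] = inflated
    return np.where(alpha == inflated)[0]  # indices flagged as outliers

X = np.random.rand(20, 1)
y = np.sin(3 * X[:, 0])
y[[4, 11]] += 5.0  # inject sparse label corruptions
print(relevance_pursuit(X, y))
```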
Traditional experimental and computational methods face\nchallenges in terms of environmental constraints and computational demands,\nwhile current deep learning approaches still fall short in terms of precision\nand generality. In this work, we introduce the Geometric Diffusion Bridge\n(GDB), a novel generative modeling framework that accurately bridges initial\nand target geometric states. GDB leverages a probabilistic approach to evolve\ngeometric state distributions, employing an equivariant diffusion bridge\nderived by a modified version of Doob's $h$-transform for connecting geometric\nstates. This tailored diffusion process is anchored by initial and target\ngeometric states as fixed endpoints and governed by equivariant transition\nkernels. Moreover, trajectory data can be seamlessly leveraged in our GDB\nframework by using a chain of equivariant diffusion bridges, providing a more\ndetailed and accurate characterization of evolution dynamics. Theoretically, we\nconduct a thorough examination to confirm our framework's ability to preserve\njoint distributions of geometric states and capability to completely model the\nunderlying dynamics inducing trajectory distributions with negligible error.\nExperimental evaluations across various real-world scenarios show that GDB\nsurpasses existing state-of-the-art approaches, opening up a new pathway for\naccurately bridging geometric states and tackling crucial scientific challenges\nwith improved accuracy and applicability.\n","authors":["Shengjie Luo","Yixian Xu","Di He","Shuxin Zheng","Tie-Yan Liu","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2410.24220v1.pdf","comment":"33 pages, 5 tables; NeurIPS 2024 Camera Ready version"},{"id":"http://arxiv.org/abs/2410.24218v1","updated":"2024-10-31T17:59:52Z","published":"2024-10-31T17:59:52Z","title":"Teaching Embodied Reinforcement Learning Agents: Informativeness and\n Diversity of Language Use","summary":" In real-world scenarios, it is desirable for embodied agents to have the\nability to leverage human language to gain explicit or implicit knowledge for\nlearning tasks. Despite recent progress, most previous approaches adopt simple\nlow-level instructions as language inputs, which may not reflect natural human\ncommunication. It's not clear how to incorporate rich language use to\nfacilitate task learning. To address this question, this paper studies\ndifferent types of language inputs in facilitating reinforcement learning (RL)\nembodied agents. More specifically, we examine how different levels of language\ninformativeness (i.e., feedback on past behaviors and future guidance) and\ndiversity (i.e., variation of language expressions) impact agent learning and\ninference. Our empirical results based on four RL benchmarks demonstrate that\nagents trained with diverse and informative language feedback can achieve\nenhanced generalization and fast adaptation to new tasks. These findings\nhighlight the pivotal role of language use in teaching embodied agents new\ntasks in an open world. Project website:\nhttps://github.com/sled-group/Teachable_RL\n","authors":["Jiajun Xi","Yinong He","Jianing Yang","Yinpei Dai","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2410.24218v1.pdf","comment":"EMNLP 2024 Main. 
Project website:\n https://github.com/sled-group/Teachable_RL"},{"id":"http://arxiv.org/abs/2410.24216v1","updated":"2024-10-31T17:59:46Z","published":"2024-10-31T17:59:46Z","title":"CaAdam: Improving Adam optimizer using connection aware methods","summary":" We introduce a new method inspired by Adam that enhances convergence speed\nand achieves better loss function minima. Traditional optimizers, including\nAdam, apply uniform or globally adjusted learning rates across neural networks\nwithout considering their architectural specifics. This architecture-agnostic\napproach is deeply embedded in most deep learning frameworks, where optimizers\nare implemented as standalone modules without direct access to the network's\nstructural information. For instance, in popular frameworks like Keras or\nPyTorch, optimizers operate solely on gradients and parameters, without\nknowledge of layer connectivity or network topology. Our algorithm, CaAdam,\nexplores this overlooked area by introducing connection-aware optimization\nthrough carefully designed proxies of architectural information. We propose\nmultiple scaling methodologies that dynamically adjust learning rates based on\neasily accessible structural properties such as layer depth, connection counts,\nand gradient distributions. This approach enables more granular optimization\nwhile working within the constraints of current deep learning frameworks.\nEmpirical evaluations on standard datasets (e.g., CIFAR-10, Fashion MNIST) show\nthat our method consistently achieves faster convergence and higher accuracy\ncompared to standard Adam optimizer, demonstrating the potential benefits of\nincorporating architectural awareness in optimization strategies.\n","authors":["Remi Genet","Hugo Inzirillo"],"pdf_url":"https://arxiv.org/pdf/2410.24216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24214v1","updated":"2024-10-31T17:59:37Z","published":"2024-10-31T17:59:37Z","title":"ARQ: A Mixed-Precision Quantization Framework for Accurate and\n Certifiably Robust DNNs","summary":" Mixed precision quantization has become an important technique for enabling\nthe execution of deep neural networks (DNNs) on limited resource computing\nplatforms. Traditional quantization methods have primarily concentrated on\nmaintaining neural network accuracy, either ignoring the impact of quantization\non the robustness of the network, or using only empirical techniques for\nimproving robustness. In contrast, techniques for robustness certification,\nwhich can provide strong guarantees about the robustness of DNNs have not been\nused during quantization due to their high computation cost.\n This paper introduces ARQ, an innovative mixed-precision quantization method\nthat not only preserves the clean accuracy of the smoothed classifiers but also\nmaintains their certified robustness. ARQ uses reinforcement learning to find\naccurate and robust DNN quantization, while efficiently leveraging randomized\nsmoothing, a popular class of statistical DNN verification algorithms, to guide\nthe search process.\n We compare ARQ with multiple state-of-the-art quantization techniques on\nseveral DNN architectures commonly used in quantization studies: ResNet-20 on\nCIFAR-10, ResNet-50 on ImageNet, and MobileNetV2 on ImageNet. We demonstrate\nthat ARQ consistently performs better than these baselines across all the\nbenchmarks and the input perturbation levels. 
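The CaAdam entry above notes that optimizers normally see only gradients and parameters, not architecture. Within standard frameworks one can approximate connection-aware scaling by giving each layer its own parameter group with a structure-dependent learning-rate multiplier; in this sketch the 1/(1+depth) schedule is an illustrative assumption, not the paper's scaling rule:

```python
import torch
import torch.nn as nn

# Expose architectural structure to Adam via per-layer parameter groups.
model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(),
                      nn.Linear(32, 32), nn.ReLU(),
                      nn.Linear(32, 1))
layers = [m for m in model if isinstance(m, nn.Linear)]
groups = [{"params": layer.parameters(), "lr": 1e-3 / (1 + depth)}
          for depth, layer in enumerate(layers)]
opt = torch.optim.Adam(groups)

x, y = torch.randn(16, 10), torch.randn(16, 1)
loss = nn.functional.mse_loss(model(x), y)
loss.backward()
opt.step()  # deeper layers take proportionally smaller steps
```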
In many cases, the performance of\nARQ quantized networks can reach that of the original DNN with floating-point\nweights, but with only 1.5% of the instructions.\n","authors":["Yuchen Yang","Shubham Ugare","Yifan Zhao","Gagandeep Singh","Sasa Misailovic"],"pdf_url":"https://arxiv.org/pdf/2410.24214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24210v1","updated":"2024-10-31T17:58:41Z","published":"2024-10-31T17:58:41Z","title":"TabM: Advancing Tabular Deep Learning with Parameter-Efficient\n Ensembling","summary":" Deep learning architectures for supervised learning on tabular data range\nfrom simple multilayer perceptrons (MLP) to sophisticated Transformers and\nretrieval-augmented methods. This study highlights a major, yet so far\noverlooked opportunity for substantially improving tabular MLPs: namely,\nparameter-efficient ensembling -- a paradigm for implementing an ensemble of\nmodels as one model producing multiple predictions. We start by developing TabM\n-- a simple model based on MLP and our variations of BatchEnsemble (an existing\ntechnique). Then, we perform a large-scale evaluation of tabular DL\narchitectures on public benchmarks in terms of both task performance and\nefficiency, which renders the landscape of tabular DL in a new light.\nGenerally, we show that MLPs, including TabM, form a line of stronger and more\npractical models compared to attention- and retrieval-based architectures. In\nparticular, we find that TabM demonstrates the best performance among tabular\nDL models. Lastly, we conduct an empirical analysis on the ensemble-like nature\nof TabM. For example, we observe that the multiple predictions of TabM are weak\nindividually, but powerful collectively. Overall, our work brings an impactful\ntechnique to tabular DL, analyses its behaviour, and advances the\nperformance-efficiency trade-off with TabM -- a simple and powerful baseline\nfor researchers and practitioners.\n","authors":["Yury Gorishniy","Akim Kotelnikov","Artem Babenko"],"pdf_url":"https://arxiv.org/pdf/2410.24210v1.pdf","comment":"Code: https://github.com/yandex-research/tabm"},{"id":"http://arxiv.org/abs/2406.15349v2","updated":"2024-10-31T17:58:34Z","published":"2024-06-21T17:59:02Z","title":"NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and\n Benchmarking","summary":" Benchmarking vision-based driving policies is challenging. On one hand,\nopen-loop evaluation with real data is easy, but these results do not reflect\nclosed-loop performance. On the other, closed-loop evaluation is possible in\nsimulation, but is hard to scale due to its significant computational demands.\nFurther, the simulators available today exhibit a large domain gap to real\ndata. This has resulted in an inability to draw clear conclusions from the\nrapidly growing body of research on end-to-end autonomous driving. In this\npaper, we present NAVSIM, a middle ground between these evaluation paradigms,\nwhere we use large datasets in combination with a non-reactive simulator to\nenable large-scale real-world benchmarking. Specifically, we gather\nsimulation-based metrics, such as progress and time to collision, by unrolling\nbird's eye view abstractions of the test scenes for a short simulation horizon.\nOur simulation is non-reactive, i.e., the evaluated policy and environment do\nnot influence each other. As we demonstrate empirically, this decoupling allows\nopen-loop metric computation while being better aligned with closed-loop\nevaluations than traditional displacement errors. 
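Parameter-efficient ensembling as used by TabM above builds on BatchEnsemble: one shared weight matrix plus rank-1 per-member scaling vectors, so k ensemble members share almost all parameters and run in a single forward pass. A sketch of such a layer (sizes and the shared-input usage are illustrative):

```python
import torch
import torch.nn as nn

class BatchEnsembleLinear(nn.Module):
    """Rank-1 BatchEnsemble linear layer: a shared weight plus per-member
    input/output scaling vectors, giving k members for ~1x the parameters."""
    def __init__(self, d_in, d_out, k):
        super().__init__()
        self.shared = nn.Linear(d_in, d_out)
        self.r = nn.Parameter(torch.ones(k, d_in))   # per-member input scale
        self.s = nn.Parameter(torch.ones(k, d_out))  # per-member output scale

    def forward(self, x):  # x: (batch, k, d_in)
        return self.shared(x * self.r) * self.s

layer = BatchEnsembleLinear(8, 4, k=3)
x = torch.randn(32, 1, 8).expand(32, 3, 8)  # same input to all 3 members
preds = layer(x).mean(dim=1)                # aggregate the k predictions
```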
NAVSIM enabled a new\ncompetition held at CVPR 2024, where 143 teams submitted 463 entries, resulting\nin several new insights. On a large set of challenging scenarios, we observe\nthat simple methods with moderate compute requirements such as TransFuser can\nmatch recent large-scale end-to-end driving architectures such as UniAD. Our\nmodular framework can potentially be extended with new datasets, data curation\nstrategies, and metrics, and will be continually maintained to host future\nchallenges. Our code is available at\nhttps://github.com/autonomousvision/navsim.\n","authors":["Daniel Dauner","Marcel Hallgarten","Tianyu Li","Xinshuo Weng","Zhiyu Huang","Zetong Yang","Hongyang Li","Igor Gilitschenski","Boris Ivanovic","Marco Pavone","Andreas Geiger","Kashyap Chitta"],"pdf_url":"https://arxiv.org/pdf/2406.15349v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2410.24206v1","updated":"2024-10-31T17:58:13Z","published":"2024-10-31T17:58:13Z","title":"Understanding Optimization in Deep Learning with Central Flows","summary":" Optimization in deep learning remains poorly understood, even in the simple\nsetting of deterministic (i.e. full-batch) training. A key difficulty is that\nmuch of an optimizer's behavior is implicitly determined by complex oscillatory\ndynamics, referred to as the \"edge of stability.\" The main contribution of this\npaper is to show that an optimizer's implicit behavior can be explicitly\ncaptured by a \"central flow:\" a differential equation which models the\ntime-averaged optimization trajectory. We show that these flows can empirically\npredict long-term optimization trajectories of generic neural networks with a\nhigh degree of numerical accuracy. By interpreting these flows, we reveal for\nthe first time 1) the precise sense in which RMSProp adapts to the local loss\nlandscape, and 2) an \"acceleration via regularization\" mechanism, wherein\nadaptive optimizers implicitly navigate towards low-curvature regions in which\nthey can take larger steps. This mechanism is key to the efficacy of these\nadaptive optimizers. Overall, we believe that central flows constitute a\npromising tool for reasoning about optimization in deep learning.\n","authors":["Jeremy M. Cohen","Alex Damian","Ameet Talwalkar","Zico Kolter","Jason D. Lee"],"pdf_url":"https://arxiv.org/pdf/2410.24206v1.pdf","comment":"first two authors contributed equally; author order determined by\n coin flip"},{"id":"http://arxiv.org/abs/2410.24185v1","updated":"2024-10-31T17:48:45Z","published":"2024-10-31T17:48:45Z","title":"DexMimicGen: Automated Data Generation for Bimanual Dexterous\n Manipulation via Imitation Learning","summary":" Imitation learning from human demonstrations is an effective means to teach\nrobots manipulation skills. But data acquisition is a major bottleneck in\napplying this paradigm more broadly, due to the cost and human effort\ninvolved. There has been significant interest in imitation learning for\nbimanual dexterous robots, like humanoids. Unfortunately, data collection is\neven more challenging here due to the difficulty of simultaneously controlling\nmultiple arms and multi-fingered hands. Automated data generation in simulation\nis a compelling, scalable alternative to fuel this need for data. To this end,\nwe introduce DexMimicGen, a large-scale automated data generation system that\nsynthesizes trajectories from a handful of human demonstrations for humanoid\nrobots with dexterous hands. 
We present a collection of simulation environments\nin the setting of bimanual dexterous manipulation, spanning a range of\nmanipulation behaviors and different requirements for coordination among the\ntwo arms. We generate 21K demos across these tasks from just 60 source human\ndemos and study the effect of several data generation and policy learning\ndecisions on agent performance. Finally, we present a real-to-sim-to-real\npipeline and deploy it on a real-world humanoid can sorting task. Videos and\nmore are at https://dexmimicgen.github.io/\n","authors":["Zhenyu Jiang","Yuqi Xie","Kevin Lin","Zhenjia Xu","Weikang Wan","Ajay Mandlekar","Linxi Fan","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.24185v1.pdf","comment":"Project website: https://dexmimicgen.github.io/"},{"id":"http://arxiv.org/abs/2410.24178v1","updated":"2024-10-31T17:43:53Z","published":"2024-10-31T17:43:53Z","title":"AR-Pro: Counterfactual Explanations for Anomaly Repair with Formal\n Properties","summary":" Anomaly detection is widely used for identifying critical errors and\nsuspicious behaviors, but current methods lack interpretability. We leverage\ncommon properties of existing methods and recent advances in generative models\nto introduce counterfactual explanations for anomaly detection. Given an input,\nwe generate its counterfactual as a diffusion-based repair that shows what a\nnon-anomalous version should have looked like. A key advantage of this approach\nis that it enables a domain-independent formal specification of explainability\ndesiderata, offering a unified framework for generating and evaluating\nexplanations. We demonstrate the effectiveness of our anomaly explainability\nframework, AR-Pro, on vision (MVTec, VisA) and time-series (SWaT, WADI, HAI)\nanomaly datasets. The code used for the experiments is accessible at:\nhttps://github.com/xjiae/arpro.\n","authors":["Xiayan Ji","Anton Xue","Eric Wong","Oleg Sokolsky","Insup Lee"],"pdf_url":"https://arxiv.org/pdf/2410.24178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24177v1","updated":"2024-10-31T17:43:13Z","published":"2024-10-31T17:43:13Z","title":"DC-Spin: A Speaker-invariant Speech Tokenizer for Spoken Language Models","summary":" Spoken language models (SLMs) have gained increasing attention with\nadvancements in text-based, decoder-only language models. SLMs process text and\nspeech, enabling simultaneous speech understanding and generation. This paper\npresents Double-Codebook Speaker-invariant Clustering (DC-Spin), which aims to\nimprove speech tokenization by bridging audio signals and SLM tokens. DC-Spin\nextracts speaker-invariant tokens rich in phonetic information and resilient to\ninput variations, enhancing zero-shot SLM tasks and speech resynthesis. We\npropose a chunk-wise approach to enable streamable DC-Spin without retraining\nand degradation. 
Comparisons of tokenization methods (self-supervised and\nneural audio codecs), model scalability, and downstream task proxies show that\ntokens easily modeled by an n-gram LM or aligned with phonemes offer strong\nperformance, providing insights for designing speech tokenizers for SLMs.\n","authors":["Heng-Jui Chang","Hongyu Gong","Changhan Wang","James Glass","Yu-An Chung"],"pdf_url":"https://arxiv.org/pdf/2410.24177v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.24169v1","updated":"2024-10-31T17:35:57Z","published":"2024-10-31T17:35:57Z","title":"The Importance of Being Scalable: Improving the Speed and Accuracy of\n Neural Network Interatomic Potentials Across Chemical Domains","summary":" Scaling has been critical in improving model performance and generalization\nin machine learning. It involves how a model's performance changes with\nincreases in model size or input data, as well as how efficiently computational\nresources are utilized to support this growth. Despite successes in other\nareas, the study of scaling in Neural Network Interatomic Potentials (NNIPs)\nremains limited. NNIPs act as surrogate models for ab initio quantum mechanical\ncalculations. The dominant paradigm here is to incorporate many physical domain\nconstraints into the model, such as rotational equivariance. We contend that\nthese complex constraints inhibit the scaling ability of NNIPs, and are likely\nto lead to performance plateaus in the long run. In this work, we take an\nalternative approach and start by systematically studying NNIP scaling\nstrategies. Our findings indicate that scaling the model through attention\nmechanisms is efficient and improves model expressivity. These insights\nmotivate us to develop an NNIP architecture designed for scalability: the\nEfficiently Scaled Attention Interatomic Potential (EScAIP). EScAIP leverages a\nmulti-head self-attention formulation within graph neural networks, applying\nattention at the neighbor-level representations. Implemented with\nhighly-optimized attention GPU kernels, EScAIP achieves substantial gains in\nefficiency--at least 10x faster inference, 5x less memory usage--compared to\nexisting NNIPs. EScAIP also achieves state-of-the-art performance on a wide\nrange of datasets including catalysts (OC20 and OC22), molecules (SPICE), and\nmaterials (MPTrj). We emphasize that our approach should be thought of as a\nphilosophy rather than a specific model, representing a proof-of-concept for\ndeveloping general-purpose NNIPs that achieve better expressivity through\nscaling, and continue to scale efficiently with increased computational\nresources and training data.\n","authors":["Eric Qu","Aditi S. Krishnapriyan"],"pdf_url":"https://arxiv.org/pdf/2410.24169v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2403.04690v3","updated":"2024-10-31T17:32:26Z","published":"2024-03-07T17:35:58Z","title":"Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self\n Attention at the Threadblock Level","summary":" Neighborhood attention reduces the cost of self attention by restricting each\ntoken's attention span to its nearest neighbors. 
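Neighborhood attention as just defined restricts each query to keys within a local window, optionally dilated. A naive masked reference implementation for the 1-D case (the paper's kernels compute the same pattern without materializing the full score matrix; the banded mask here also simplifies true nearest-neighbor handling at sequence edges):

```python
import torch

def neighborhood_attention_1d(q, k, v, window=3, dilation=1):
    """1-D neighborhood attention via a banded mask over ordinary
    dot-product attention. `window` is the radius in (dilated) positions."""
    seq = q.shape[-2]
    idx = torch.arange(seq)
    dist = (idx[:, None] - idx[None, :]).abs()
    # Attend only to positions within the window, stepping by `dilation`.
    mask = (dist <= window * dilation) & (dist % dilation == 0)
    scores = q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5)
    scores = scores.masked_fill(~mask, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v

q = k = v = torch.randn(1, 10, 16)
out = neighborhood_attention_1d(q, k, v, window=2)
```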
This restriction,\nparameterized by a window size and dilation factor, draws a spectrum of\npossible attention patterns between linear projection and self attention.\nNeighborhood attention, and more generally sliding window attention patterns,\nhave long been bounded by infrastructure, particularly in higher-rank spaces\n(2-D and 3-D), calling for the development of custom kernels, which have been\nlimited in either functionality, or performance, if not both. In this work, we\naim to massively improve upon existing infrastructure by providing two new\nmethods for implementing neighborhood attention. We first show that\nneighborhood attention can be represented as a batched GEMM problem, similar to\nstandard attention, and implement it for 1-D and 2-D neighborhood attention.\nThese kernels on average provide 895% and 272% improvement in full precision\nruntime compared to existing naive CUDA kernels for 1-D and 2-D neighborhood\nattention respectively. We find that aside from being heavily bound by memory\nbandwidth, certain inherent inefficiencies exist in all unfused implementations\nof neighborhood attention, which in most cases undo their theoretical\nefficiency gain. Motivated by the progress made into fused dot-product\nattention kernels, we developed fused neighborhood attention; an adaptation of\nfused dot-product attention kernels that allow fine-grained control over\nattention across different spatial axes. Known for reducing the quadratic time\ncomplexity of self attention to a linear complexity, neighborhood attention can\nnow enjoy a reduced and constant memory footprint, and record-breaking half\nprecision runtime. We observe that our fused implementation successfully\ncircumvents some of the unavoidable inefficiencies in unfused\nimplementations...\n","authors":["Ali Hassani","Wen-Mei Hwu","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2403.04690v3.pdf","comment":"To appear in 38th Conference on Neural Information Processing Systems\n (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2312.17640v3","updated":"2024-10-31T17:29:42Z","published":"2023-12-29T15:05:00Z","title":"Decision-focused predictions via pessimistic bilevel optimization: a\n computational study","summary":" Dealing with uncertainty in optimization parameters is an important and\nlongstanding challenge. Typically, uncertain parameters are predicted\naccurately, and then a deterministic optimization problem is solved. However,\nthe decisions produced by this so-called \\emph{predict-then-optimize} procedure\ncan be highly sensitive to uncertain parameters. In this work, we contribute to\nrecent efforts in producing \\emph{decision-focused} predictions, i.e., to build\npredictive models that are constructed with the goal of minimizing a\n\\emph{regret} measure on the decisions taken with them. We begin by formulating\nthe exact expected regret minimization as a pessimistic bilevel optimization\nmodel. Then, we establish NP-completeness of this problem, even in a heavily\nrestricted case. Using duality arguments, we reformulate it as a non-convex\nquadratic optimization problem. Finally, we show various computational\ntechniques to achieve tractability. We report extensive computational results\non shortest-path instances with uncertain cost vectors. 
Our results indicate\nthat our approach can improve training performance over the approach of\nElmachtoub and Grigas (2022), a state-of-the-art method for decision-focused\nlearning.\n","authors":["Víctor Bucarey","Sophia Calderón","Gonzalo Muñoz","Frederic Semet"],"pdf_url":"https://arxiv.org/pdf/2312.17640v3.pdf","comment":"We state in this version that: \"To the best of our knowledge, no\n hardness result for computing a regret-minimizing linear regression in this\n context is known\". However, in Elmachtoub and Grigas 2022, they show that\n this is clearly a generalization of the 0-1 loss, that is NP-hard"},{"id":"http://arxiv.org/abs/2406.10796v2","updated":"2024-10-31T17:29:37Z","published":"2024-06-16T03:45:03Z","title":"Ab Initio Structure Solutions from Nanocrystalline Powder Diffraction\n Data","summary":" A major challenge in materials science is the determination of the structure\nof nanometer sized objects. Here we present a novel approach that uses a\ngenerative machine learning model based on diffusion processes that is trained\non 45,229 known structures. The model factors both the measured diffraction\npattern as well as relevant statistical priors on the unit cell of atomic\ncluster structures. Conditioned only on the chemical formula and the\ninformation-scarce finite-size broadened powder diffraction pattern, we find\nthat our model, PXRDnet, can successfully solve simulated nanocrystals as small\nas 10 angstroms across 200 materials of varying symmetry and complexity,\nincluding structures from all seven crystal systems. We show that our model can\nsuccessfully and verifiably determine structural candidates four out of five\ntimes, with average error among these candidates being only 7% (as measured by\npost-Rietveld refinement R-factor). Furthermore, PXRDnet is capable of solving\nstructures from noisy diffraction patterns gathered in real-world experiments.\nWe suggest that data driven approaches, bootstrapped from theoretical\nsimulation, will ultimately provide a path towards determining the structure of\npreviously unsolved nano-materials.\n","authors":["Gabe Guo","Tristan Saidi","Maxwell Terban","Michele Valsecchi","Simon JL Billinge","Hod Lipson"],"pdf_url":"https://arxiv.org/pdf/2406.10796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24166v1","updated":"2024-10-31T17:28:41Z","published":"2024-10-31T17:28:41Z","title":"Approaches to human activity recognition via passive radar","summary":" The thesis explores novel methods for Human Activity Recognition (HAR) using\npassive radar with a focus on non-intrusive Wi-Fi Channel State Information\n(CSI) data. Traditional HAR approaches often use invasive sensors like cameras\nor wearables, raising privacy issues. This study leverages the non-intrusive\nnature of CSI, using Spiking Neural Networks (SNN) to interpret signal\nvariations caused by human movements. These networks, integrated with symbolic\nreasoning frameworks such as DeepProbLog, enhance the adaptability and\ninterpretability of HAR systems. SNNs offer reduced power consumption, ideal\nfor privacy-sensitive applications. 
Experimental results demonstrate SNN-based\nneurosymbolic models achieve high accuracy making them a promising alternative\nfor HAR across various domains.\n","authors":["Christian Bresciani","Federico Cerutti","Marco Cominelli"],"pdf_url":"https://arxiv.org/pdf/2410.24166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24164v1","updated":"2024-10-31T17:22:30Z","published":"2024-10-31T17:22:30Z","title":"$π_0$: A Vision-Language-Action Flow Model for General Robot Control","summary":" Robot learning holds tremendous promise to unlock the full potential of\nflexible, general, and dexterous robot systems, as well as to address some of\nthe deepest questions in artificial intelligence. However, bringing robot\nlearning to the level of generality required for effective real-world systems\nfaces major obstacles in terms of data, generalization, and robustness. In this\npaper, we discuss how generalist robot policies (i.e., robot foundation models)\ncan address these challenges, and how we can design effective generalist robot\npolicies for complex and highly dexterous tasks. We propose a novel flow\nmatching architecture built on top of a pre-trained vision-language model (VLM)\nto inherit Internet-scale semantic knowledge. We then discuss how this model\ncan be trained on a large and diverse dataset from multiple dexterous robot\nplatforms, including single-arm robots, dual-arm robots, and mobile\nmanipulators. We evaluate our model in terms of its ability to perform tasks in\nzero shot after pre-training, follow language instructions from people and from\na high-level VLM policy, and its ability to acquire new skills via fine-tuning.\nOur results cover a wide variety of tasks, such as laundry folding, table\ncleaning, and assembling boxes.\n","authors":["Kevin Black","Noah Brown","Danny Driess","Adnan Esmail","Michael Equi","Chelsea Finn","Niccolo Fusai","Lachy Groom","Karol Hausman","Brian Ichter","Szymon Jakubczak","Tim Jones","Liyiming Ke","Sergey Levine","Adrian Li-Bell","Mohith Mothukuri","Suraj Nair","Karl Pertsch","Lucy Xiaoyang Shi","James Tanner","Quan Vuong","Anna Walling","Haohuan Wang","Ury Zhilinsky"],"pdf_url":"https://arxiv.org/pdf/2410.24164v1.pdf","comment":"See project website for videos:\n https://physicalintelligence.company/blog/pi0"},{"id":"http://arxiv.org/abs/2410.24162v1","updated":"2024-10-31T17:20:13Z","published":"2024-10-31T17:20:13Z","title":"Conformalized Prediction of Post-Fault Voltage Trajectories Using\n Pre-trained and Finetuned Attention-Driven Neural Operators","summary":" This paper proposes a new data-driven methodology for predicting intervals of\npost-fault voltage trajectories in power systems. We begin by introducing the\nQuantile Attention-Fourier Deep Operator Network (QAF-DeepONet), designed to\ncapture the complex dynamics of voltage trajectories and reliably estimate\nquantiles of the target trajectory without any distributional assumptions. The\nproposed operator regression model maps the observed portion of the voltage\ntrajectory to its unobserved post-fault trajectory. Our methodology employs a\npre-training and fine-tuning process to address the challenge of limited data\navailability. To ensure data privacy in learning the pre-trained model, we use\nmerging via federated learning with data from neighboring buses, enabling the\nmodel to learn the underlying voltage dynamics from such buses without directly\nsharing their data. 
After pre-training, we fine-tune the model with data from\nthe target bus, allowing it to adapt to unique dynamics and operating\nconditions. Finally, we integrate conformal prediction into the fine-tuned\nmodel to ensure coverage guarantees for the predicted intervals. We evaluated\nthe performance of the proposed methodology using the New England 39-bus test\nsystem considering detailed models of voltage and frequency controllers. Two\nmetrics, Prediction Interval Coverage Probability (PICP) and Prediction\nInterval Normalized Average Width (PINAW), are used to numerically assess the\nmodel's performance in predicting intervals. The results show that the proposed\napproach offers practical and reliable uncertainty quantification in predicting\nthe interval of post-fault voltage trajectories.\n","authors":["Amirhossein Mollaali","Gabriel Zufferey","Gonzalo Constante-Flores","Christian Moya","Can Li","Guang Lin","Meng Yue"],"pdf_url":"https://arxiv.org/pdf/2410.24162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12747v2","updated":"2024-10-31T17:18:16Z","published":"2024-06-18T16:07:33Z","title":"TSI-Bench: Benchmarking Time Series Imputation","summary":" Effective imputation is a crucial preprocessing step for time series\nanalysis. Despite the development of numerous deep learning algorithms for time\nseries imputation, the community lacks standardized and comprehensive benchmark\nplatforms to effectively evaluate imputation performance across different\nsettings. Moreover, although many deep learning forecasting algorithms have\ndemonstrated excellent performance, whether their modelling achievements can be\ntransferred to time series imputation tasks remains unexplored. To bridge these\ngaps, we develop TSI-Bench, the first (to our knowledge) comprehensive\nbenchmark suite for time series imputation utilizing deep learning techniques.\nThe TSI-Bench pipeline standardizes experimental settings to enable fair\nevaluation of imputation algorithms and identification of meaningful insights\ninto the influence of domain-appropriate missing rates and patterns on model\nperformance. Furthermore, TSI-Bench innovatively provides a systematic paradigm\nto tailor time series forecasting algorithms for imputation purposes. Our\nextensive study across 34,804 experiments, 28 algorithms, and 8 datasets with\ndiverse missingness scenarios demonstrates TSI-Bench's effectiveness in diverse\ndownstream tasks and potential to unlock future directions in time series\nimputation research and analysis. All source code and experiment logs are\nreleased at https://github.com/WenjieDu/AwesomeImputation.\n","authors":["Wenjie Du","Jun Wang","Linglong Qian","Yiyuan Yang","Zina Ibrahim","Fanxing Liu","Zepu Wang","Haoxin Liu","Zhiyuan Zhao","Yingjie Zhou","Wenjia Wang","Kaize Ding","Yuxuan Liang","B. Aditya Prakash","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2406.12747v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22382v2","updated":"2024-10-31T17:12:27Z","published":"2024-10-29T12:54:55Z","title":"Debiasing Alternative Data for Credit Underwriting Using Causal\n Inference","summary":" Alternative data provides valuable insights for lenders to evaluate a\nborrower's creditworthiness, which could help expand credit access to\nunderserved groups and lower costs for borrowers. But some forms of alternative\ndata have historically been excluded from credit underwriting because it could\nact as an illegal proxy for a protected class like race or gender, causing\nredlining. 
We propose a method for applying causal inference to a supervised\nmachine learning model to debias alternative data so that it might be used for\ncredit underwriting. We demonstrate how our algorithm can be used against a\npublic credit dataset to improve model accuracy across different racial groups,\nwhile providing theoretically robust nondiscrimination guarantees.\n","authors":["Chris Lam"],"pdf_url":"https://arxiv.org/pdf/2410.22382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24153v1","updated":"2024-10-31T17:10:57Z","published":"2024-10-31T17:10:57Z","title":"Dense Associative Memory Through the Lens of Random Features","summary":" Dense Associative Memories are high storage capacity variants of the Hopfield\nnetworks that are capable of storing a large number of memory patterns in the\nweights of the network of a given size. Their common formulations typically\nrequire storing each pattern in a separate set of synaptic weights, which leads\nto the increase of the number of synaptic weights when new patterns are\nintroduced. In this work we propose an alternative formulation of this class of\nmodels using random features, commonly used in kernel methods. In this\nformulation the number of network's parameters remains fixed. At the same time,\nnew memories can be added to the network by modifying existing weights. We show\nthat this novel network closely approximates the energy function and dynamics\nof conventional Dense Associative Memories and shares their desirable\ncomputational properties.\n","authors":["Benjamin Hoover","Duen Horng Chau","Hendrik Strobelt","Parikshit Ram","Dmitry Krotov"],"pdf_url":"https://arxiv.org/pdf/2410.24153v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24145v1","updated":"2024-10-31T17:05:52Z","published":"2024-10-31T17:05:52Z","title":"Conformal prediction of circular data","summary":" Split conformal prediction techniques are applied to regression problems with\ncircular responses by introducing a suitable conformity score, leading to\nprediction sets with adaptive arc length and finite-sample coverage guarantees\nfor any circular predictive model under exchangeable data. Leveraging the high\nperformance of existing predictive models designed for linear responses, we\nanalyze a general projection procedure that converts any linear response\nregression model into one suitable for circular responses. When random forests\nserve as basis models in this projection procedure, we harness the out-of-bag\ndynamics to eliminate the necessity for a separate calibration sample in the\nconstruction of prediction sets. For synthetic and real datasets the resulting\nprojected random forests model produces more efficient out-of-bag conformal\nprediction sets, with shorter median arc length, when compared to the split\nconformal prediction sets generated by two existing alternative models.\n","authors":["Paulo C. Marques F.","Rinaldo Artes","Helton Graziadei"],"pdf_url":"https://arxiv.org/pdf/2410.24145v1.pdf","comment":"7 pages; 4 figures"},{"id":"http://arxiv.org/abs/2402.07963v3","updated":"2024-10-31T17:05:49Z","published":"2024-02-12T10:32:47Z","title":"SPO: Sequential Monte Carlo Policy Optimisation","summary":" Leveraging planning during learning and decision-making is central to the\nlong-term development of intelligent agents. Recent works have successfully\ncombined tree-based search methods and self-play learning mechanisms to this\nend. 
However, these methods typically face scaling challenges due to the\nsequential nature of their search. While practical engineering solutions can\npartly overcome this, they often result in a negative impact on performance. In\nthis paper, we introduce SPO: Sequential Monte Carlo Policy Optimisation, a\nmodel-based reinforcement learning algorithm grounded within the Expectation\nMaximisation (EM) framework. We show that SPO provides robust policy\nimprovement and efficient scaling properties. The sample-based search makes it\ndirectly applicable to both discrete and continuous action spaces without\nmodifications. We demonstrate statistically significant improvements in\nperformance relative to model-free and model-based baselines across both\ncontinuous and discrete environments. Furthermore, the parallel nature of SPO's\nsearch enables effective utilisation of hardware accelerators, yielding\nfavourable scaling laws.\n","authors":["Matthew V Macfarlane","Edan Toledo","Donal Byrne","Paul Duckworth","Alexandre Laterre"],"pdf_url":"https://arxiv.org/pdf/2402.07963v3.pdf","comment":"Accepted to NeurIPS 2024. 34 pages, 3 main figures"},{"id":"http://arxiv.org/abs/2402.18551v2","updated":"2024-10-31T17:01:45Z","published":"2024-02-28T18:34:53Z","title":"Implicit Optimization Bias of Next-Token Prediction in Linear Models","summary":" We initiate an investigation into the optimization properties of next-token\nprediction (NTP), the dominant training paradigm for modern language models.\nSpecifically, we study the structural properties of the solutions selected by\ngradient-based optimizers among the many possible minimizers of the NTP\nobjective. By framing NTP as cross-entropy minimization across distinct\ncontexts, each tied with a sparse conditional probability distribution across a\nfinite vocabulary of tokens, we introduce \"NTP-separability conditions\" that\nenable reaching the data-entropy lower bound. With this setup, and focusing on\nlinear models with fixed context embeddings, we characterize the optimization\nbias of gradient descent (GD): Within the data subspace defined by the sparsity\npatterns of distinct contexts, GD selects parameters that equate the logits'\ndifferences of in-support tokens to their log-odds. In the orthogonal subspace,\nthe GD parameters diverge in norm and select the direction that maximizes a\nmargin specific to NTP. These findings extend previous research on implicit\nbias in one-hot classification to the NTP setting, highlighting key differences\nand prompting further research into the optimization and generalization\nproperties of NTP, irrespective of the specific architecture used to generate\nthe context embeddings.\n","authors":["Christos Thrampoulidis"],"pdf_url":"https://arxiv.org/pdf/2402.18551v2.pdf","comment":"v2: fixed typos and writing in various parts; updated figures and\n future-work section"},{"id":"http://arxiv.org/abs/2407.01079v3","updated":"2024-10-31T16:59:13Z","published":"2024-07-01T08:34:40Z","title":"On Statistical Rates and Provably Efficient Criteria of Latent Diffusion\n Transformers (DiTs)","summary":" We investigate the statistical and computational limits of latent Diffusion\nTransformers (DiTs) under the low-dimensional linear latent space assumption.\nStatistically, we study the universal approximation and sample complexity of\nthe DiTs score function, as well as the distribution recovery property of the\ninitial data. 
Specifically, under mild data assumptions, we derive an\napproximation error bound for the score network of latent DiTs, which is\nsub-linear in the latent space dimension. Additionally, we derive the\ncorresponding sample complexity bound and show that the data distribution\ngenerated from the estimated score function converges toward a proximate area\nof the original one. Computationally, we characterize the hardness of both\nforward inference and backward computation of latent DiTs, assuming the Strong\nExponential Time Hypothesis (SETH). For forward inference, we identify\nefficient criteria for all possible latent DiTs inference algorithms and\nshowcase our theory by pushing the efficiency toward almost-linear time\ninference. For backward computation, we leverage the low-rank structure within\nthe gradient computation of DiTs training for possible algorithmic speedup.\nSpecifically, we show that such speedup achieves almost-linear time latent DiTs\ntraining by casting the DiTs gradient as a series of chained low-rank\napproximations with bounded error. Under the low-dimensional assumption, we\nshow that the statistical rates and the computational efficiency are all\ndominated by the dimension of the subspace, suggesting that latent DiTs have\nthe potential to bypass the challenges associated with the high dimensionality\nof initial data.\n","authors":["Jerry Yao-Chieh Hu","Weimin Wu","Zhao Song","Han Liu"],"pdf_url":"https://arxiv.org/pdf/2407.01079v3.pdf","comment":"Accepted at NeurIPS 2024. v3 updated to camera-ready version with\n many typos fixed; v2 fixed typos, added Fig. 1 and added clarifications"}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.23883v1","updated":"2024-10-31T12:45:54Z","published":"2024-10-31T12:45:54Z","title":"'No' Matters: Out-of-Distribution Detection in Multimodality Long\n Dialogue","summary":" Out-of-distribution (OOD) detection in multimodal contexts is essential for\nidentifying deviations in combined inputs from different modalities,\nparticularly in applications like open-domain dialogue systems or real-life\ndialogue interactions. This paper aims to improve the user experience that\ninvolves multi-round long dialogues by efficiently detecting OOD dialogues and\nimages. We introduce a novel scoring framework named Dialogue Image Aligning\nand Enhancing Framework (DIAEF) that integrates the visual language models with\nthe novel proposed scores that detect OOD in two key scenarios (1) mismatches\nbetween the dialogue and image input pair and (2) input pairs with previously\nunseen labels. Our experimental results, derived from various benchmarks,\ndemonstrate that integrating image and multi-round dialogue OOD detection is\nmore effective with previously unseen labels than using either modality\nindependently. In the presence of mismatched pairs, our proposed score\neffectively identifies these mismatches and demonstrates strong robustness in\nlong dialogues. 
This approach enhances domain-aware, adaptive conversational\nagents and establishes baselines for future studies.\n","authors":["Rena Gao","Xuetong Wu","Siwen Luo","Caren Han","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23883v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.23861v1","updated":"2024-10-31T12:11:17Z","published":"2024-10-31T12:11:17Z","title":"Audio Is the Achilles' Heel: Red Teaming Audio Large Multimodal Models","summary":" Large Multimodal Models (LMMs) have demonstrated the ability to interact with\nhumans under real-world conditions by combining Large Language Models (LLMs)\nand modality encoders to align multimodal information (visual and auditory)\nwith text. However, such models raise new safety challenges of whether models\nthat are safety-aligned on text also exhibit consistent safeguards for\nmultimodal inputs. Despite recent safety-alignment research on vision LMMs, the\nsafety of audio LMMs remains under-explored. In this work, we comprehensively\nred team the safety of five advanced audio LMMs under three settings: (i)\nharmful questions in both audio and text formats, (ii) harmful questions in\ntext format accompanied by distracting non-speech audio, and (iii)\nspeech-specific jailbreaks. Our results under these settings demonstrate that\nopen-source audio LMMs suffer an average attack success rate of 69.14% on\nharmful audio questions, and exhibit safety vulnerabilities when distracted\nwith non-speech audio noise. Our speech-specific jailbreaks on Gemini-1.5-Pro\nachieve an attack success rate of 70.67% on the harmful query benchmark. We\nprovide insights on what could cause these reported safety-misalignments.\nWarning: this paper contains offensive examples.\n","authors":["Hao Yang","Lizhen Qu","Ehsan Shareghi","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2410.23861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23663v1","updated":"2024-10-31T06:26:00Z","published":"2024-10-31T06:26:00Z","title":"DIP: Diffusion Learning of Inconsistency Pattern for General DeepFake\n Detection","summary":" With the advancement of deepfake generation techniques, the importance of\ndeepfake detection in protecting multimedia content integrity has become\nincreasingly obvious. Recently, temporal inconsistency clues have been explored\nto improve the generalizability of deepfake video detection. According to our\nobservation, the temporal artifacts of forged videos in terms of motion\ninformation usually exhibits quite distinct inconsistency patterns along\nhorizontal and vertical directions, which could be leveraged to improve the\ngeneralizability of detectors. In this paper, a transformer-based framework for\nDiffusion Learning of Inconsistency Pattern (DIP) is proposed, which exploits\ndirectional inconsistencies for deepfake video detection. Specifically, DIP\nbegins with a spatiotemporal encoder to represent spatiotemporal information. A\ndirectional inconsistency decoder is adopted accordingly, where direction-aware\nattention and inconsistency diffusion are incorporated to explore potential\ninconsistency patterns and jointly learn the inherent relationships. In\naddition, the SpatioTemporal Invariant Loss (STI Loss) is introduced to\ncontrast spatiotemporally augmented sample pairs and prevent the model from\noverfitting nonessential forgery artifacts. 
Extensive experiments on several\npublic datasets demonstrate that our method could effectively identify\ndirectional forgery clues and achieve state-of-the-art performance.\n","authors":["Fan Nie","Jiangqun Ni","Jian Zhang","Bin Zhang","Weizhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.23663v1.pdf","comment":"13 pages, accepted with IEEE Trans. on Multimedia"},{"id":"http://arxiv.org/abs/2410.23230v2","updated":"2024-10-31T04:20:22Z","published":"2024-10-30T17:18:53Z","title":"Aligning Audio-Visual Joint Representations with an Agentic Workflow","summary":" Visual content and accompanied audio signals naturally formulate a joint\nrepresentation to improve audio-visual (AV) related applications. While studies\ndevelop various AV representation learning frameworks, the importance of AV\ndata alignment is usually undermined for achieving high-quality representation.\nWe observe that an audio signal may contain background noise interference.\nAlso, non-synchronization may appear between audio and video streams. These\nnon-strict data alignment limits representation quality and downgrade\napplication performance. In this paper, we propose to improve AV joint\nrepresentations from a data-centric perspective by aligning audio signals to\nvisual data. Our alignment is conducted in an agentic workflow controlled by an\nLLM-based assistant named AVAgent. For each input AV data pair, our AVAgent\nuses a multi-modal LLM to convert audio and visual data into language\ndescriptions separately (i.e., tool use). Then, AVAgent reasons whether this\npaired data is aligned well and plans to edit the audio signal if needed (i.e.,\nplanning). The audio editing is executed by predefined actions that filter\nnoise or augment data. Moreover, we use a VLM to evaluate how modified audio\nsignals match the visual content and provide feedback to AVAgent (i.e.,\nreflection). The tool use, planning, and reflection steps operate cyclically to\nbecome an agentic workflow where audio signals are gradually aligned to visual\ncontent. To this end, existing methods can directly leverage the aligned AV\ndata via our agentic workflow to improve AV joint representations. The\nexperimental results comprehensively demonstrate the state-of-the-art\nperformance of the proposed approach against previous baselines in diverse\ndownstream tasks.\n","authors":["Shentong Mo","Yibing Song"],"pdf_url":"https://arxiv.org/pdf/2410.23230v2.pdf","comment":null}]},"2024-10-30T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.16408v2","updated":"2024-10-30T22:35:58Z","published":"2024-09-24T19:17:15Z","title":"Modern Hopfield Networks meet Encoded Neural Representations --\n Addressing Practical Considerations","summary":" Content-addressable memories such as Modern Hopfield Networks (MHN) have been\nstudied as mathematical models of auto-association and storage/retrieval in the\nhuman declarative memory, yet their practical use for large-scale content\nstorage faces challenges. Chief among them is the occurrence of meta-stable\nstates, particularly when handling large amounts of high dimensional content.\nThis paper introduces Hopfield Encoding Networks (HEN), a framework that\nintegrates encoded neural representations into MHNs to improve pattern\nseparability and reduce meta-stable states. We show that HEN can also be used\nfor retrieval in the context of hetero association of images with natural\nlanguage queries, thus removing the limitation of requiring access to partial\ncontent in the same domain. 
Experimental results demonstrate substantial\nreduction in meta-stable states and increased storage capacity while still\nenabling perfect recall of a significantly larger number of inputs advancing\nthe practical utility of associative memory networks for real-world tasks.\n","authors":["Satyananda Kashyap","Niharika S. D'Souza","Luyao Shi","Ken C. L. Wong","Hongzhi Wang","Tanveer Syeda-Mahmood"],"pdf_url":"https://arxiv.org/pdf/2409.16408v2.pdf","comment":"17 pages, 8 figures, accepted as a workshop paper at UniReps @\n Neurips 2024"},{"id":"http://arxiv.org/abs/2410.23437v1","updated":"2024-10-30T20:28:10Z","published":"2024-10-30T20:28:10Z","title":"Mind the Gap: A Generalized Approach for Cross-Modal Embedding Alignment","summary":" Retrieval-Augmented Generation (RAG) systems enhance text generation by\nincorporating external knowledge but often struggle when retrieving context\nacross different text modalities due to semantic gaps. We introduce a\ngeneralized projection-based method, inspired by adapter modules in transfer\nlearning, that efficiently bridges these gaps between various text types, such\nas programming code and pseudocode, or English and French sentences. Our\napproach emphasizes speed, accuracy, and data efficiency, requiring minimal\nresources for training and inference. By aligning embeddings from heterogeneous\ntext modalities into a unified space through a lightweight projection network,\nour model significantly outperforms traditional retrieval methods like the\nOkapi BM25 algorithm and models like Dense Passage Retrieval (DPR), while\napproaching the accuracy of Sentence Transformers. Extensive evaluations\ndemonstrate the effectiveness and generalizability of our method across\ndifferent tasks, highlighting its potential for real-time, resource-constrained\napplications.\n","authors":["Arihan Yadav","Alan McMillan"],"pdf_url":"https://arxiv.org/pdf/2410.23437v1.pdf","comment":"18 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.10725v3","updated":"2024-10-30T19:42:57Z","published":"2024-05-17T12:15:07Z","title":"INDUS: Effective and Efficient Language Models for Scientific\n Applications","summary":" Large language models (LLMs) trained on general domain corpora showed\nremarkable results on natural language processing (NLP) tasks. However,\nprevious research demonstrated LLMs trained using domain-focused corpora\nperform better on specialized tasks. Inspired by this insight, we developed\nINDUS, a comprehensive suite of LLMs tailored for the closely-related domains\nof Earth science, biology, physics, heliophysics, planetary sciences and\nastrophysics, and trained using curated scientific corpora drawn from diverse\ndata sources. The suite of models include: (1) an encoder model trained using\ndomain-specific vocabulary and corpora to address NLP tasks, (2) a\ncontrastive-learning based text embedding model trained using a diverse set of\ndatasets to address information retrieval tasks and (3) smaller versions of\nthese models created using knowledge distillation for applications which have\nlatency or resource constraints. We also created three new scientific benchmark\ndatasets, CLIMATE-CHANGE NER (entity-recognition), NASA-QA (extractive QA) and\nNASA-IR (IR) to accelerate research in these multi-disciplinary fields. We show\nthat our models outperform both general-purpose (RoBERTa) and domain-specific\n(SCIBERT) encoders on these new tasks as well as existing tasks in the domains\nof interest. 
Furthermore, we demonstrate the use of these models in two\nindustrial settings -- as a retrieval model for large-scale vector search\napplications and in automatic content tagging systems.\n","authors":["Bishwaranjan Bhattacharjee","Aashka Trivedi","Masayasu Muraoka","Muthukumaran Ramasubramanian","Takuma Udagawa","Iksha Gurung","Nishan Pantha","Rong Zhang","Bharath Dandala","Rahul Ramachandran","Manil Maskey","Kaylin Bugbee","Mike Little","Elizabeth Fancher","Irina Gerasimov","Armin Mehrabian","Lauren Sanders","Sylvain Costes","Sergi Blanco-Cuaresma","Kelly Lockhart","Thomas Allen","Felix Grezes","Megan Ansdell","Alberto Accomazzi","Yousef El-Kurdi","Davis Wertheimer","Birgit Pfitzmann","Cesar Berrospi Ramis","Michele Dolfi","Rafael Teixeira de Lima","Panagiotis Vagenas","S. Karthik Mukkavilli","Peter Staar","Sanaz Vahidinia","Ryan McGranaghan","Tsendgar Lee"],"pdf_url":"https://arxiv.org/pdf/2405.10725v3.pdf","comment":"EMNLP 2024 (Industry Track)"},{"id":"http://arxiv.org/abs/2402.13959v3","updated":"2024-10-30T17:16:49Z","published":"2024-02-21T17:41:17Z","title":"Retention Induced Biases in a Recommendation System with Heterogeneous\n Users","summary":" I examine a conceptual model of a recommendation system (RS) with user inflow\nand churn dynamics. When inflow and churn balance out, the user distribution\nreaches a steady state. Changing the recommendation algorithm alters the steady\nstate and creates a transition period. During this period, the RS behaves\ndifferently from its new steady state. In particular, A/B experiment metrics\nobtained in transition periods are biased indicators of the RS's long-term\nperformance. Scholars and practitioners, however, often conduct A/B tests\nshortly after introducing new algorithms to validate their effectiveness. This\nA/B experiment paradigm, widely regarded as the gold standard for assessing RS\nimprovements, may consequently yield false conclusions. I also briefly touch on\nthe data bias caused by the user retention dynamics.\n","authors":["Shichao Ma"],"pdf_url":"https://arxiv.org/pdf/2402.13959v3.pdf","comment":"This preprint has not undergone peer review (when applicable) or any\n post-submission improvements or corrections. The Version of Record of this\n contribution is published in advances in Bias and Fairness in Information\n Retrieval. BIAS 2024. Communications in Computer and Information Science, vol\n 2227. Springer, and is available online at\n https://doi.org/10.1007/978-3-031-71975-2_2"},{"id":"http://arxiv.org/abs/2410.23180v1","updated":"2024-10-30T16:37:04Z","published":"2024-10-30T16:37:04Z","title":"ReasoningRec: Bridging Personalized Recommendations and\n Human-Interpretable Explanations through LLM Reasoning","summary":" This paper presents ReasoningRec, a reasoning-based recommendation framework\nthat leverages Large Language Models (LLMs) to bridge the gap between\nrecommendations and human-interpretable explanations. In contrast to\nconventional recommendation systems that rely on implicit user-item\ninteractions, ReasoningRec employs LLMs to model users and items, focusing on\npreferences, aversions, and explanatory reasoning. The framework utilizes a\nlarger LLM to generate synthetic explanations for user preferences,\nsubsequently used to fine-tune a smaller LLM for enhanced recommendation\naccuracy and human-interpretable explanation. 
Our experimental study\ninvestigates the impact of reasoning and contextual information on personalized\nrecommendations, revealing that the quality of contextual and personalized data\nsignificantly influences the LLM's capacity to generate plausible explanations.\nEmpirical evaluations demonstrate that ReasoningRec surpasses state-of-the-art\nmethods by up to 12.5\\% in recommendation prediction while concurrently\nproviding human-intelligible explanations. The code is available here:\nhttps://github.com/millenniumbismay/reasoningrec.\n","authors":["Millennium Bismay","Xiangjue Dong","James Caverlee"],"pdf_url":"https://arxiv.org/pdf/2410.23180v1.pdf","comment":"Large Language Model, Recommendation, Human-Interpretable Reasoning,\n Personalization"},{"id":"http://arxiv.org/abs/2410.23166v1","updated":"2024-10-30T16:18:22Z","published":"2024-10-30T16:18:22Z","title":"SciPIP: An LLM-based Scientific Paper Idea Proposer","summary":" The exponential growth of knowledge and the increasing complexity of\ninterdisciplinary research pose significant challenges for researchers,\nincluding information overload and difficulties in exploring novel ideas. The\nadvancements in large language models (LLMs), such as GPT-4, have shown great\npotential in enhancing idea proposals, but how to effectively utilize large\nmodels for reasonable idea proposal has not been thoroughly explored. This\npaper proposes a scientific paper idea proposer (SciPIP). Based on a\nuser-provided research background, SciPIP retrieves helpful papers from a\nliterature database while leveraging the capabilities of LLMs to generate more\nnovel and feasible ideas. To this end, 1) we construct a literature retrieval\ndatabase, extracting lots of papers' multi-dimension information for fast\naccess. Then, a literature retrieval method based on semantics, entity, and\ncitation co-occurrences is proposed to search relevant literature from multiple\naspects based on the user-provided background. 2) After literature retrieval,\nwe introduce dual-path idea proposal strategies, where one path infers\nsolutions from the retrieved literature and the other path generates original\nideas through model brainstorming. We then combine the two to achieve a good\nbalance between feasibility and originality. Through extensive experiments on\nthe natural language processing (NLP) field, we demonstrate that SciPIP can\nretrieve citations similar to those of existing top conference papers and\ngenerate many ideas consistent with them. Additionally, we evaluate the\noriginality of other ideas generated by SciPIP using large language models,\nfurther validating the effectiveness of our proposed method. The code and the\ndatabase are released at https://github.com/cheerss/SciPIP.\n","authors":["Wenxiao Wang","Lihui Gu","Liye Zhang","Yunxiang Luo","Yi Dai","Chen Shen","Liang Xie","Binbin Lin","Xiaofei He","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2410.23166v1.pdf","comment":"25 pages, 5 figures, 19 tables"},{"id":"http://arxiv.org/abs/2410.23136v1","updated":"2024-10-30T15:48:36Z","published":"2024-10-30T15:48:36Z","title":"Real-Time Personalization for LLM-based Recommendation with Customized\n In-Context Learning","summary":" Frequently updating Large Language Model (LLM)-based recommender systems to\nadapt to new user interests -- as done for traditional ones -- is impractical\ndue to high training costs, even with acceleration methods. 
This work explores\nadapting to dynamic user interests without any model updates by leveraging\nIn-Context Learning (ICL), which allows LLMs to learn new tasks from few-shot\nexamples provided in the input. Using new-interest examples as the ICL few-shot\nexamples, LLMs may learn real-time interest directly, avoiding the need for\nmodel updates. However, existing LLM-based recommenders often lose the\nin-context learning ability during recommendation tuning, while the original\nLLM's in-context learning lacks recommendation-specific focus. To address this,\nwe propose RecICL, which customizes recommendation-specific in-context learning\nfor real-time recommendations. RecICL organizes training examples in an\nin-context learning format, ensuring that in-context learning ability is\npreserved and aligned with the recommendation task during tuning.\n Extensive experiments demonstrate RecICL's effectiveness in delivering\nreal-time recommendations without requiring model updates. Our code is\navailable at https://github.com/ym689/rec_icl.\n","authors":["Keqin Bao","Ming Yan","Yang Zhang","Jizhi Zhang","Wenjie Wang","Fuli Feng","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2410.23136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23090v1","updated":"2024-10-30T15:06:32Z","published":"2024-10-30T15:06:32Z","title":"CORAL: Benchmarking Multi-turn Conversational Retrieval-Augmentation\n Generation","summary":" Retrieval-Augmented Generation (RAG) has become a powerful paradigm for\nenhancing large language models (LLMs) through external knowledge retrieval.\nDespite its widespread attention, existing academic research predominantly\nfocuses on single-turn RAG, leaving a significant gap in addressing the\ncomplexities of multi-turn conversations found in real-world applications. To\nbridge this gap, we introduce CORAL, a large-scale benchmark designed to assess\nRAG systems in realistic multi-turn conversational settings. CORAL includes\ndiverse information-seeking conversations automatically derived from Wikipedia\nand tackles key challenges such as open-domain coverage, knowledge intensity,\nfree-form responses, and topic shifts. It supports three core tasks of\nconversational RAG: passage retrieval, response generation, and citation\nlabeling. We propose a unified framework to standardize various conversational\nRAG methods and conduct a comprehensive evaluation of these methods on CORAL,\ndemonstrating substantial opportunities for improving existing approaches.\n","authors":["Yiruo Cheng","Kelong Mao","Ziliang Zhao","Guanting Dong","Hongjin Qian","Yongkang Wu","Tetsuya Sakai","Ji-Rong Wen","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2410.23090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.08615v3","updated":"2024-10-30T14:56:09Z","published":"2022-03-16T13:31:48Z","title":"Scientific and Technological Information Oriented Semantics-adversarial\n and Media-adversarial Cross-media Retrieval","summary":" Cross-media retrieval of scientific and technological information is one of\nthe important tasks in the cross-media study. Cross-media scientific and\ntechnological information retrieval obtain target information from massive\nmulti-source and heterogeneous scientific and technological resources, which\nhelps to design applications that meet users' needs, including scientific and\ntechnological information recommendation, personalized scientific and\ntechnological information retrieval, etc. 
The core of cross-media retrieval is\nto learn a common subspace, so that data from different media can be directly\ncompared with each other after being mapped into this subspace. In subspace\nlearning, existing methods often focus on modeling the discrimination of\nintra-media data and the invariance of inter-media data after mapping; however,\nthey ignore the semantic consistency of inter-media data before and after\nmapping and media discrimination of intra-semantics data, which limit the\nresult of cross-media retrieval. In light of this, we propose a scientific and\ntechnological information oriented Semantics-adversarial and Media-adversarial\nCross-media Retrieval method (SMCR) to find an effective common subspace.\nSpecifically, SMCR minimizes the loss of inter-media semantic consistency in\naddition to modeling intra-media semantic discrimination, to preserve semantic\nsimilarity before and after mapping. Furthermore, SMCR constructs a basic\nfeature mapping network and a refined feature mapping network to jointly\nminimize the media discriminative loss within semantics, so as to enhance the\nfeature mapping network's ability to confuse the media discriminant network.\nExperimental results on two datasets demonstrate that the proposed SMCR\noutperforms state-of-the-art methods in cross-media retrieval.\n","authors":["Ang Li","Junping Du","Feifei Kou","Zhe Xue","Xin Xu","Mingying Xu","Yang Jiang"],"pdf_url":"https://arxiv.org/pdf/2203.08615v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2410.23023v1","updated":"2024-10-30T13:53:46Z","published":"2024-10-30T13:53:46Z","title":"A Universal Sets-level Optimization Framework for Next Set\n Recommendation","summary":" Next Set Recommendation (NSRec), encompassing related tasks such as next\nbasket recommendation and temporal sets prediction, stands as a trending\nresearch topic. Although numerous attempts have been made on this topic, there\nare certain drawbacks: (i) Existing studies are still confined to utilizing\nobjective functions commonly found in Next Item Recommendation (NIRec), such as\nbinary cross entropy and BPR, which are calculated based on individual item\ncomparisons; (ii) They place emphasis on building sophisticated learning models\nto capture intricate dependency relationships across sequential sets, but\nfrequently overlook pivotal dependency in their objective functions; (iii)\nDiversity factor within sequential sets is frequently overlooked. In this\nresearch, we endeavor to unveil a universal and S ets-level optimization\nframework for N ext Set Recommendation (SNSRec), offering a holistic fusion of\ndiversity distribution and intricate dependency relationships within temporal\nsets. To realize this, the following contributions are made: (i) We directly\nmodel the temporal set in a sequence as a cohesive entity, leveraging the\nStructured Determinantal Point Process (SDPP), wherein the probabilistic DPP\ndistribution prioritizes collections of structures (sequential sets) instead of\nindividual items; (ii) We introduce a co-occurrence representation to discern\nand acknowledge the importance of different sets; (iii) We propose a sets-level\noptimization criterion, which integrates the diversity distribution and\ndependency relations across the entire sequence of sets, guiding the model to\nrecommend relevant and diversified set. 
Extensive experiments on real-world\ndatasets show that our approach consistently outperforms previous methods on\nboth relevance and diversity.\n","authors":["Yuli Liu","Min Liu","Christian Walder","Lexing Xie"],"pdf_url":"https://arxiv.org/pdf/2410.23023v1.pdf","comment":"Accepter at CIKM2024"},{"id":"http://arxiv.org/abs/2410.22972v1","updated":"2024-10-30T12:39:39Z","published":"2024-10-30T12:39:39Z","title":"DataRec: A Framework for Standardizing Recommendation Data Processing\n and Analysis","summary":" Thanks to the great interest posed by researchers and companies,\nrecommendation systems became a cornerstone of machine learning applications.\nHowever, concerns have arisen recently about the need for reproducibility,\nmaking it challenging to identify suitable pipelines. Several frameworks have\nbeen proposed to improve reproducibility, covering the entire process from data\nreading to performance evaluation. Despite this effort, these solutions often\noverlook the role of data management, do not promote interoperability, and\nneglect data analysis despite its well-known impact on recommender performance.\nTo address these gaps, we propose DataRec, which facilitates using and\nmanipulating recommendation datasets. DataRec supports reading and writing in\nvarious formats, offers filtering and splitting techniques, and enables data\ndistribution analysis using well-known metrics. It encourages a unified\napproach to data manipulation by allowing data export in formats compatible\nwith several recommendation frameworks.\n","authors":["Alberto Carlo Maria Mancino","Salvatore Bufi","Angela Di Fazio","Daniele Malitesta","Claudio Pomo","Antonio Ferrara","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2410.22972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22844v1","updated":"2024-10-30T09:23:14Z","published":"2024-10-30T09:23:14Z","title":"Understanding and Improving Adversarial Collaborative Filtering for\n Robust Recommendation","summary":" Adversarial Collaborative Filtering (ACF), which typically applies\nadversarial perturbations at user and item embeddings through adversarial\ntraining, is widely recognized as an effective strategy for enhancing the\nrobustness of Collaborative Filtering (CF) recommender systems against\npoisoning attacks. Besides, numerous studies have empirically shown that ACF\ncan also improve recommendation performance compared to traditional CF. Despite\nthese empirical successes, the theoretical understanding of ACF's effectiveness\nin terms of both performance and robustness remains unclear. To bridge this\ngap, in this paper, we first theoretically show that ACF can achieve a lower\nrecommendation error compared to traditional CF with the same training epochs\nin both clean and poisoned data contexts. Furthermore, by establishing bounds\nfor reductions in recommendation error during ACF's optimization process, we\nfind that applying personalized magnitudes of perturbation for different users\nbased on their embedding scales can further improve ACF's effectiveness.\nBuilding on these theoretical understandings, we propose Personalized Magnitude\nAdversarial Collaborative Filtering (PamaCF). 
Extensive experiments demonstrate\nthat PamaCF effectively defends against various types of poisoning attacks\nwhile significantly enhancing recommendation performance.\n","authors":["Kaike Zhang","Qi Cao","Yunfan Wu","Fei Sun","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.22844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22832v1","updated":"2024-10-30T09:15:51Z","published":"2024-10-30T09:15:51Z","title":"HijackRAG: Hijacking Attacks against Retrieval-Augmented Large Language\n Models","summary":" Retrieval-Augmented Generation (RAG) systems enhance large language models\n(LLMs) by integrating external knowledge, making them adaptable and\ncost-effective for various applications. However, the growing reliance on these\nsystems also introduces potential security risks. In this work, we reveal a\nnovel vulnerability, the retrieval prompt hijack attack (HijackRAG), which\nenables attackers to manipulate the retrieval mechanisms of RAG systems by\ninjecting malicious texts into the knowledge database. When the RAG system\nencounters target questions, it generates the attacker's pre-determined answers\ninstead of the correct ones, undermining the integrity and trustworthiness of\nthe system. We formalize HijackRAG as an optimization problem and propose both\nblack-box and white-box attack strategies tailored to different levels of the\nattacker's knowledge. Extensive experiments on multiple benchmark datasets show\nthat HijackRAG consistently achieves high attack success rates, outperforming\nexisting baseline attacks. Furthermore, we demonstrate that the attack is\ntransferable across different retriever models, underscoring the widespread\nrisk it poses to RAG systems. Lastly, our exploration of various defense\nmechanisms reveals that they are insufficient to counter HijackRAG, emphasizing\nthe urgent need for more robust security measures to protect RAG systems in\nreal-world deployments.\n","authors":["Yucheng Zhang","Qinfeng Li","Tianyu Du","Xuhong Zhang","Xinkui Zhao","Zhengwen Feng","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2410.22832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22809v1","updated":"2024-10-30T08:41:13Z","published":"2024-10-30T08:41:13Z","title":"Causality-Enhanced Behavior Sequence Modeling in LLMs for Personalized\n Recommendation","summary":" Recent advancements in recommender systems have focused on leveraging Large\nLanguage Models (LLMs) to improve user preference modeling, yielding promising\noutcomes. However, current LLM-based approaches struggle to fully leverage user\nbehavior sequences, resulting in suboptimal preference modeling for\npersonalized recommendations. In this study, we propose a novel Counterfactual\nFine-Tuning (CFT) method to address this issue by explicitly emphasizing the\nrole of behavior sequences when generating recommendations. Specifically, we\nemploy counterfactual reasoning to identify the causal effects of behavior\nsequences on model output and introduce a task that directly fits the\nground-truth labels based on these effects, achieving the goal of explicit\nemphasis. Additionally, we develop a token-level weighting mechanism to adjust\nthe emphasis strength for different item tokens, reflecting the diminishing\ninfluence of behavior sequences from earlier to later tokens when predicting\nan item. Extensive experiments on real-world datasets demonstrate that CFT\neffectively improves behavior sequence modeling.
Our code is available at\nhttps://github.com/itsmeyjt/CFT.\n","authors":["Yang Zhang","Juntao You","Yimeng Bai","Jizhi Zhang","Keqin Bao","Wenjie Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.22809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22790v1","updated":"2024-10-30T08:09:33Z","published":"2024-10-30T08:09:33Z","title":"Dual Contrastive Transformer for Hierarchical Preference Modeling in\n Sequential Recommendation","summary":" Sequential recommender systems (SRSs) aim to predict the subsequent items\nwhich may interest users via comprehensively modeling users' complex preference\nembedded in the sequence of user-item interactions. However, most existing\nSRSs often model users' single low-level preference based on item ID\ninformation while ignoring the high-level preference revealed by item attribute\ninformation, such as item category. Furthermore, they often utilize limited\nsequence context information to predict the next item while overlooking richer\ninter-item semantic relations. To this end, in this paper, we propose a novel\nhierarchical preference modeling framework to substantially model the complex\nlow- and high-level preference dynamics for accurate sequential recommendation.\nSpecifically, in the framework, a novel dual-transformer module and a novel\ndual contrastive learning scheme have been designed to discriminatively learn\nusers' low- and high-level preference and to effectively enhance both low- and\nhigh-level preference learning, respectively. In addition, a novel\nsemantics-enhanced context embedding module has been devised to generate more\ninformative context embedding for further improving the recommendation\nperformance. Extensive experiments on six real-world datasets have demonstrated\nboth the superiority of our proposed method over the state-of-the-art ones and\nthe rationality of our design.\n","authors":["Chengkai Huang","Shoujin Wang","Xianzhi Wang","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2410.22790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19453v4","updated":"2024-10-30T07:04:25Z","published":"2023-10-30T11:25:03Z","title":"FLIP: Fine-grained Alignment between ID-based Models and Pretrained\n Language Models for CTR Prediction","summary":" Click-through rate (CTR) prediction serves as a core function module in\nvarious personalized online services. The traditional ID-based models for CTR\nprediction take as inputs the one-hot encoded ID features of tabular modality,\nwhich capture the collaborative signals via feature interaction modeling. But\nthe one-hot encoding discards the semantic information included in the textual\nfeatures. Recently, the emergence of Pretrained Language Models (PLMs) has given\nrise to another paradigm, which takes as inputs the sentences of textual\nmodality obtained by hard prompt templates and adopts PLMs to extract the\nsemantic knowledge. However, PLMs often face challenges in capturing field-wise\ncollaborative signals and distinguishing features with subtle textual\ndifferences. In this paper, to leverage the benefits of both paradigms while\novercoming their limitations, we propose to conduct Fine-grained\nfeature-level ALignment between ID-based Models and Pretrained Language\nModels (FLIP) for CTR prediction. Unlike most methods that solely rely on global\nviews through instance-level contrastive learning, we design a novel jointly\nmasked tabular/language modeling task to learn fine-grained alignment between\ntabular IDs and word tokens.
Specifically, the masked data of one modality (IDs\nand tokens) has to be recovered with the help of the other modality, which\nestablishes the feature-level interaction and alignment via sufficient mutual\ninformation extraction between dual modalities. Moreover, we propose to jointly\nfinetune the ID-based model and PLM by adaptively combining the output of both\nmodels, thus achieving superior performance in downstream CTR prediction tasks.\nExtensive experiments on three real-world datasets demonstrate that FLIP\noutperforms SOTA baselines, and is highly compatible with various ID-based\nmodels and PLMs. The code is at \\url{https://github.com/justarter/FLIP}.\n","authors":["Hangyu Wang","Jianghao Lin","Xiangyang Li","Bo Chen","Chenxu Zhu","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2310.19453v4.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2406.13249v2","updated":"2024-10-30T06:41:45Z","published":"2024-06-19T06:19:48Z","title":"R^2AG: Incorporating Retrieval Information into Retrieval Augmented\n Generation","summary":" Retrieval augmented generation (RAG) has been applied in many scenarios to\naugment large language models (LLMs) with external documents provided by\nretrievers. However, a semantic gap exists between LLMs and retrievers due to\ndifferences in their training objectives and architectures. This misalignment\nforces LLMs to passively accept the documents provided by the retrievers,\nleading to incomprehension in the generation process, where the LLMs are\nburdened with the task of distinguishing these documents using their inherent\nknowledge. This paper proposes R$^2$AG, a novel enhanced RAG framework to fill\nthis gap by incorporating Retrieval information into Retrieval Augmented\nGeneration. Specifically, R$^2$AG utilizes the nuanced features from the\nretrievers and employs an R$^2$-Former to capture retrieval information. Then, a\nretrieval-aware prompting strategy is designed to integrate retrieval\ninformation into LLMs' generation. Notably, R$^2$AG suits low-resource scenarios\nwhere LLMs and retrievers are frozen. Extensive experiments across five\ndatasets validate the effectiveness, robustness, and efficiency of R$^2$AG. Our\nanalysis reveals that retrieval information serves as an anchor to aid LLMs in\nthe generation process, thereby filling the semantic gap.\n","authors":["Fuda Ye","Shuangyin Li","Yongqi Zhang","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2406.13249v2.pdf","comment":"Accepted to EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2403.00880v2","updated":"2024-10-30T05:18:03Z","published":"2024-03-01T08:50:27Z","title":"CIDGMed: Causal Inference-Driven Medication Recommendation with Enhanced\n Dual-Granularity Learning","summary":" Medication recommendation aims to integrate patients' long-term health\nrecords to provide accurate and safe medication combinations for specific\nhealth states. Existing methods often fail to deeply explore the true causal\nrelationships between diseases/procedures and medications, resulting in biased\nrecommendations. Additionally, in medication representation learning, the\nrelationships between information at different granularities of medications,\ncoarse-grained (medication itself) and fine-grained (molecular level), are not\neffectively integrated, leading to biases in representation learning. To\naddress these limitations, we propose the Causal Inference-driven\nDual-Granularity Medication Recommendation method (CIDGMed).
Our approach\nleverages causal inference to uncover the relationships between\ndiseases/procedures and medications, thereby enhancing the rationality and\ninterpretability of recommendations. By integrating coarse-grained medication\neffects with fine-grained molecular structure information, CIDGMed provides a\ncomprehensive representation of medications. Additionally, we employ a bias\ncorrection model during the prediction phase to further refine recommendations,\nensuring both accuracy and safety. Through extensive experiments, CIDGMed\nsignificantly outperforms current state-of-the-art models across multiple\nmetrics, achieving a 2.54% increase in accuracy, a 3.65% reduction in side\neffects, and a 39.42% improvement in time efficiency. Additionally, we\ndemonstrate the rationale of CIDGMed through a case study.\n","authors":["Shunpan Liang","Xiang Li","Shi Mu","Chen Li","Yu Lei","Yulei Hou","Tengfei Ma"],"pdf_url":"https://arxiv.org/pdf/2403.00880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10225v5","updated":"2024-10-30T02:58:14Z","published":"2024-01-18T18:59:11Z","title":"ChatQA: Surpassing GPT-4 on Conversational QA and RAG","summary":" In this work, we introduce ChatQA, a suite of models that outperform GPT-4 on\nretrieval-augmented generation (RAG) and conversational question answering\n(QA). To enhance generation, we propose a two-stage instruction tuning method\nthat significantly boosts the performance of RAG. For effective retrieval, we\nintroduce a dense retriever optimized for conversational QA, which yields\nresults comparable to the alternative state-of-the-art query rewriting models,\nwhile substantially reducing deployment costs. We also present the ChatRAG\nBench, which encompasses ten datasets covering comprehensive evaluations on\nRAG, table-related QA, arithmetic calculations, and scenarios involving\nunanswerable questions. Our ChatQA-1.0-70B (score: 54.14), built on Llama2, a\nweaker foundation model than GPT-4, can slightly outperform GPT-4-0613 (score:\n53.90) and GPT-4-Turbo-2024-04-09 (score: 54.03) on the ChatRAG Bench, without\nrelying on any synthetic data from OpenAI GPT models. Notably, the\nLlama3-ChatQA-1.5-70B model surpasses the accuracy of GPT-4-Turbo-2024-04-09,\nachieving a 4.4% improvement. To advance research in this field, we\nopen-sourced the model weights, instruction tuning data, ChatRAG Bench, and\nretriever for the community: https://chatqa-project.github.io/.\n","authors":["Zihan Liu","Wei Ping","Rajarshi Roy","Peng Xu","Chankyu Lee","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2401.10225v5.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2212.08841v4","updated":"2024-10-30T02:36:38Z","published":"2022-12-17T10:43:25Z","title":"AugTriever: Unsupervised Dense Retrieval and Domain Adaptation by\n Scalable Data Augmentation","summary":" Dense retrievers have made significant strides in text retrieval and\nopen-domain question answering. However, most of these achievements have relied\nheavily on extensive human-annotated supervision. In this study, we aim to\ndevelop unsupervised methods for improving dense retrieval models. We propose\ntwo approaches that enable annotation-free and scalable training by creating\npseudo query-document pairs: query extraction and transferred query generation.\nThe query extraction method involves selecting salient spans from the original\ndocument to generate pseudo queries.
On the other hand, the transferred query\ngeneration method utilizes generation models trained for other NLP tasks, such\nas summarization, to produce pseudo queries. Through extensive experimentation,\nwe demonstrate that models trained using these augmentation methods can achieve\nperformance comparable to, if not better than, multiple strong dense baselines.\nMoreover, combining these strategies leads to further improvements, resulting\nin superior performance in unsupervised dense retrieval, unsupervised domain\nadaptation, and supervised finetuning, benchmarked on both BEIR and ODQA\ndatasets. Code and datasets are publicly available at\nhttps://github.com/salesforce/AugTriever.\n","authors":["Rui Meng","Ye Liu","Semih Yavuz","Divyansh Agarwal","Lifu Tu","Ning Yu","Jianguo Zhang","Meghana Bhat","Yingbo Zhou"],"pdf_url":"https://arxiv.org/pdf/2212.08841v4.pdf","comment":"DCAI24, October 25, 2024, Boise, ID"},{"id":"http://arxiv.org/abs/2311.08593v2","updated":"2024-10-30T01:26:09Z","published":"2023-11-14T23:28:36Z","title":"Summarization-Based Document IDs for Generative Retrieval with Language\n Models","summary":" Generative retrieval (Wang et al., 2022; Tay et al., 2022) is a popular\napproach for end-to-end document retrieval that directly generates document\nidentifiers given an input query. We introduce summarization-based document\nIDs, in which each document's ID is composed of an extractive summary or\nabstractive keyphrases generated by a language model, rather than an integer ID\nsequence or bags of n-grams as proposed in past work. We find that abstractive,\ncontent-based IDs (ACID) and an ID based on the first 30 tokens are very\neffective in direct comparisons with previous approaches to ID creation. We\nshow that using ACID improves top-10 and top-20 recall by 15.6% and 14.4%\n(relative) respectively versus the cluster-based integer ID baseline on the\nMSMARCO 100k retrieval task, and 9.8% and 9.9% respectively on the\nWikipedia-based NQ 100k retrieval task. Our results demonstrate the\neffectiveness of human-readable, natural-language IDs created through\nsummarization for generative retrieval. We also observed that extractive IDs\noutperformed abstractive IDs on Wikipedia articles in NQ but not the snippets\nin MSMARCO, which suggests that document characteristics affect generative\nretrieval performance.\n","authors":["Haoxin Li","Daniel Cheng","Phillip Keung","Jungo Kasai","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2311.08593v2.pdf","comment":"To appear at the NLP for Wikipedia Workshop in EMNLP 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.23479v1","updated":"2024-10-30T22:06:14Z","published":"2024-10-30T22:06:14Z","title":"The Trail Making Test in Virtual Reality (TMT-VR): The Effects of\n Interaction Modes and Gaming Skills on Cognitive Performance of Young Adults","summary":" Virtual Reality (VR) is increasingly used in neuropsychological assessments\ndue to its ability to simulate real-world environments. This study aimed to\ndevelop and evaluate the Trail Making Test in VR (TMT-VR) and investigate the\neffects of different interaction modes and gaming skills on cognitive\nperformance. A total of 71 young female and male adults (aged 18-35) with high\nand low gaming skills participated in this study. Participants completed the\nTMT-VR using three interaction modes: eye tracking, head movement,\nand controller.
Performance metrics included task completion time and accuracy.\nUser experience, usability, and acceptability of TMT-VR were also examined.\nResults showed that both eye tracking and head movement modes significantly\noutperformed the controller in terms of task completion time and accuracy. No\nsignificant differences were found between eye tracking and head movement\nmodes. Gaming skills did not significantly influence task performance using any\ninteraction mode. The TMT-VR demonstrates high usability, acceptability, and\nuser experience among participants. The findings suggest that VR-based\nassessments can effectively measure cognitive performance without being\ninfluenced by prior gaming skills, indicating potential applicability for\ndiverse populations.\n","authors":["Evgenia Giatzoglou","Panagiotis Vorias","Ryan Kemm","Irene Karayianni","Chrysanthi Nega","Panagiotis Kourtesis"],"pdf_url":"https://arxiv.org/pdf/2410.23479v1.pdf","comment":"25 Pages, 7 Figures, 4 Tables"},{"id":"http://arxiv.org/abs/2406.14515v3","updated":"2024-10-30T13:38:10Z","published":"2024-06-20T17:26:01Z","title":"MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video\n Understanding","summary":" The advent of large vision-language models (LVLMs) has spurred research into\ntheir applications in multi-modal contexts, particularly in video\nunderstanding. Traditional VideoQA benchmarks, despite providing quantitative\nmetrics, often fail to encompass the full spectrum of video content and\ninadequately assess models' temporal comprehension. To address these\nlimitations, we introduce MMBench-Video, a quantitative benchmark designed to\nrigorously evaluate LVLMs' proficiency in video understanding. MMBench-Video\nincorporates lengthy videos from YouTube and employs free-form questions,\nmirroring practical use cases. The benchmark is meticulously crafted to probe\nthe models' temporal reasoning skills, with all questions human-annotated\naccording to a carefully constructed ability taxonomy. We employ GPT-4 for\nautomated assessment, demonstrating superior accuracy and robustness over\nearlier LLM-based evaluations. Utilizing MMBench-Video, we have conducted\ncomprehensive evaluations that include both proprietary and open-source LVLMs\nfor images and videos. MMBench-Video stands as a valuable resource for the\nresearch community, facilitating improved evaluation of LVLMs and catalyzing\nprogress in the field of video understanding. The evaluation code of\nMMBench-Video will be integrated into VLMEvalKit:\nhttps://github.com/open-compass/VLMEvalKit.\n","authors":["Xinyu Fang","Kangrui Mao","Haodong Duan","Xiangyu Zhao","Yining Li","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14515v3.pdf","comment":"Accepted in NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2410.23325v1","updated":"2024-10-30T13:17:13Z","published":"2024-10-30T13:17:13Z","title":"Transfer Learning in Vocal Education: Technical Evaluation of Limited\n Samples Describing Mezzo-soprano","summary":" Vocal education in the music field is difficult to quantify due to the\nindividual differences in singers' voices and the different quantitative\ncriteria of singing techniques. Deep learning has great potential to be applied\nin music education due to its efficiency in handling complex data and performing\nquantitative analysis. However, accurate evaluations with limited samples over\nrare vocal types, such as Mezzo-soprano, require extensive well-annotated data\nsupport using deep learning models.
To attain this objective, we\nperform transfer learning, employing deep learning models pre-trained on the\nImageNet and Urbansound8k datasets to improve the precision of\nvocal technique evaluation. Furthermore, we tackle the problem of the lack of\nsamples by constructing a dedicated dataset, the Mezzo-soprano Vocal Set (MVS),\nfor vocal technique assessment. Our experimental results indicate that transfer\nlearning increases the overall accuracy (OAcc) of all models by an average of\n8.3%, with the highest accuracy at 94.2%. We not only provide a novel approach\nto evaluating Mezzo-soprano vocal techniques but also introduce a new\nquantitative assessment method for music education.\n","authors":["Zhenyi Hou","Xu Zhao","Kejie Ye","Xinyu Sheng","Shanggerile Jiang","Jiajing Xia","Yitao Zhang","Chenxi Ban","Daijun Luo","Jiaxing Chen","Yan Zou","Yuchao Feng","Guangyu Fan","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2410.23325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22803v1","updated":"2024-10-30T08:31:58Z","published":"2024-10-30T08:31:58Z","title":"DOA-Aware Audio-Visual Self-Supervised Learning for Sound Event\n Localization and Detection","summary":" This paper describes sound event localization and detection (SELD) for\nspatial audio recordings captured by first-order ambisonics (FOA) microphones.\nIn this task, one may train a deep neural network (DNN) using FOA data\nannotated with the classes and directions of arrival (DOAs) of sound events.\nHowever, the performance of this approach is severely bounded by the amount of\nannotated data. To overcome this limitation, we propose a novel method of\npretraining the feature extraction part of the DNN in a self-supervised manner.\nWe use spatial audio-visual recordings abundantly available as virtual reality\ncontents. Assuming that sound objects are concurrently observed by the FOA\nmicrophones and the omni-directional camera, we jointly train audio and visual\nencoders with contrastive learning such that the audio and visual embeddings of\nthe same recording and DOA are made close. A key feature of our method is that\nthe DOA-wise audio embeddings are jointly extracted from the raw audio data,\nwhile the DOA-wise visual embeddings are separately extracted from the local\nvisual crops centered on the corresponding DOA. This encourages the latent\nfeatures of the audio encoder to represent both the classes and DOAs of sound\nevents. The experiment using the DCASE2022 Task 3 dataset of 20 hours shows that\nnon-annotated audio-visual recordings of 100 hours reduced the error score of\nSELD from 36.4 pts to 34.9 pts.\n","authors":["Yoto Fujita","Yoshiaki Bando","Keisuke Imoto","Masaki Onishi","Kazuyoshi Yoshii"],"pdf_url":"https://arxiv.org/pdf/2410.22803v1.pdf","comment":"Accepted to APSIPA2023"},{"id":"http://arxiv.org/abs/2410.22023v2","updated":"2024-10-30T04:29:42Z","published":"2024-10-29T13:13:30Z","title":"Feature distribution Adaptation Network for Speech Emotion Recognition","summary":" In this paper, we propose a novel deep inductive transfer learning framework,\nnamed feature distribution adaptation network, to tackle the challenging\nmulti-modal speech emotion recognition problem. Our method aims to use deep\ntransfer learning strategies to align visual and audio feature distributions to\nobtain a consistent representation of emotion, thereby improving the performance\nof speech emotion recognition.
In our model, the pre-trained ResNet-34 is\nutilized for feature extraction for facial expression images and acoustic Mel\nspectrograms, respectively. Then, the cross-attention mechanism is introduced\nto model the intrinsic similarity relationships of multi-modal features.\nFinally, the multi-modal feature distribution adaptation is performed\nefficiently with feed-forward network, which is extended using the local\nmaximum mean discrepancy loss. Experiments are carried out on two benchmark\ndatasets, and the results demonstrate that our model can achieve excellent\nperformance compared with existing ones.\n","authors":["Shaokai Li","Yixuan Ji","Peng Song","Haoqin Sun","Wenming Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.22023v2.pdf","comment":null}]},"2024-10-29T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.01610v4","updated":"2024-10-29T23:21:49Z","published":"2023-09-04T13:49:48Z","title":"Fairness in Ranking under Disparate Uncertainty","summary":" Ranking is a ubiquitous method for focusing the attention of human evaluators\non a manageable subset of options. Its use as part of human decision-making\nprocesses ranges from surfacing potentially relevant products on an e-commerce\nsite to prioritizing college applications for human review. While ranking can\nmake human evaluation more effective by focusing attention on the most\npromising options, we argue that it can introduce unfairness if the uncertainty\nof the underlying relevance model differs between groups of options.\nUnfortunately, such disparity in uncertainty appears widespread, often to the\ndetriment of minority groups for which relevance estimates can have higher\nuncertainty due to a lack of data or appropriate features. To address this\nfairness issue, we propose Equal-Opportunity Ranking (EOR) as a new fairness\ncriterion for ranking and show that it corresponds to a group-wise fair lottery\namong the relevant options even in the presence of disparate uncertainty. EOR\noptimizes for an even cost burden on all groups, unlike the conventional\nProbability Ranking Principle, and is fundamentally different from existing\nnotions of fairness in rankings, such as demographic parity and proportional\nRooney rule constraints that are motivated by proportional representation\nrelative to group size. To make EOR ranking practical, we present an efficient\nalgorithm for computing it in time $O(n \\log(n))$ and prove its close\napproximation guarantee to the globally optimal solution. In a comprehensive\nempirical evaluation on synthetic data, a US Census dataset, and a real-world\naudit of Amazon search queries, we find that the algorithm reliably guarantees\nEOR fairness while providing effective rankings.\n","authors":["Richa Rastogi","Thorsten Joachims"],"pdf_url":"https://arxiv.org/pdf/2309.01610v4.pdf","comment":"Camera ready version at EAAMO'24"},{"id":"http://arxiv.org/abs/2409.14192v2","updated":"2024-10-29T21:10:59Z","published":"2024-09-21T16:46:15Z","title":"Knowledge in Triples for LLMs: Enhancing Table QA Accuracy with Semantic\n Extraction","summary":" Integrating structured knowledge from tabular formats poses significant\nchallenges within natural language processing (NLP), mainly when dealing with\ncomplex, semi-structured tables like those found in the FeTaQA dataset. These\ntables require advanced methods to interpret and generate meaningful responses\naccurately. 
Traditional approaches, such as SQL and SPARQL, often fail to fully\ncapture the semantics of such data, especially in the presence of irregular\ntable structures like web tables. This paper addresses these challenges by\nproposing a novel approach that extracts triples directly from tabular\ndata and integrates them with a retrieval-augmented generation (RAG) model to\nenhance the accuracy, coherence, and contextual richness of responses generated\nby a fine-tuned GPT-3.5-turbo-0125 model. Our approach significantly\noutperforms existing baselines on the FeTaQA dataset, particularly excelling in\nSacre-BLEU and ROUGE metrics. It effectively generates contextually accurate\nand detailed long-form answers from tables, showcasing its strength in complex\ndata interpretation.\n","authors":["Hossein Sholehrasa","Sanaz Saki Norouzi","Pascal Hitzler","Majid Jaberi-Douraki"],"pdf_url":"https://arxiv.org/pdf/2409.14192v2.pdf","comment":"We are withdrawing this paper to address foundational aspects that\n are critical for ensuring its accuracy and integrity before any potential\n resubmission"},{"id":"http://arxiv.org/abs/2410.22476v1","updated":"2024-10-29T19:10:12Z","published":"2024-10-29T19:10:12Z","title":"A Pointer Network-based Approach for Joint Extraction and Detection of\n Multi-Label Multi-Class Intents","summary":" In task-oriented dialogue systems, intent detection is crucial for\ninterpreting user queries and providing appropriate responses. Existing\nresearch primarily addresses simple queries with a single intent, lacking\neffective systems for handling complex queries with multiple intents and\nextracting different intent spans. Additionally, there is a notable absence of\nmultilingual, multi-intent datasets. This study addresses three critical tasks:\nextracting multiple intent spans from queries, detecting multiple intents, and\ndeveloping a multi-lingual multi-label intent dataset. We introduce a novel\nmulti-label multi-class intent detection dataset (MLMCID-dataset) curated from\nexisting benchmark datasets. We also propose a pointer network-based\narchitecture (MLMCID) to extract intent spans and detect multiple intents with\ncoarse and fine-grained labels in the form of sextuplets. Comprehensive\nanalysis demonstrates the superiority of our pointer network-based system over\nbaseline approaches in terms of accuracy and F1-score across various datasets.\n","authors":["Ankan Mullick","Sombit Bose","Abhilash Nandy","Gajula Sai Chaitanya","Pawan Goyal"],"pdf_url":"https://arxiv.org/pdf/2410.22476v1.pdf","comment":"Accepted at EMNLP 2024 Findings (Long Paper)"},{"id":"http://arxiv.org/abs/2407.09252v3","updated":"2024-10-29T17:34:54Z","published":"2024-07-12T13:30:44Z","title":"Context Embeddings for Efficient Answer Generation in RAG","summary":" Retrieval-Augmented Generation (RAG) allows overcoming the limited knowledge\nof LLMs by extending the input with external information. As a consequence, the\ncontextual inputs to the model become much longer, which slows down decoding\ntime, directly translating to the time a user has to wait for an answer. We\naddress this challenge by presenting COCOM, an effective context compression\nmethod, reducing long contexts to only a handful of Context Embeddings, speeding\nup the generation time by a large margin. Our method allows for different\ncompression rates, trading off decoding time for answer quality.
Compared to\nearlier methods, COCOM allows for handling multiple contexts more effectively,\nsignificantly reducing decoding time for long inputs. Our method demonstrates a\nspeed-up of up to 5.69 $\\times$ while achieving higher performance compared to\nexisting efficient context compression methods.\n","authors":["David Rau","Shuai Wang","Hervé Déjean","Stéphane Clinchant"],"pdf_url":"https://arxiv.org/pdf/2407.09252v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.22249v1","updated":"2024-10-29T17:13:54Z","published":"2024-10-29T17:13:54Z","title":"Pushing the Performance Envelope of DNN-based Recommendation Systems\n Inference on GPUs","summary":" Personalized recommendation is a ubiquitous application on the internet, with\nmany industries and hyperscalers extensively leveraging Deep Learning\nRecommendation Models (DLRMs) for their personalization needs (like ad serving\nor movie suggestions). With growing model and dataset sizes pushing computation\nand memory requirements, GPUs are being increasingly preferred for executing\nDLRM inference. However, serving newer DLRMs, while meeting acceptable\nlatencies, continues to remain challenging, making traditional deployments\nincreasingly more GPU-hungry, resulting in higher inference serving costs. In\nthis paper, we show that the embedding stage continues to be the primary\nbottleneck in the GPU inference pipeline, leading up to a 3.2x embedding-only\nperformance slowdown.\n To thoroughly grasp the problem, we conduct a detailed microarchitecture\ncharacterization and highlight the presence of low occupancy in the standard\nembedding kernels. By leveraging direct compiler optimizations, we achieve\noptimal occupancy, pushing the performance by up to 53%. Yet, long memory\nlatency stalls continue to exist. To tackle this challenge, we propose\nspecialized plug-and-play-based software prefetching and L2 pinning techniques,\nwhich help in hiding and decreasing the latencies. Further, we propose\ncombining them, as they complement each other. Experimental evaluations using\nA100 GPUs with large models and datasets show that our proposed techniques\nimprove performance by up to 103% for the embedding stage, and up to 77% for\nthe overall DLRM inference pipeline.\n","authors":["Rishabh Jain","Vivek M. Bhasi","Adwait Jog","Anand Sivasubramaniam","Mahmut T. Kandemir","Chita R. Das"],"pdf_url":"https://arxiv.org/pdf/2410.22249v1.pdf","comment":"This work has been accepted in the 57th MICRO\n (https://microarch.org/micro57/program/). Please check appendix for details\n on reproducing our work including codebase and steps"},{"id":"http://arxiv.org/abs/2410.22233v1","updated":"2024-10-29T17:01:05Z","published":"2024-10-29T17:01:05Z","title":"ContextIQ: A Multimodal Expert-Based Video Retrieval System for\n Contextual Advertising","summary":" Contextual advertising serves ads that are aligned to the content that the\nuser is viewing. The rapid growth of video content on social platforms and\nstreaming services, along with privacy concerns, has increased the need for\ncontextual advertising. Placing the right ad in the right context creates a\nseamless and pleasant ad viewing experience, resulting in higher audience\nengagement and, ultimately, better ad monetization. 
From a technology\nstandpoint, effective contextual advertising requires a video retrieval system\ncapable of understanding complex video content at a very granular level.\nCurrent text-to-video retrieval models based on joint multimodal training\ndemand large datasets and computational resources, limiting their practicality\nand lacking the key functionalities required for ad ecosystem integration. We\nintroduce ContextIQ, a multimodal expert-based video retrieval system designed\nspecifically for contextual advertising. ContextIQ utilizes modality-specific\nexperts (video, audio, transcript (captions), and metadata such as objects,\nactions, and emotions) to create semantically rich video representations. We\nshow that our system, without joint training, achieves results better than or\ncomparable to state-of-the-art models and commercial solutions on multiple\ntext-to-video retrieval benchmarks. Our ablation studies highlight the benefits\nof leveraging multiple modalities for enhanced video retrieval accuracy instead\nof using a vision-language model alone. Furthermore, we show how video\nretrieval systems such as ContextIQ can be used for contextual advertising in\nan ad ecosystem while also addressing concerns related to brand safety and\nfiltering inappropriate content.\n","authors":["Ashutosh Chaubey","Anoubhav Agarwaal","Sartaki Sinha Roy","Aayush Agarwal","Susmita Ghose"],"pdf_url":"https://arxiv.org/pdf/2410.22233v1.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2410.22182v1","updated":"2024-10-29T16:19:08Z","published":"2024-10-29T16:19:08Z","title":"Synthetic Data Generation with Large Language Models for Personalized\n Community Question Answering","summary":" Personalization in Information Retrieval (IR) is a topic that the research\ncommunity has studied for a long time. However, there is still a lack of\ndatasets to conduct large-scale evaluations of personalized IR; this is mainly\ndue to the fact that collecting and curating high-quality user-related\ninformation requires significant costs and time investment. Furthermore, the\ncreation of datasets for Personalized IR (PIR) tasks is affected by both\nprivacy concerns and the need for accurate user-related data, which are often\nnot publicly available. Recently, researchers have started to explore the use\nof Large Language Models (LLMs) to generate synthetic datasets, which is a\npossible solution to generate data for low-resource tasks. In this paper, we\ninvestigate the potential of LLMs for generating\nsynthetic documents to train an IR system for a Personalized Community Question\nAnswering task. To study the effectiveness of IR models fine-tuned on\nLLM-generated data, we introduce a new dataset, named Sy-SE-PQA. We build\nSy-SE-PQA based on an existing dataset, SE-PQA, which consists of questions and\nanswers posted on the popular StackExchange communities. Starting from\nquestions in SE-PQA, we generate synthetic answers using different prompt\ntechniques and LLMs. Our findings suggest that LLMs have high potential in\ngenerating data tailored to users' needs.
The synthetic data can replace\nhuman-written training data, even if the generated data may contain incorrect\ninformation.\n","authors":["Marco Braga","Pranav Kasela","Alessandro Raganato","Gabriella Pasi"],"pdf_url":"https://arxiv.org/pdf/2410.22182v1.pdf","comment":"Accepted in WI-IAT '24"},{"id":"http://arxiv.org/abs/2410.22136v1","updated":"2024-10-29T15:32:36Z","published":"2024-10-29T15:32:36Z","title":"SimRec: Mitigating the Cold-Start Problem in Sequential Recommendation\n by Integrating Item Similarity","summary":" Sequential recommendation systems often struggle to make predictions or take\naction when dealing with cold-start items that have a limited amount of\ninteractions. In this work, we propose SimRec, a new approach to mitigate the\ncold-start problem in sequential recommendation systems. SimRec addresses this\nchallenge by leveraging the inherent similarity among items, incorporating item\nsimilarities into the training process through a customized loss function.\nImportantly, this enhancement is attained with identical model architecture and\nthe same number of trainable parameters, resulting in the same inference time\nand requiring minimal additional effort. This novel approach results in a\nrobust contextual sequential recommendation model capable of effectively\nhandling rare items, including those that were not explicitly seen during\ntraining, thereby enhancing overall recommendation performance. Rigorous\nevaluations against multiple baselines on diverse datasets showcase SimRec's\nsuperiority, particularly in scenarios involving items occurring less than 10\ntimes in the training data. The experiments reveal an impressive improvement,\nwith SimRec achieving up to 78% higher HR@10 compared to SASRec. Notably,\nSimRec outperforms strong baselines on sparse datasets while delivering on-par\nperformance on dense datasets. Our code is available at\nhttps://github.com/amazon-science/sequential-recommendation-using-similarity.\n","authors":["Shaked Brody","Shoval Lagziel"],"pdf_url":"https://arxiv.org/pdf/2410.22136v1.pdf","comment":"ACM RecSys 2024 Workshop on Context-Aware Recommender Systems"},{"id":"http://arxiv.org/abs/2410.22123v1","updated":"2024-10-29T15:24:27Z","published":"2024-10-29T15:24:27Z","title":"Testing Identity of Distributions under Kolmogorov Distance in\n Polylogarithmic Space","summary":" Suppose we have a sample from a distribution $D$ and we want to test whether\n$D = D^*$ for a fixed distribution $D^*$. Specifically, we want to reject with\nconstant probability if the distance of $D$ from $D^*$ is $\\geq \\varepsilon$\nin a given metric. In the case of continuous distributions, this has been\nstudied thoroughly in the statistics literature. Namely, for the well-studied\nKolmogorov metric a test is known that uses the optimal $O(1/\\varepsilon^2)$\nsamples.\n However, this test naively uses also space $O(1/\\varepsilon^2)$, and previous\nwork improved this to $O(1/\\varepsilon)$. In this paper, we show that much less\nspace suffices -- we give an algorithm that uses space $O(\\log^4\n\\varepsilon^{-1})$ in the streaming setting while also using an asymptotically\noptimal number of samples.
Finally, we state 9 related open problems that we hope\nwill spark interest in this and related problems.\n","authors":["Christian Janos Lebeda","Jakub Tětek"],"pdf_url":"https://arxiv.org/pdf/2410.22123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09713v2","updated":"2024-10-29T13:19:12Z","published":"2024-10-13T03:45:24Z","title":"Agentic Information Retrieval","summary":" What will information entry look like in the next generation of digital\nproducts? Since the 1970s, user access to relevant information has relied on\ndomain-specific architectures of information retrieval (IR). Over the past two\ndecades, the advent of modern IR systems, including web search engines and\npersonalized recommender systems, has greatly improved the efficiency of\nretrieving relevant information from vast data corpora. However, the core\nparadigm of these IR systems remains largely unchanged, relying on filtering a\npredefined set of candidate items. Since 2022, breakthroughs in large language\nmodels (LLMs) have begun transforming how information is accessed, establishing\na new technical paradigm. In this position paper, we introduce Agentic\nInformation Retrieval (Agentic IR), a novel IR paradigm shaped by the\ncapabilities of LLM agents. Agentic IR expands the scope of accessible tasks\nand leverages a suite of new techniques to redefine information retrieval. We\ndiscuss three types of cutting-edge applications of agentic IR and the\nchallenges faced. We propose that agentic IR holds promise for generating\ninnovative applications, potentially becoming a central information entry point\nin future digital ecosystems.\n","authors":["Weinan Zhang","Junwei Liao","Ning Li","Kounianhua Du"],"pdf_url":"https://arxiv.org/pdf/2410.09713v2.pdf","comment":"11 pages, position paper"},{"id":"http://arxiv.org/abs/2409.02856v2","updated":"2024-10-29T13:02:50Z","published":"2024-09-04T16:29:25Z","title":"Building a Scalable, Effective, and Steerable Search and Ranking\n Platform","summary":" Modern e-commerce platforms offer vast product selections, making it\ndifficult for customers to find items that they like and that are relevant to\ntheir current session intent. This is why it is key for e-commerce platforms to\nhave near real-time scalable and adaptable personalized ranking and search\nsystems. While numerous methods exist in the scientific literature for building\nsuch systems, many are unsuitable for large-scale industrial use due to\ncomplexity and performance limitations. Consequently, industrial ranking\nsystems often resort to computationally efficient yet simplistic retrieval or\ncandidate generation approaches, which overlook near real-time and\nheterogeneous customer signals, which results in a less personalized and\nrelevant experience. Moreover, related customer experiences are served by\ncompletely different systems, which increases complexity, maintenance, and\ninconsistent experiences.\n In this paper, we present a personalized, adaptable near real-time ranking\nplatform that is reusable across various use cases, such as browsing and\nsearch, and that is able to cater to millions of items and customers under\nheavy load (thousands of requests per second). We employ transformer-based\nmodels through different ranking layers which can learn complex behavior\npatterns directly from customer action sequences while being able to\nincorporate temporal (e.g. in-session) and contextual information. 
We validate\nour system through a series of comprehensive offline and online real-world\nexperiments at a large online e-commerce platform, and we demonstrate its\nsuperiority when compared to existing systems, both in terms of customer\nexperience and net revenue. Finally, we share the lessons learned\nfrom building a comprehensive, modern ranking platform for use in a large-scale\ne-commerce environment.\n","authors":["Marjan Celikik","Jacek Wasilewski","Ana Peleteiro Ramallo","Alexey Kurennoy","Evgeny Labzin","Danilo Ascione","Tural Gurbanov","Géraud Le Falher","Andrii Dzhoha","Ian Harris"],"pdf_url":"https://arxiv.org/pdf/2409.02856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22013v1","updated":"2024-10-29T13:02:11Z","published":"2024-10-29T13:02:11Z","title":"Modeling Temporal Positive and Negative Excitation for Sequential\n Recommendation","summary":" Sequential recommendation aims to predict the next item which interests users\nvia modeling their interest in items over time. Most of the existing works on\nsequential recommendation model users' dynamic interest in specific items while\noverlooking users' static interest revealed by some static attribute\ninformation of items, e.g., category, or brand. Moreover, existing works often\nonly consider the positive excitation of a user's historical interactions on\nhis/her next choice on candidate items while ignoring the commonly existing\nnegative excitation, resulting in insufficient modeling of dynamic interest.\nOverlooking static interest and negative excitation leads to incomplete\ninterest modeling and thus impedes recommendation performance. To this end,\nin this paper, we propose modeling both static interest and negative excitation\nfor dynamic interest to further improve the recommendation performance.\nAccordingly, we design a novel Static-Dynamic Interest Learning (SDIL)\nframework featured with a novel Temporal Positive and Negative Excitation\nModeling (TPNE) module for accurate sequential recommendation. TPNE is\nspecially designed for comprehensively modeling dynamic interest based on\ntemporal positive and negative excitation learning. Extensive experiments on\nthree real-world datasets show that SDIL can effectively capture both static\nand dynamic interest and outperforms state-of-the-art baselines.\n","authors":["Chengkai Huang","Shoujin Wang","Xianzhi Wang","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2410.22013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21967v1","updated":"2024-10-29T11:51:06Z","published":"2024-10-29T11:51:06Z","title":"Dual Conditional Diffusion Models for Sequential Recommendation","summary":" Recent advancements in diffusion models have shown promising results in\nsequential recommendation (SR). However, current diffusion-based methods still\nexhibit two key limitations. First, they implicitly model the diffusion process\nfor target item embeddings rather than the discrete target item itself, leading\nto inconsistency in the recommendation process. Second, existing methods rely\non either implicit or explicit conditional diffusion models, limiting their\nability to fully capture the context of user behavior and leading to less\nrobust target item embeddings. In this paper, we propose the Dual Conditional\nDiffusion Models for Sequential Recommendation (DCRec), introducing a\ndiscrete-to-continuous sequential recommendation diffusion framework.
Our\nframework introduces a complete Markov chain to model the transition from the\nreversed target item representation to the discrete item index, bridging the\ndiscrete and continuous item spaces for diffusion models and ensuring\nconsistency with the diffusion framework. Building on this framework, we\npresent the Dual Conditional Diffusion Transformer (DCDT) that incorporates the\nimplicit conditional and the explicit conditional for diffusion-based SR.\nExtensive experiments on public benchmark datasets demonstrate that DCRec\noutperforms state-of-the-art methods.\n","authors":["Hongtao Huang","Chengkai Huang","Xiaojun Chang","Wen Hu","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2410.21967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21892v1","updated":"2024-10-29T09:36:59Z","published":"2024-10-29T09:36:59Z","title":"Guided Diffusion-based Counterfactual Augmentation for Robust\n Session-based Recommendation","summary":" Session-based recommendation (SR) models aim to recommend top-K items to a\nuser, based on the user's behaviour during the current session. Several SR\nmodels are proposed in the literature; however, concerns have been raised about\ntheir susceptibility to inherent biases in the training data (observed data)\nsuch as popularity bias. SR models, when trained on biased training data, may\nencounter performance challenges on out-of-distribution data in real-world\nscenarios. One way to mitigate popularity bias is counterfactual data\naugmentation. Compared to prior works that rely on generating data using SR\nmodels, we focus on utilizing the capabilities of state-of-the-art diffusion\nmodels for generating counterfactual data. We propose a guided diffusion-based\ncounterfactual augmentation framework for SR. Through a combination of offline\nand online experiments on a real-world and simulated dataset, respectively, we\nshow that our approach performs significantly better than the baseline SR\nmodels and other state-of-the-art augmentation frameworks. More importantly,\nour framework shows significant improvement on less popular target items, by\nachieving up to 20% gain in Recall and 13% gain in CTR on real-world and\nsimulated datasets, respectively.\n","authors":["Muskan Gupta","Priyanka Gupta","Lovekesh Vig"],"pdf_url":"https://arxiv.org/pdf/2410.21892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09142v2","updated":"2024-10-29T09:13:49Z","published":"2024-03-14T07:40:54Z","title":"USimAgent: Large Language Models for Simulating Search Users","summary":" Due to its advantages in cost-efficiency and reproducibility, user\nsimulation has become a promising solution to the user-centric evaluation of\ninformation retrieval systems. Nonetheless, accurately simulating user search\nbehaviors has long been a challenge, because users' actions in search are\nhighly complex and driven by intricate cognitive processes such as learning,\nreasoning, and planning. Recently, Large Language Models (LLMs) have\ndemonstrated remarkable potential in simulating human-level intelligence and have\nbeen used in building autonomous agents for various tasks. However, the\npotential of using LLMs in simulating search behaviors has not yet been fully\nexplored. In this paper, we introduce an LLM-based user search behavior\nsimulator, USimAgent. The proposed simulator can simulate users' querying,\nclicking, and stopping behaviors during search, and thus, is capable of\ngenerating complete search sessions for specific search tasks.
Empirical\ninvestigation on a real user behavior dataset shows that the proposed simulator\noutperforms existing methods in query generation and is comparable to\ntraditional methods in predicting user clicks and stopping behaviors. These\nresults not only validate the effectiveness of using LLMs for user simulation\nbut also shed light on the development of more robust and generic user\nsimulators. The code and data are accessible at\nhttps://github.com/Meow-E/USimAgent.\n","authors":["Erhan Zhang","Xingzhu Wang","Peiyuan Gong","Yankai Lin","Jiaxin Mao"],"pdf_url":"https://arxiv.org/pdf/2403.09142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21876v1","updated":"2024-10-29T09:11:28Z","published":"2024-10-29T09:11:28Z","title":"Application of Audio Fingerprinting Techniques for Real-Time Scalable\n Speech Retrieval and Speech Clusterization","summary":" Audio fingerprinting techniques have seen great advances in recent years,\nenabling accurate and fast audio retrieval even when the queried\naudio sample has been highly deteriorated or recorded in noisy conditions.\nExpectedly, most of the existing work is centered around music, with popular\nmusic identification services such as Apple's Shazam or Google's Now Playing\ndesigned for individual audio recognition on mobile devices. However, the\nspectral content of speech differs from that of music, necessitating\nmodifications to current audio fingerprinting approaches. This paper offers\nfresh insights into adapting existing techniques to address the specialized\nchallenge of speech retrieval in telecommunications and cloud communications\nplatforms. The focus is on achieving rapid and accurate audio retrieval in\nbatch processing instead of facilitating single requests, typically on a\ncentralized server. Moreover, the paper demonstrates how this approach can be\nutilized to support audio clustering based on speech transcripts without\nundergoing actual speech-to-text conversion. This optimization enables\nsignificantly faster processing without the need for GPU computing, a\nrequirement for real-time operation that is typically associated with\nstate-of-the-art speech-to-text tools.\n","authors":["Kemal Altwlkany","Sead Delalić","Adis Alihodžić","Elmedin Selmanović","Damir Hasić"],"pdf_url":"https://arxiv.org/pdf/2410.21876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21801v1","updated":"2024-10-29T07:13:47Z","published":"2024-10-29T07:13:47Z","title":"PerSRV: Personalized Sticker Retrieval with Vision-Language Model","summary":" Instant Messaging is a popular means for daily communication, allowing users\nto send text and stickers. As the saying goes, \"a picture is worth a thousand\nwords\", so developing an effective sticker retrieval technique is crucial for\nenhancing user experience. However, existing sticker retrieval methods rely on\nlabeled data to interpret stickers, and general-purpose Vision-Language Models\n(VLMs) often struggle to capture the unique semantics of stickers.\nAdditionally, relevance-based sticker retrieval methods lack personalization,\ncreating a gap between diverse user expectations and retrieval results. To\naddress these, we propose the Personalized Sticker Retrieval with\nVision-Language Model framework, namely PerSRV, structured into offline\ncalculations and online processing modules.
The online retrieval part follows\nthe paradigm of relevant recall and personalized ranking, supported by the\noffline pre-calculation parts, which are sticker semantic understanding,\nutility evaluation and personalization modules. Firstly, for sticker-level\nsemantic understanding, we performed supervised fine-tuning of LLaVA-1.5-7B to generate\nhuman-like sticker semantics, complemented by textual content extracted from\nfigures and historical interaction queries. Secondly, we investigate three\ncrowd-sourcing metrics for sticker utility evaluation. Thirdly, we cluster\nstyle centroids based on users' historical interactions to achieve personal\npreference modeling. Finally, we evaluate our proposed PerSRV method on a\npublic sticker retrieval dataset from WeChat, containing 543,098 candidates and\n12,568 interactions. Experimental results show that PerSRV significantly\noutperforms existing methods in multi-modal sticker retrieval. Additionally,\nour fine-tuned VLM delivers notable improvements in sticker semantic\nunderstanding.\n","authors":["Heng Er Metilda Chee","Jiayin Wang","Zhiqiang Guo","Weizhi Ma","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.21801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21745v1","updated":"2024-10-29T05:18:34Z","published":"2024-10-29T05:18:34Z","title":"A Dual Adaptive Assignment Approach for Robust Graph-Based Clustering","summary":" Graph clustering is an essential aspect of network analysis that involves\ngrouping nodes into separate clusters. Recent developments in deep learning\nhave resulted in advanced deep graph clustering techniques, which have proven\neffective in many applications. Nonetheless, these methods often encounter\ndifficulties when dealing with the complexities of real-world graphs,\nparticularly in the presence of noisy edges. Additionally, many denoising graph\nclustering strategies tend to suffer from lower performance compared to their\nnon-denoised counterparts, training instability, and challenges in scaling to\nlarge datasets. To tackle these issues, we introduce a new framework called the\nDual Adaptive Assignment Approach for Robust Graph-Based Clustering (RDSA).\nRDSA consists of three key components: (i) a node embedding module that\neffectively integrates the graph's topological features and node attributes;\n(ii) a structure-based soft assignment module that improves graph modularity by\nutilizing an affinity matrix for node assignments; and (iii) a node-based soft\nassignment module that identifies community landmarks and refines node\nassignments to enhance the model's robustness. We assess RDSA on various\nreal-world datasets, demonstrating its superior performance relative to\nexisting state-of-the-art methods. Our findings indicate that RDSA provides\nrobust clustering across different graph types, excelling in clustering\neffectiveness and robustness, including adaptability to noise, stability, and\nscalability.\n","authors":["Yang Xiang","Li Fan","Tulika Saha","Yushan Pan","Haiyang Zhang","Chengtao Ji"],"pdf_url":"https://arxiv.org/pdf/2410.21745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17276v2","updated":"2024-10-29T04:32:17Z","published":"2024-10-08T00:23:17Z","title":"Evaluating Performance and Bias of Negative Sampling in Large-Scale\n Sequential Recommendation Models","summary":" Large-scale industrial recommendation models predict the most relevant items\nfrom catalogs containing millions or billions of options.
To train these models\nefficiently, a small set of irrelevant items (negative samples) is selected\nfrom the vast catalog for each relevant item (positive example), helping the\nmodel distinguish between relevant and irrelevant items. Choosing the right\nnegative sampling method is a common challenge. We address this by implementing\nand comparing various negative sampling methods - random, popularity-based,\nin-batch, mixed, adaptive, and adaptive with mixed variants - on modern\nsequential recommendation models. Our experiments, including hyperparameter\noptimization and 20x repeats on three benchmark datasets with varying\npopularity biases, show how the choice of method and dataset characteristics\nimpact key model performance metrics. We also reveal that average performance\nmetrics often hide imbalances across popularity bands (head, mid, tail). We\nfind that commonly used random negative sampling reinforces popularity bias and\nperforms best for head items. Popularity-based methods (in-batch and global\npopularity negative sampling) can offer balanced performance at the cost of\nlower overall model performance results. Our study serves as a practical guide\nto the trade-offs in selecting a negative sampling method for large-scale\nsequential recommendation models. Code, datasets, experimental results and\nhyperparameters are available at:\nhttps://github.com/apple/ml-negative-sampling.\n","authors":["Arushi Prakash","Dimitrios Bermperidis","Srivas Chennu"],"pdf_url":"https://arxiv.org/pdf/2410.17276v2.pdf","comment":"Workshop for Large Recommender Systems (LargeRecSys), 18th ACM\n Conference on Recommender Systems, 2024, Bari, Italy"},{"id":"http://arxiv.org/abs/2311.10764v2","updated":"2024-10-29T02:56:26Z","published":"2023-11-15T06:36:11Z","title":"Deep Group Interest Modeling of Full Lifelong User Behaviors for CTR\n Prediction","summary":" Extracting users' interests from their lifelong behavior sequence is crucial\nfor predicting Click-Through Rate (CTR). Most current methods employ a\ntwo-stage process for efficiency: they first select historical behaviors\nrelated to the candidate item and then deduce the user's interest from this\nnarrowed-down behavior sub-sequence. This two-stage paradigm, though effective,\nleads to information loss. Solely using users' lifelong click behaviors doesn't\nprovide a complete picture of their interests, leading to suboptimal\nperformance. In our research, we introduce the Deep Group Interest Network\n(DGIN), an end-to-end method to model the user's entire behavior history. This\nincludes all post-registration actions, such as clicks, cart additions,\npurchases, and more, providing a nuanced user understanding. We start by\ngrouping the full range of behaviors using a relevant key (like item_id) to\nenhance efficiency. This process reduces the behavior length significantly,\nfrom O(10^4) to O(10^2). To mitigate the potential loss of information due to\ngrouping, we incorporate two categories of group attributes. Within each group,\nwe calculate statistical information on various heterogeneous behaviors (like\nbehavior counts) and employ self-attention mechanisms to highlight unique\nbehavior characteristics (like behavior type). Based on this reorganized\nbehavior data, the user's interests are derived using the Transformer\ntechnique. Additionally, we identify a subset of behaviors that share the same\nitem_id with the candidate item from the lifelong behavior sequence. 
The\ninsights from this subset reveal the user's decision-making process related to\nthe candidate item, improving prediction accuracy. Our comprehensive\nevaluation, both on industrial and public datasets, validates DGIN's efficacy\nand efficiency.\n","authors":["Qi Liu","Xuyang Hou","Haoran Jin","Xiaolong Chen","Jin Chen","Defu Lian","Zhe Wang","Jia Cheng","Jun Lei"],"pdf_url":"https://arxiv.org/pdf/2311.10764v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.21144v2","updated":"2024-10-29T16:25:34Z","published":"2024-10-28T15:44:35Z","title":"Enhancing Learned Image Compression via Cross Window-based Attention","summary":" In recent years, learned image compression methods have demonstrated superior\nrate-distortion performance compared to traditional image compression methods.\nRecent methods utilize convolutional neural networks (CNN), variational\nautoencoders (VAE), invertible neural networks (INN), and transformers. Despite\ntheir significant contributions, a main drawback of these models is their poor\nperformance in capturing local redundancy. Therefore, to leverage global\nfeatures along with local redundancy, we propose a CNN-based solution\nintegrated with a feature encoding module. The feature encoding module encodes\nimportant features before feeding them to the CNN and then utilizes cross-scale\nwindow-based attention, which further captures local redundancy. Cross-scale\nwindow-based attention is inspired by the attention mechanism in transformers\nand effectively enlarges the receptive field. Both the feature encoding module\nand the cross-scale window-based attention module in our architecture are\nflexible and can be incorporated into any other network architecture. We\nevaluate our method on the Kodak and CLIC datasets and demonstrate that our\napproach is effective and on par with state-of-the-art methods.\n","authors":["Priyanka Mudgal","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.21144v2.pdf","comment":"Paper accepted and presented in ISVC'24. Copyrights stay with ISVC"},{"id":"http://arxiv.org/abs/2410.22112v1","updated":"2024-10-29T15:11:45Z","published":"2024-10-29T15:11:45Z","title":"Multimodal Semantic Communication for Generative Audio-Driven Video\n Conferencing","summary":" This paper studies an efficient multimodal data communication scheme for\nvideo conferencing. In our considered system, a speaker gives a talk to the\naudiences, with talking head video and audio being transmitted. Since the\nspeaker does not frequently change posture and high-fidelity transmission of\naudio (speech and music) is required, redundant visual video data exists and\ncan be removed by generating the video from the audio. To this end, we propose\na wave-to-video (Wav2Vid) system, an efficient video transmission framework\nthat reduces transmitted data by generating talking head video from audio. In\nparticular, full-duration audio and short-duration video data are synchronously\ntransmitted through a wireless channel, with neural networks (NNs) extracting\nand encoding audio and video semantics. The receiver then combines the decoded\naudio and video data, as well as uses a generative adversarial network (GAN)\nbased model to generate the lip movement videos of the speaker. 
Simulation\nresults show that the proposed Wav2Vid system can reduce the amount of\ntransmitted data by up to 83% while maintaining the perceptual quality of the\ngenerated conferencing video.\n","authors":["Haonan Tong","Haopeng Li","Hongyang Du","Zhaohui Yang","Changchuan Yin","Dusit Niyato"],"pdf_url":"https://arxiv.org/pdf/2410.22112v1.pdf","comment":"accepted by IEEE Wireless Communications Letters"},{"id":"http://arxiv.org/abs/2310.16334v2","updated":"2024-10-29T14:53:47Z","published":"2023-10-25T03:30:37Z","title":"Structured Multi-Track Accompaniment Arrangement via Style Prior\n Modelling","summary":" In the realm of music AI, arranging rich and structured multi-track\naccompaniments from a simple lead sheet presents significant challenges. Such\nchallenges include maintaining track cohesion, ensuring long-term coherence,\nand optimizing computational efficiency. In this paper, we introduce a novel\nsystem that leverages prior modelling over disentangled style factors to\naddress these challenges. Our method presents a two-stage process: initially, a\npiano arrangement is derived from the lead sheet by retrieving piano texture\nstyles; subsequently, a multi-track orchestration is generated by infusing\norchestral function styles into the piano arrangement. Our key design is the\nuse of vector quantization and a unique multi-stream Transformer to model the\nlong-term flow of the orchestration style, which enables flexible,\ncontrollable, and structured music generation. Experiments show that by\nfactorizing the arrangement task into interpretable sub-stages, our approach\nenhances generative capacity while improving efficiency. Additionally, our\nsystem supports a variety of music genres and provides style control at\ndifferent composition hierarchies. We further show that our system achieves\nsuperior coherence, structure, and overall arrangement quality compared to\nexisting baselines.\n","authors":["Jingwei Zhao","Gus Xia","Ziyu Wang","Ye Wang"],"pdf_url":"https://arxiv.org/pdf/2310.16334v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.22046v1","updated":"2024-10-29T13:53:09Z","published":"2024-10-29T13:53:09Z","title":"CHORDONOMICON: A Dataset of 666,000 Songs and their Chord Progressions","summary":" Chord progressions encapsulate important information about music, pertaining\nto its structure and conveyed emotions. They serve as the backbone of musical\ncomposition, and in many cases, they are the sole information required for a\nmusician to play along and follow the music. Despite their importance, chord\nprogressions as a data domain remain underexplored. There is a lack of\nlarge-scale datasets suitable for deep learning applications, and limited\nresearch exploring chord progressions as an input modality. In this work, we\npresent Chordonomicon, a dataset of over 666,000 songs and their chord\nprogressions, annotated with structural parts, genre, and release date -\ncreated by scraping various sources of user-generated progressions and\nassociated metadata. We demonstrate the practical utility of the Chordonomicon\ndataset for classification and generation tasks, and discuss its potential to\nprovide valuable insights to the research community. Chord progressions are\nunique in their ability to be represented in multiple formats (e.g., text,\ngraph) and the wealth of information chords convey in given contexts, such as\ntheir harmonic function. 
These characteristics make the Chordonomicon an ideal\ntestbed for exploring advanced machine learning techniques, including\ntransformers, graph machine learning, and hybrid systems that combine knowledge\nrepresentation and machine learning.\n","authors":["Spyridon Kantarelis","Konstantinos Thomas","Vassilis Lyberatos","Edmund Dervakos","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2410.22046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11832v2","updated":"2024-10-29T06:44:36Z","published":"2024-06-17T17:59:44Z","title":"Unveiling Encoder-Free Vision-Language Models","summary":" Existing vision-language models (VLMs) mostly rely on vision encoders to\nextract visual features followed by large language models (LLMs) for\nvisual-language tasks. However, the vision encoders set a strong inductive bias\nin abstracting visual representation, e.g., resolution, aspect ratio, and\nsemantic priors, which could impede the flexibility and efficiency of the VLMs.\nTraining pure VLMs that accept seamless vision and language inputs, i.e.,\nwithout vision encoders, remains challenging and rarely explored. Empirical\nobservations reveal that direct training without encoders results in slow\nconvergence and large performance gaps. In this work, we bridge the gap between\nencoder-based and encoder-free models, and present a simple yet effective\ntraining recipe towards pure VLMs. Specifically, we unveil the key aspects of\ntraining encoder-free VLMs efficiently via thorough experiments: (1) Bridging\nvision-language representation inside one unified decoder; (2) Enhancing visual\nrecognition capability via extra supervision. With these strategies, we launch\nEVE, an encoder-free vision-language model that can be trained and forwarded\nefficiently. Notably, solely utilizing 35M publicly accessible data, EVE can\nimpressively rival the encoder-based VLMs of similar capacities across multiple\nvision-language benchmarks. It significantly outperforms the counterpart\nFuyu-8B, which relies on mysterious training procedures and undisclosed training data. We\nbelieve that EVE provides a transparent and efficient route for developing a\npure decoder-only architecture across modalities. Our code and models are\npublicly available at: https://github.com/baaivision/EVE.\n","authors":["Haiwen Diao","Yufeng Cui","Xiaotong Li","Yueze Wang","Huchuan Lu","Xinlong Wang"],"pdf_url":"https://arxiv.org/pdf/2406.11832v2.pdf","comment":"17 pages, 8 figures, Accepted by NeurIPS2024 (spotlight)"},{"id":"http://arxiv.org/abs/2410.21169v2","updated":"2024-10-29T06:32:24Z","published":"2024-10-28T16:11:35Z","title":"Document Parsing Unveiled: Techniques, Challenges, and Prospects for\n Structured Information Extraction","summary":" Document parsing is essential for converting unstructured and semi-structured\ndocuments - such as contracts, academic papers, and invoices - into structured,\nmachine-readable data. Document parsing extracts reliable structured data from\nunstructured inputs, providing great convenience for numerous applications.\nEspecially with recent achievements in Large Language Models, document parsing\nplays an indispensable role in both knowledge base construction and training\ndata generation. This survey presents a comprehensive review of the current\nstate of document parsing, covering key methodologies, from modular pipeline\nsystems to end-to-end models driven by large vision-language models. 
Core\ncomponents such as layout detection, content extraction (including text,\ntables, and mathematical expressions), and multi-modal data integration are\nexamined in detail. Additionally, this paper discusses the challenges faced by\nmodular document parsing systems and vision-language models in handling complex\nlayouts, integrating multiple modules, and recognizing high-density text. It\nemphasizes the importance of developing larger and more diverse datasets and\noutlines future research directions.\n","authors":["Qintong Zhang","Victor Shea-Jay Huang","Bin Wang","Junyuan Zhang","Zhengren Wang","Hao Liang","Shawn Wang","Matthieu Lin","Conghui He","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.21169v2.pdf","comment":null}]},"2024-10-28T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2404.08660v2","updated":"2024-10-28T22:29:03Z","published":"2024-03-27T18:53:04Z","title":"How Does Message Passing Improve Collaborative Filtering?","summary":" Collaborative filtering (CF) has exhibited prominent results for recommender\nsystems and been broadly utilized for real-world applications. A branch of\nresearch enhances CF methods by message passing used in graph neural networks,\ndue to its strong capabilities of extracting knowledge from graph-structured\ndata, like user-item bipartite graphs that naturally exist in CF. They assume\nthat message passing helps CF methods in a manner akin to its benefits for\ngraph-based learning tasks in general. However, even though message passing\nempirically improves CF, whether or not this assumption is correct still needs\nverification. To address this gap, we formally investigate why message passing\nhelps CF from multiple perspectives and show that many assumptions made by\nprevious works are not entirely accurate. With our curated ablation studies and\ntheoretical analyses, we discover that (1) message passing improves the CF\nperformance primarily by additional representations passed from neighbors\nduring the forward pass instead of additional gradient updates to neighbor\nrepresentations during the model back-propagation, and (2) message passing\nusually helps low-degree nodes more than high-degree nodes. Utilizing these\nnovel findings, we present Test-time Aggregation for CF, namely TAG-CF, a\ntest-time augmentation framework that only conducts message passing once at\ninference time. The key novelty of TAG-CF is that it effectively utilizes graph\nknowledge while circumventing most of the notorious computational overheads of\nmessage passing. Besides, TAG-CF is extremely versatile and can be used as a\nplug-and-play module to enhance representations trained by different CF\nsupervision signals. Evaluated on six datasets, TAG-CF consistently improves\nthe recommendation performance of CF methods without graph by up to 39.2% on\ncold users and 31.7% on all users, with little to no extra computational\noverheads.\n","authors":["Mingxuan Ju","William Shiao","Zhichun Guo","Yanfang Ye","Yozen Liu","Neil Shah","Tong Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.08660v2.pdf","comment":"Accepted to NeurIPS'24. Code available at:\n https://github.com/snap-research/Test-time-Aggregation-for-CF"},{"id":"http://arxiv.org/abs/2410.21549v1","updated":"2024-10-28T21:25:38Z","published":"2024-10-28T21:25:38Z","title":"Semantic Search Evaluation","summary":" We propose a novel method for evaluating the performance of a content search\nsystem that measures the semantic match between a query and the results\nreturned by the search system. 
We introduce a metric called \"on-topic rate\" to\nmeasure the percentage of results that are relevant to the query. To achieve\nthis, we design a pipeline that defines a golden query set, retrieves the top K\nresults for each query, and sends calls to GPT-3.5 with formulated prompts. Our\nsemantic evaluation pipeline helps identify common failure patterns and set goals\nagainst the metric for relevance improvements.\n","authors":["Chujie Zheng","Jeffrey Wang","Shuqian Albee Zhang","Anand Kishore","Siddharth Singh"],"pdf_url":"https://arxiv.org/pdf/2410.21549v1.pdf","comment":"Accepted by 3rd International Workshop on Industrial Recommendation\n Systems (at CIKM 2024)"},{"id":"http://arxiv.org/abs/2410.21529v1","updated":"2024-10-28T20:55:00Z","published":"2024-10-28T20:55:00Z","title":"Can Users Detect Biases or Factual Errors in Generated Responses in\n Conversational Information-Seeking?","summary":" Information-seeking dialogues span a wide range of questions, from simple\nfactoid to complex queries that require exploring multiple facets and\nviewpoints. When performing exploratory searches in unfamiliar domains, users\nmay lack background knowledge and struggle to verify the system-provided\ninformation, making them vulnerable to misinformation. We investigate the\nlimitations of response generation in conversational information-seeking\nsystems, highlighting potential inaccuracies, pitfalls, and biases in the\nresponses. The study addresses the problem of query answerability and the\nchallenge of response incompleteness. Our user studies explore how these issues\nimpact user experience, focusing on users' ability to identify biased,\nincorrect, or incomplete responses. We design two crowdsourcing tasks to assess\nuser experience with different system response variants, highlighting critical\nissues to be addressed in future conversational information-seeking research.\nOur analysis reveals that it is easier for users to detect response\nincompleteness than query answerability, and user satisfaction is mostly\nassociated with response diversity, not factual correctness.\n","authors":["Weronika Łajewska","Krisztian Balog","Damiano Spina","Johanne Trippas"],"pdf_url":"https://arxiv.org/pdf/2410.21529v1.pdf","comment":"Extended version of the paper that appeared in the Proceedings of the\n 2024 Annual International ACM SIGIR Conference on Research and Development in\n Information Retrieval in the Asia Pacific Region (SIGIR-AP '24)"},{"id":"http://arxiv.org/abs/2410.21487v1","updated":"2024-10-28T19:52:09Z","published":"2024-10-28T19:52:09Z","title":"Enhancing CTR Prediction in Recommendation Domain with Search Query\n Representation","summary":" Many platforms, such as e-commerce websites, offer both search and\nrecommendation services simultaneously to better meet users' diverse needs.\nRecommendation services suggest items based on user preferences, while search\nservices allow users to search for items before providing recommendations.\nSince users and items are often shared between the search and recommendation\ndomains, there is a valuable opportunity to enhance the recommendation domain\nby leveraging user preferences extracted from the search domain. 
Existing\napproaches either overlook the shift in user intention between these domains or\nfail to capture the significant impact of learning from users' search queries\non understanding their interests.\n In this paper, we propose a framework that learns from user search query\nembeddings within the context of user preferences in the recommendation domain.\nSpecifically, user search query sequences from the search domain are used to\npredict the items users will click at the next time point in the recommendation\ndomain. Additionally, the relationship between queries and items is explored\nthrough contrastive learning. To address issues of data sparsity, a diffusion\nmodel is incorporated to infer positive items the user will select after\nsearching with certain queries in a denoising manner, which is particularly\neffective in preventing false positives. Once this information is effectively\nextracted, the queries are integrated into click-through rate prediction in\nthe recommendation domain. Experimental analysis demonstrates that our model\noutperforms state-of-the-art models in the recommendation domain.\n","authors":["Yuening Wang","Man Chen","Yaochen Hu","Wei Guo","Yingxue Zhang","Huifeng Guo","Yong Liu","Mark Coates"],"pdf_url":"https://arxiv.org/pdf/2410.21487v1.pdf","comment":"Accepted by CIKM 2024 Full Research Track"},{"id":"http://arxiv.org/abs/2410.21484v1","updated":"2024-10-28T19:49:53Z","published":"2024-10-28T19:49:53Z","title":"A Systematic Review of Machine Learning in Sports Betting: Techniques,\n Challenges, and Future Directions","summary":" The sports betting industry has experienced rapid growth, driven largely by\ntechnological advancements and the proliferation of online platforms. Machine\nlearning (ML) has played a pivotal role in the transformation of this sector by\nenabling more accurate predictions, dynamic odds-setting, and enhanced risk\nmanagement for both bookmakers and bettors. This systematic review explores\nvarious ML techniques, including support vector machines, random forests, and\nneural networks, as applied in different sports such as soccer, basketball,\ntennis, and cricket. These models utilize historical data, in-game statistics,\nand real-time information to optimize betting strategies and identify value\nbets, ultimately improving profitability. For bookmakers, ML facilitates\ndynamic odds adjustment and effective risk management, while bettors leverage\ndata-driven insights to exploit market inefficiencies. This review also\nunderscores the role of ML in fraud detection, where anomaly detection models\nare used to identify suspicious betting patterns. Despite these advancements,\nchallenges such as data quality, real-time decision-making, and the inherent\nunpredictability of sports outcomes remain. Ethical concerns related to\ntransparency and fairness are also of significant importance. Future research\nshould focus on developing adaptive models that integrate multimodal data and\nmanage risk in a manner akin to financial portfolios. 
This review provides a\ncomprehensive examination of the current applications of ML in sports betting,\nand highlights both the potential and the limitations of these technologies.\n","authors":["René Manassé Galekwa","Jean Marie Tshimula","Etienne Gael Tajeuna","Kyamakya Kyandoghere"],"pdf_url":"https://arxiv.org/pdf/2410.21484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15239v3","updated":"2024-10-28T17:52:17Z","published":"2024-07-21T18:08:44Z","title":"Assessing Brittleness of Image-Text Retrieval Benchmarks from\n Vision-Language Models Perspective","summary":" We examine the brittleness of the image-text retrieval (ITR) evaluation\npipeline with a focus on concept granularity. We start by analyzing two common\nbenchmarks, MS-COCO and Flickr30k, and compare them with augmented,\nfine-grained versions, MS-COCO-FG and Flickr30k-FG, given a specified set of\nlinguistic features capturing concept granularity. Flickr30k-FG and MS-COCO-FG\nconsistently give rise to higher scores across all the selected features. To\nfurther our understanding of the impact of granularity, we consider a novel\ntaxonomy of query perturbations. We apply these perturbations to the selected\ndatasets. We evaluate four diverse state-of-the-art Vision-Language models on\nboth the standard and fine-grained datasets under zero-shot conditions, with\nand without the applied perturbations. The results demonstrate that although\nperturbations generally degrade model performance, the fine-grained datasets\nexhibit a smaller performance drop than their standard counterparts. The\nrelative performance drop across all setups is consistent across all models and\ndatasets, indicating that the issue lies within the benchmarks themselves. We\nconclude by providing an agenda for improving ITR evaluation pipelines.\n","authors":["Mariya Hendriksen","Shuo Zhang","Ridho Reinanda","Mohamed Yahya","Edgar Meij","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2407.15239v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21242v1","updated":"2024-10-28T17:40:40Z","published":"2024-10-28T17:40:40Z","title":"Zero-Shot Dense Retrieval with Embeddings from Relevance Feedback","summary":" Building effective dense retrieval systems remains difficult when relevance\nsupervision is not available. Recent work has looked to overcome this challenge\nby using a Large Language Model (LLM) to generate hypothetical documents that\ncan be used to find the closest real document. However, this approach relies\nsolely on the LLM to have domain-specific knowledge relevant to the query,\nwhich may not be practical. Furthermore, generating hypothetical documents can\nbe inefficient as it requires the LLM to generate a large number of tokens for\neach query. To address these challenges, we introduce Real Document Embeddings\nfrom Relevance Feedback (ReDE-RF). Inspired by relevance feedback, ReDE-RF\nproposes to re-frame hypothetical document generation as a relevance estimation\ntask, using an LLM to select which documents should be used for nearest\nneighbor search. Through this re-framing, the LLM no longer needs\ndomain-specific knowledge but only needs to judge what is relevant.\nAdditionally, relevance estimation only requires the LLM to output a single\ntoken, thereby improving search latency. 
Our experiments show that ReDE-RF\nconsistently surpasses state-of-the-art zero-shot dense retrieval methods\nacross a wide range of low-resource retrieval datasets while also making\nsignificant improvements in per-query latency.\n","authors":["Nour Jedidi","Yung-Sung Chuang","Leslie Shing","James Glass"],"pdf_url":"https://arxiv.org/pdf/2410.21242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21220v1","updated":"2024-10-28T17:04:18Z","published":"2024-10-28T17:04:18Z","title":"Vision Search Assistant: Empower Vision-Language Models as Multimodal\n Search Engines","summary":" Search engines enable the retrieval of unknown information with text.\nHowever, traditional methods fall short when it comes to understanding\nunfamiliar visual content, such as identifying an object that the model has\nnever seen before. This challenge is particularly pronounced for large\nvision-language models (VLMs): if the model has not been exposed to the object\ndepicted in an image, it struggles to generate reliable answers to the user's\nquestion regarding that image. Moreover, as new objects and events continuously\nemerge, frequently updating VLMs is impractical due to heavy computational\nburdens. To address this limitation, we propose Vision Search Assistant, a\nnovel framework that facilitates collaboration between VLMs and web agents.\nThis approach leverages VLMs' visual understanding capabilities and web agents'\nreal-time information access to perform open-world Retrieval-Augmented\nGeneration via the web. By integrating visual and textual representations\nthrough this collaboration, the model can provide informed responses even when\nthe image is novel to the system. Extensive experiments conducted on both\nopen-set and closed-set QA benchmarks demonstrate that the Vision Search\nAssistant significantly outperforms the other models and can be widely applied\nto existing VLMs.\n","authors":["Zhixin Zhang","Yiyuan Zhang","Xiaohan Ding","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2410.21220v1.pdf","comment":"Code is available at https://github.com/cnzzx/VSA"},{"id":"http://arxiv.org/abs/2410.19177v2","updated":"2024-10-28T15:51:46Z","published":"2024-10-24T22:13:48Z","title":"Sentiment-Driven Community Detection in a Network of Perfume Preferences","summary":" Network analysis is increasingly important across various fields, including\nthe fragrance industry, where perfumes are represented as nodes and shared user\npreferences as edges in perfume networks. Community detection can uncover\nclusters of similar perfumes, providing insights into consumer preferences,\nenhancing recommendation systems, and informing targeted marketing strategies.\n This study aims to apply community detection techniques to group perfumes\nfavored by users into relevant clusters for better recommendations. We\nconstructed a bipartite network from user reviews on the Persian retail\nplatform \"Atrafshan,\" with nodes representing users and perfumes, and edges\nformed by positive comments. This network was transformed into a Perfume\nCo-Preference Network, connecting perfumes liked by the same users. By applying\ncommunity detection algorithms, we identified clusters based on shared\npreferences, enhancing our understanding of user sentiment in the fragrance\nmarket.\n To improve sentiment analysis, we integrated emojis and a user voting system\nfor greater accuracy. 
Emojis, aligned with their Persian counterparts, captured\nthe emotional tone of reviews, while user ratings for scent, longevity, and\nsillage refined sentiment classification. Edge weights were adjusted by\ncombining adjacency values with user ratings in a 60:40 ratio, reflecting both\nconnection strength and user preferences. These enhancements led to improved\nmodularity of detected communities, resulting in more accurate perfume\ngroupings.\n This research pioneers the use of community detection in perfume networks,\noffering new insights into consumer preferences. Our advancements in sentiment\nanalysis and edge weight refinement provide actionable insights for optimizing\nproduct recommendations and marketing strategies in the fragrance industry.\n","authors":["Kamand Kalashi","Sajjad Saed","Babak Teimourpour"],"pdf_url":"https://arxiv.org/pdf/2410.19177v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13399v2","updated":"2024-10-28T15:48:08Z","published":"2024-08-23T22:51:14Z","title":"Transforming Location Retrieval at Airbnb: A Journey from Heuristics to\n Reinforcement Learning","summary":" The Airbnb search system grapples with many unique challenges as it continues\nto evolve. We oversee a marketplace that is nuanced by geography, diversity of\nhomes, and guests with a variety of preferences. Crafting an efficient search\nsystem that can accommodate diverse guest needs while showcasing relevant\nhomes lies at the heart of Airbnb's success. Airbnb search has many challenges\nthat parallel other recommendation and search systems, but it has a unique\ninformation retrieval problem, upstream of ranking, called location retrieval.\nIt requires defining a topological map area that is relevant to the searched\nquery for homes listing retrieval. The purpose of this paper is to demonstrate\nthe methodology, challenges, and impact of building a machine learning-based\nlocation retrieval product from the ground up. Despite the lack of suitable,\nprevalent machine learning-based approaches, we tackle cold start,\ngeneralization, differentiation and algorithmic bias. We detail the efficacy of\nheuristics, statistics, machine learning, and reinforcement learning approaches\nto solve these challenges, particularly for systems that are often unexplored\nby current literature.\n","authors":["Dillon Davis","Huiji Gao","Thomas Legrand","Weiwei Guo","Malay Haldar","Alex Deng","Han Zhao","Liwei He","Sanjeev Katariya"],"pdf_url":"https://arxiv.org/pdf/2408.13399v2.pdf","comment":"Published at CIKM 2024"},{"id":"http://arxiv.org/abs/2410.21048v1","updated":"2024-10-28T14:03:39Z","published":"2024-10-28T14:03:39Z","title":"Pay Attention to Attention for Sequential Recommendation","summary":" Transformer-based approaches have demonstrated remarkable success in various\nsequence-based tasks. However, traditional self-attention models may not\nsufficiently capture the intricate dependencies within items in sequential\nrecommendation scenarios. This is due to the lack of explicit emphasis on\nattention weights, which play a critical role in allocating attention and\nunderstanding item-to-item correlations. To better exploit the potential of\nattention weights and improve the capability of sequential recommendation in\nlearning high-order dependencies, we propose a novel sequential recommendation\n(SR) approach called attention weight refinement (AWRSR). 
AWRSR enhances the\neffectiveness of self-attention by additionally paying attention to attention\nweights, allowing for more refined attention distributions of correlations\namong items. We conduct comprehensive experiments on multiple real-world\ndatasets, demonstrating that our approach consistently outperforms\nstate-of-the-art SR models. Moreover, we provide a thorough analysis of AWRSR's\neffectiveness in capturing higher-level dependencies. These findings suggest\nthat AWRSR offers a promising new direction for enhancing the performance of\nself-attention architecture in SR tasks, with potential applications in other\nsequence-based problems as well.\n","authors":["Yuli Liu","Min Liu","Xiaojing Liu"],"pdf_url":"https://arxiv.org/pdf/2410.21048v1.pdf","comment":"Accepted at RecSys 2024"},{"id":"http://arxiv.org/abs/2410.20965v1","updated":"2024-10-28T12:36:00Z","published":"2024-10-28T12:36:00Z","title":"Simultaneous Unlearning of Multiple Protected User Attributes From\n Variational Autoencoder Recommenders Using Adversarial Training","summary":" In widely used neural network-based collaborative filtering models, users'\nhistory logs are encoded into latent embeddings that represent the users'\npreferences. In this setting, the models are capable of mapping users'\nprotected attributes (e.g., gender or ethnicity) from these user embeddings\neven without explicit access to them, resulting in models that may treat\nspecific demographic user groups unfairly and raise privacy issues. While prior\nwork has approached the removal of a single protected attribute of a user at a\ntime, multiple attributes might come into play in real-world scenarios. In the\nwork at hand, we present AdvXMultVAE, which aims to unlearn multiple protected\nattributes (exemplified by gender and age) simultaneously to improve fairness\nacross demographic user groups. For this purpose, we couple a variational\nautoencoder (VAE) architecture with adversarial training (AdvMultVAE) to\nsupport simultaneous removal of the users' protected attributes with continuous\nand/or categorical values. Our experiments on two datasets, LFM-2b-100k and\nMl-1m, from the music and movie domains, respectively, show that our approach\ncan yield better results than its singular removal counterparts (based on\nAdvMultVAE) in effectively mitigating demographic biases whilst improving the\nanonymity of latent embeddings.\n","authors":["Gustavo Escobedo","Christian Ganhör","Stefan Brandl","Mirjam Augstein","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2410.20965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19014v4","updated":"2024-10-28T11:11:04Z","published":"2024-09-24T01:40:50Z","title":"FLEX: Expert-level False-Less EXecution Metric for Reliable Text-to-SQL\n Benchmark","summary":" Text-to-SQL systems have become crucial for translating natural language into\nSQL queries in various industries, enabling non-technical users to perform\ncomplex data operations. The need for accurate evaluation methods has increased\nas these systems have grown more sophisticated. However, Execution Accuracy\n(EX), the most prevalent evaluation metric, still shows many false positives\nand negatives. Thus, this paper introduces FLEX (False-Less EXecution), a novel\napproach to evaluating text-to-SQL systems using large language models (LLMs)\nto emulate human expert-level evaluation of SQL queries. Our metric improves\nagreement with human experts (from 62 to 87.04 in Cohen's kappa) with\ncomprehensive context and sophisticated criteria. 
Our extensive experiments\nyield several key insights: (1) Models' performance increases by over 2.6\npoints on average, substantially affecting rankings on Spider and BIRD\nbenchmarks; (2) The underestimation of models in EX primarily stems from\nannotation quality issues; and (3) Model performance on particularly\nchallenging questions tends to be overestimated. This work contributes to a\nmore accurate and nuanced evaluation of text-to-SQL systems, potentially\nreshaping our understanding of state-of-the-art performance in this field.\n","authors":["Heegyu Kim","Taeyang Jeon","Seunghwan Choi","Seungtaek Choi","Hyunsouk Cho"],"pdf_url":"https://arxiv.org/pdf/2409.19014v4.pdf","comment":"preprint, under review"},{"id":"http://arxiv.org/abs/2410.20909v1","updated":"2024-10-28T10:39:08Z","published":"2024-10-28T10:39:08Z","title":"Challenges in Implementing a Recommender System for Historical Research\n in the Humanities","summary":" This extended abstract describes the challenges in implementing recommender\nsystems for digital archives in the humanities, focusing on Monasterium.net, a\nplatform for historical legal documents. We discuss three key aspects: (i) the\nunique characteristics of so-called charters as items for recommendation, (ii)\nthe complex multi-stakeholder environment, and (iii) the distinct\ninformation-seeking behavior of scholars in the humanities. By examining these\nfactors, we aim to contribute to the development of more effective and tailored\nrecommender systems for (digital) humanities research.\n","authors":["Florian Atzenhofer-Baumgartner","Bernhard C. Geiger","Christoph Trattner","Georg Vogeler","Dominik Kowald"],"pdf_url":"https://arxiv.org/pdf/2410.20909v1.pdf","comment":"Presented at AltRecSys 2024: The First Workshop on Alternative,\n Unexpected, and Critical Ideas in Recommendation, October 18, 2024,\n co-located with the ACM Conference on Recommender Systems 2024 (RecSys 2024),\n Bari, Italy"},{"id":"http://arxiv.org/abs/2410.20868v1","updated":"2024-10-28T09:36:03Z","published":"2024-10-28T09:36:03Z","title":"RecFlow: An Industrial Full Flow Recommendation Dataset","summary":" Industrial recommendation systems (RS) rely on a multi-stage pipeline to\nbalance effectiveness and efficiency when delivering items from a vast corpus\nto users. Existing RS benchmark datasets primarily focus on the exposure space,\nwhere novel RS algorithms are trained and evaluated. However, when these\nalgorithms transition to real-world industrial RS, they face a critical\nchallenge of handling unexposed items, which constitute a significantly larger\nspace than the exposed one. This discrepancy profoundly impacts their practical\nperformance. Additionally, these algorithms often overlook the intricate\ninterplay between multiple RS stages, resulting in suboptimal overall system\nperformance. To address this issue, we introduce RecFlow, an industrial full\nflow recommendation dataset designed to bridge the gap between offline RS\nbenchmarks and the real online environment. Unlike existing datasets, RecFlow\nincludes samples not only from the exposure space but also unexposed items\nfiltered at each stage of the RS funnel. 
Our dataset comprises 38M interactions\nfrom 42K users across nearly 9M items with an additional 1.9B stage samples\ncollected from 9.3M online requests over 37 days and spanning 6 stages.\nLeveraging the RecFlow dataset, we conduct bold exploration experiments,\nshowcasing its potential in designing new algorithms to enhance effectiveness\nby incorporating stage-specific samples. Some of these algorithms have already\nbeen deployed online, consistently yielding significant gains. We propose\nRecFlow as the first comprehensive benchmark dataset for the RS community,\nsupporting research on designing algorithms at any stage, the study of selection\nbias, debiased algorithms, multi-stage consistency and optimality, multi-task\nrecommendation, and user behavior modeling. The RecFlow dataset, along with the\ncorresponding source code, is available at\nhttps://github.com/RecFlow-ICLR/RecFlow.\n","authors":["Qi Liu","Kai Zheng","Rui Huang","Wuchao Li","Kuo Cai","Yuan Chai","Yanan Niu","Yiqun Hui","Bing Han","Na Mou","Hongning Wang","Wentian Bao","Yunen Yu","Guorui Zhou","Han Li","Yang Song","Defu Lian","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2410.20868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20859v1","updated":"2024-10-28T09:21:15Z","published":"2024-10-28T09:21:15Z","title":"Leveraging AI and Sentiment Analysis for Forecasting Election Outcomes\n in Mauritius","summary":" This study explores the use of AI-driven sentiment analysis as a novel tool\nfor forecasting election outcomes, focusing on Mauritius' 2024 elections. In\nthe absence of reliable polling data, we analyze media sentiment toward the two\nmain political parties, L'Alliance Lepep and L'Alliance Du Changement, by\nclassifying news articles from prominent Mauritian media outlets as positive,\nnegative, or neutral. We employ a multilingual BERT-based model and a custom\nSentiment Scoring Algorithm to quantify sentiment dynamics and apply the\nSentiment Impact Score (SIS) for measuring sentiment influence over time. Our\nforecast model suggests L'Alliance Du Changement is likely to secure a minimum\nof 37 seats, while L'Alliance Lepep is predicted to obtain the remaining 23\nseats out of the 60 available. Findings indicate that positive media sentiment\nstrongly correlates with projected electoral gains, underscoring the role of\nmedia in shaping public perception. This approach not only mitigates media bias\nthrough adjusted scoring but also serves as a reliable alternative to\ntraditional polling. The study offers a scalable methodology for political\nforecasting in regions with limited polling infrastructure and contributes to\nadvancements in the field of political data science.\n","authors":["Missie Chercheur","Malkenzie Bovafiz"],"pdf_url":"https://arxiv.org/pdf/2410.20859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20820v1","updated":"2024-10-28T08:11:17Z","published":"2024-10-28T08:11:17Z","title":"Temporal Streaming Batch Principal Component Analysis for Time Series\n Classification","summary":" In multivariate time series classification, although current sequence\nanalysis models have excellent classification capabilities, they show\nsignificant shortcomings when dealing with long-sequence multivariate data,\nsuch as prolonged training times and decreased accuracy. This paper focuses on\noptimizing model performance for long-sequence multivariate data by mitigating\nthe impact of extended time series and multiple variables on the model. 
We\npropose a principal component analysis (PCA)-based temporal streaming\ncompression and dimensionality reduction algorithm for time series data\n(temporal streaming batch PCA, TSBPCA), which continuously updates the compact\nrepresentation of the entire sequence through streaming PCA time estimation\nwith time block updates, enhancing the data representation capability of a\nrange of sequence analysis models. We evaluated this method using various\nmodels on five real datasets, and the experimental results show that our method\nperforms well in terms of classification accuracy and time efficiency. Notably,\nour method demonstrates a trend of increasing effectiveness as sequence length\ngrows; on the two longest sequence datasets, accuracy improved by about 7.2%,\nand execution time decreased by 49.5%.\n","authors":["Enshuo Yan","Huachuan Wang","Weihao Xia"],"pdf_url":"https://arxiv.org/pdf/2410.20820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06941v2","updated":"2024-10-28T07:51:29Z","published":"2024-08-13T14:59:44Z","title":"OpenResearcher: Unleashing AI for Accelerated Scientific Research","summary":" The rapid growth of scientific literature imposes significant challenges for\nresearchers endeavoring to stay updated with the latest advancements in their\nfields and delve into new areas. We introduce OpenResearcher, an innovative\nplatform that leverages Artificial Intelligence (AI) techniques to accelerate\nthe research process by answering diverse questions from researchers.\nOpenResearcher is built based on Retrieval-Augmented Generation (RAG) to\nintegrate Large Language Models (LLMs) with up-to-date, domain-specific\nknowledge. Moreover, we develop various tools for OpenResearcher to understand\nresearchers' queries, search the scientific literature, filter retrieved\ninformation, provide accurate and comprehensive answers, and self-refine these\nanswers. OpenResearcher can flexibly use these tools to balance efficiency and\neffectiveness. As a result, OpenResearcher enables researchers to save time and\nincrease their potential to discover new insights and drive scientific\nbreakthroughs. Demo, video, and code are available at:\nhttps://github.com/GAIR-NLP/OpenResearcher.\n","authors":["Yuxiang Zheng","Shichao Sun","Lin Qiu","Dongyu Ru","Cheng Jiayang","Xuefeng Li","Jifan Lin","Binjie Wang","Yun Luo","Renjie Pan","Yang Xu","Qingkai Min","Zizhao Zhang","Yiwen Wang","Wenjie Li","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2408.06941v2.pdf","comment":"Accepted to Demo track of EMNLP 2024"},{"id":"http://arxiv.org/abs/2406.04553v2","updated":"2024-10-28T07:38:11Z","published":"2024-06-06T23:44:33Z","title":"Better Late Than Never: Formulating and Benchmarking Recommendation\n Editing","summary":" Recommendation systems play a pivotal role in suggesting items to users based\non their preferences. However, in online platforms, these systems inevitably\noffer unsuitable recommendations due to limited model capacity, poor data\nquality, or evolving user interests. Enhancing user experience necessitates\nefficiently rectifying such unsuitable recommendation behaviors. 
This paper\nintroduces a novel and significant task termed recommendation editing, which\nfocuses on modifying known and unsuitable recommendation behaviors.\nSpecifically, this task aims to adjust the recommendation model to eliminate\nknown unsuitable items without accessing training data or retraining the model.\nWe formally define the problem of recommendation editing with three primary\nobjectives: strict rectification, collaborative rectification, and concentrated\nrectification. Three evaluation metrics are developed to quantitatively assess\nthe achievement of each objective. We present a straightforward yet effective\nbenchmark for recommendation editing using a novel Editing Bayesian Personalized\nRanking Loss. To demonstrate the effectiveness of the proposed method, we\nestablish a comprehensive benchmark that incorporates various methods from\nrelated fields. Codebase is available at\nhttps://github.com/cycl2018/Recommendation-Editing.\n","authors":["Chengyu Lai","Sheng Zhou","Zhimeng Jiang","Qiaoyu Tan","Yuanchen Bei","Jiawei Chen","Ningyu Zhang","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2406.04553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20778v1","updated":"2024-10-28T06:39:01Z","published":"2024-10-28T06:39:01Z","title":"Beyond Positive History: Re-ranking with List-level Hybrid Feedback","summary":" As the last stage of recommender systems, re-ranking generates a re-ordered\nlist that aligns with the user's preference. However, previous works generally\nfocus on item-level positive feedback as history (e.g., only clicked items) and\nignore that users provide positive or negative feedback on items in the entire\nlist. This list-level hybrid feedback can reveal users' holistic preferences\nand reflect users' comparison behavior patterns manifesting within a list. Such\npatterns could predict user behaviors on candidate lists, thus aiding better\nre-ranking. Despite appealing benefits, extracting and integrating preferences\nand behavior patterns from list-level hybrid feedback into re-ranking multiple\nitems remains challenging. To this end, we propose Re-ranking with List-level\nHybrid Feedback (dubbed RELIFE). It captures the user's preferences and behavior\npatterns with three modules: a Disentangled Interest Miner to disentangle the\nuser's preferences into interests and disinterests, a Sequential Preference\nMixer to learn users' entangled preferences considering the context of\nfeedback, and a Comparison-aware Pattern Extractor to capture the user's behavior\npatterns within each list. Moreover, for better integration of patterns,\ncontrastive learning is adopted to align the behavior patterns of candidate and\nhistorical lists. Extensive experiments show that RELIFE significantly\noutperforms SOTA re-ranking baselines.\n","authors":["Muyan Weng","Yunjia Xi","Weiwen Liu","Bo Chen","Jianghao Lin","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2410.20778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20730v1","updated":"2024-10-28T04:49:05Z","published":"2024-10-28T04:49:05Z","title":"GPRec: Bi-level User Modeling for Deep Recommenders","summary":" GPRec explicitly categorizes users into groups in a learnable manner and\naligns them with corresponding group embeddings. We design the dual group\nembedding space to offer a diverse perspective on group preferences by\ncontrasting positive and negative patterns. 
On the individual level, GPRec\nidentifies personal preferences from ID-like features and refines the obtained\nindividual representations to be independent of group ones, thereby providing a\nrobust complement to the group-level modeling. We also present several\nstrategies for the flexible integration of GPRec into various DRS models.\nRigorous testing of GPRec on three public datasets has demonstrated significant\nimprovements in recommendation quality.\n","authors":["Yejing Wang","Dong Xu","Xiangyu Zhao","Zhiren Mao","Peng Xiang","Ling Yan","Yao Hu","Zijian Zhang","Xuetao Wei","Qidong Liu"],"pdf_url":"https://arxiv.org/pdf/2410.20730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.08063v6","updated":"2024-10-28T03:39:32Z","published":"2022-02-16T13:44:00Z","title":"Information Extraction in Low-Resource Scenarios: Survey and Perspective","summary":" Information Extraction (IE) seeks to derive structured information from\nunstructured texts, often facing challenges in low-resource scenarios due to\ndata scarcity and unseen classes. This paper presents a review of neural\napproaches to low-resource IE from \\emph{traditional} and \\emph{LLM-based}\nperspectives, systematically categorizing them into a fine-grained taxonomy.\nThen we conduct an empirical study on LLM-based methods compared with previous\nstate-of-the-art models, and discover that (1) well-tuned LMs are still\npredominant; (2) tuning open-resource LLMs and ICL with the GPT family is promising\nin general; (3) the optimal LLM-based technical solution for low-resource IE\ncan be task-dependent. In addition, we discuss low-resource IE with LLMs,\nhighlight promising applications, and outline potential research directions.\nThis survey aims to foster understanding of this field, inspire new ideas, and\nencourage widespread applications in both academia and industry.\n","authors":["Shumin Deng","Yubo Ma","Ningyu Zhang","Yixin Cao","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2202.08063v6.pdf","comment":"Accepted by 15th IEEE International Conference on Knowledge Graphs\n (ICKG2024). Paper List:\n \\url{https://github.com/zjunlp/Low-resource-KEPapers}; Data and Code: \\url{\n https://github.com/mayubo2333/LLM_project}"},{"id":"http://arxiv.org/abs/2410.20643v1","updated":"2024-10-28T00:39:22Z","published":"2024-10-28T00:39:22Z","title":"GenUP: Generative User Profilers as In-Context Learners for Next POI\n Recommender Systems","summary":" Traditional POI recommendation systems often lack transparency,\ninterpretability, and scrutability due to their reliance on dense vector-based\nuser embeddings. Furthermore, the cold-start problem -- where systems have\ninsufficient data for new users -- limits their ability to generate accurate\nrecommendations. Existing methods often address this by leveraging similar\ntrajectories from other users, but this approach can be computationally\nexpensive and increases the context length for LLM-based methods, making them\ndifficult to scale. To address these limitations, we propose a method that\ngenerates natural language (NL) user profiles from large-scale, location-based\nsocial network (LBSN) check-ins, utilizing robust personality assessments and\nbehavioral theories. These NL profiles capture user preferences, routines, and\nbehaviors, improving POI prediction accuracy while offering enhanced\ntransparency. 
By incorporating NL profiles as system prompts to LLMs, our\napproach reduces reliance on extensive historical data, while remaining\nflexible, easily updated, and computationally efficient. Our method is not only\ncompetitive with other LLM-based and complex agentic frameworks but is also\nmore scalable for real-world scenarios and on-device POI recommendations.\nResults demonstrate that our approach consistently outperforms baseline\nmethods, offering a more interpretable and resource-efficient solution for POI\nrecommendation systems. Our source code is available at:\n\\url{https://github.com/w11wo/GenUP}.\n","authors":["Wilson Wongso","Hao Xue","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2410.20643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20642v1","updated":"2024-10-28T00:38:06Z","published":"2024-10-28T00:38:06Z","title":"Collaborative Knowledge Fusion: A Novel Approach for Multi-task\n Recommender Systems via LLMs","summary":" Owing to the impressive general intelligence of large language models (LLMs),\nthere has been a growing trend to integrate them into recommender systems to\ngain a more profound insight into human interests and intentions. Existing\nLLMs-based recommender systems primarily leverage item attributes and user\ninteraction histories in textual format, improving a single task like rating\nprediction or explainable recommendation. Nevertheless, these approaches\noverlook the crucial contribution of traditional collaborative signals in\ndiscerning users' profound intentions and disregard the interrelatedness among\ntasks. To address these limitations, we introduce a novel framework known as\nCKF, specifically developed to boost multi-task recommendations via\npersonalized collaborative knowledge fusion into LLMs. Specifically, our method\nsynergizes traditional collaborative filtering models to produce collaborative\nembeddings, subsequently employing the meta-network to construct personalized\nmapping bridges tailored for each user. Once mapped, the embeddings are\nincorporated into meticulously designed prompt templates and then fed into an\nadvanced LLM to represent user interests. To investigate the intrinsic\nrelationship among diverse recommendation tasks, we develop Multi-Lora, a new\nparameter-efficient approach for multi-task optimization, adept at distinctly\nsegregating task-shared and task-specific information. This method forges a\nconnection between LLMs and recommendation scenarios, while simultaneously\nenriching the supervisory signal through mutual knowledge transfer among\nvarious tasks. Extensive experiments and in-depth robustness analyses across\nfour common recommendation tasks on four large public data sets substantiate\nthe effectiveness and superiority of our framework.\n","authors":["Chuang Zhao","Xing Su","Ming He","Hongke Zhao","Jianping Fan","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2410.20642v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2405.14312v2","updated":"2024-10-28T19:33:30Z","published":"2024-05-23T08:32:58Z","title":"Improving Gloss-free Sign Language Translation by Reducing\n Representation Density","summary":" Gloss-free sign language translation (SLT) aims to develop well-performing\nSLT systems with no requirement for the costly gloss annotations, but currently\nstill lags behind gloss-based approaches significantly. In this paper, we\nidentify a representation density problem that could be a bottleneck in\nrestricting the performance of gloss-free SLT. 
Specifically, the representation\ndensity problem means that the visual representations of semantically\ndistinct sign gestures tend to be closely packed together in feature space,\nwhich makes gloss-free methods struggle with distinguishing different sign\ngestures and suffer from a sharp performance drop. To address the\nrepresentation density problem, we introduce a simple but effective contrastive\nlearning strategy, namely SignCL, which encourages gloss-free models to learn\nmore discriminative feature representation in a self-supervised manner. Our\nexperiments demonstrate that the proposed SignCL can significantly reduce the\nrepresentation density and improve performance across various translation\nframeworks. Specifically, SignCL improves the BLEU\nscore of the Sign Language Transformer and GFSLT-VLP on the CSL-Daily dataset\nby 39% and 46%, respectively, without any increase in model parameters.\nCompared to Sign2GPT, a state-of-the-art method based on large-scale\npre-trained vision and language models, SignCL achieves better performance with\nonly 35% of its parameters. Implementation and Checkpoints are available at\nhttps://github.com/JinhuiYE/SignCL.\n","authors":["Jinhui Ye","Xing Wang","Wenxiang Jiao","Junwei Liang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2405.14312v2.pdf","comment":"Accepted at NeurIPS'24; Representation Density Problem and\n Performance Drop in Gloss-free SLT"},{"id":"http://arxiv.org/abs/2410.21478v1","updated":"2024-10-28T19:32:17Z","published":"2024-10-28T19:32:17Z","title":"Knowledge Distillation for Real-Time Classification of Early Media in\n Voice Communications","summary":" This paper investigates the industrial setting of real-time classification of\nearly media exchanged during the initialization phase of voice calls. We\nexplore the application of state-of-the-art audio tagging models and highlight\nsome limitations when applied to the classification of early media. While most\nexisting approaches leverage convolutional neural networks, we propose a novel\napproach with low resource requirements based on gradient-boosted trees. Our\napproach not only demonstrates a substantial improvement in runtime\nperformance, but also exhibits comparable accuracy. We show that leveraging\nknowledge distillation and class aggregation techniques to train a simpler and\nsmaller model accelerates the classification of early media in voice calls. We\nprovide a detailed analysis of the results on a proprietary and publicly\navailable dataset, regarding accuracy and runtime performance. We additionally\nreport a case study of the achieved performance improvements at a regional data\ncenter in India.\n","authors":["Kemal Altwlkany","Hadžem Hadžić","Amar Kurić","Emanuel Lacic"],"pdf_url":"https://arxiv.org/pdf/2410.21478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21269v1","updated":"2024-10-28T17:58:15Z","published":"2024-10-28T17:58:15Z","title":"OmniSep: Unified Omni-Modality Sound Separation with Query-Mixup","summary":" Scaling up has brought tremendous success in the fields of vision and\nlanguage in recent years. When it comes to audio, however, researchers\nencounter a major challenge in scaling up the training data, as most natural\naudio contains diverse interfering signals. 
To address this limitation, we\nintroduce Omni-modal Sound Separation (OmniSep), a novel framework capable of\nisolating clean soundtracks based on omni-modal queries, encompassing both\nsingle-modal and multi-modal composed queries. Specifically, we introduce the\nQuery-Mixup strategy, which blends query features from different modalities\nduring training. This enables OmniSep to optimize multiple modalities\nconcurrently, effectively bringing all modalities under a unified framework for\nsound separation. We further enhance this flexibility by allowing queries to\ninfluence sound separation positively or negatively, facilitating the retention\nor removal of specific sounds as desired. Finally, OmniSep employs a\nretrieval-augmented approach known as Query-Aug, which enables open-vocabulary\nsound separation. Experimental evaluations on MUSIC, VGGSOUND-CLEAN+, and\nMUSIC-CLEAN+ datasets demonstrate the effectiveness of OmniSep, achieving\nstate-of-the-art performance in text-, image-, and audio-queried sound\nseparation tasks. For samples and further information, please visit the demo\npage at \\url{https://omnisep.github.io/}.\n","authors":["Xize Cheng","Siqi Zheng","Zehan Wang","Minghui Fang","Ziang Zhang","Rongjie Huang","Ziyang Ma","Shengpeng Ji","Jialong Zuo","Tao Jin","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.21269v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2410.21061v1","updated":"2024-10-28T14:22:08Z","published":"2024-10-28T14:22:08Z","title":"Kandinsky 3: Text-to-Image Synthesis for Multifunctional Generative\n Framework","summary":" Text-to-image (T2I) diffusion models are popular for introducing image\nmanipulation methods, such as editing, image fusion, inpainting, etc. At the\nsame time, image-to-video (I2V) and text-to-video (T2V) models are also built\non top of T2I models. We present Kandinsky 3, a novel T2I model based on latent\ndiffusion, achieving a high level of quality and photorealism. The key feature\nof the new architecture is the simplicity and efficiency of its adaptation for\nmany types of generation tasks. We extend the base T2I model for various\napplications and create a multifunctional generation system that includes\ntext-guided inpainting/outpainting, image fusion, text-image fusion, image\nvariations generation, I2V and T2V generation. We also present a distilled\nversion of the T2I model, which performs inference in 4 steps of the reverse\nprocess, running 3 times faster than the base model without reducing image\nquality. We deployed a user-friendly demo system in which all the features can\nbe tested in the public domain. Additionally, we released the source code and\ncheckpoints for the Kandinsky 3 and extended models. Human evaluations show that Kandinsky\n3 demonstrates one of the highest quality scores among open-source generation\nsystems.\n","authors":["Vladimir Arkhipkin","Viacheslav Vasilev","Andrei Filatov","Igor Pavlov","Julia Agafonova","Nikolai Gerasimenko","Anna Averchenkova","Evelina Mironova","Anton Bukashkin","Konstantin Kulikov","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2410.21061v1.pdf","comment":"Accepted for EMNLP 2024 (Demo track)"},{"id":"http://arxiv.org/abs/2410.21029v1","updated":"2024-10-28T13:51:03Z","published":"2024-10-28T13:51:03Z","title":"FairStream: Fair Multimedia Streaming Benchmark for Reinforcement\n Learning Agents","summary":" Multimedia streaming accounts for the majority of traffic in today's\ninternet. 
Mechanisms like adaptive bitrate streaming control the bitrate of a\nstream based on the estimated bandwidth, ideally resulting in smooth playback\nand a good Quality of Experience (QoE). However, selecting the optimal bitrate\nis challenging under volatile network conditions. This motivated researchers to\ntrain Reinforcement Learning (RL) agents for multimedia streaming. The\nconsidered training environments are often simplified, leading to promising\nresults with limited applicability. Additionally, the QoE fairness across\nmultiple streams is seldom considered by recent RL approaches. With this work,\nwe propose a novel multi-agent environment that comprises multiple challenges\nof fair multimedia streaming: partial observability, multiple objectives, agent\nheterogeneity and asynchronicity. We provide and analyze baseline approaches\nacross five different traffic classes to gain detailed insights into the\nbehavior of the considered agents, and show that the commonly used Proximal\nPolicy Optimization (PPO) algorithm is outperformed by a simple greedy\nheuristic. Future work includes the adaptation of multi-agent RL algorithms and\nfurther expansions of the environment.\n","authors":["Jannis Weil","Jonas Ringsdorf","Julian Barthel","Yi-Ping Phoebe Chen","Tobias Meuser"],"pdf_url":"https://arxiv.org/pdf/2410.21029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18709v2","updated":"2024-10-28T12:19:39Z","published":"2023-10-28T13:37:52Z","title":"Audio-Visual Instance Segmentation","summary":" In this paper, we propose a new multi-modal task, termed audio-visual\ninstance segmentation (AVIS), which aims to simultaneously identify, segment\nand track individual sounding object instances in audible videos. To facilitate\nthis research, we introduce a high-quality benchmark named AVISeg, containing\nover 90K instance masks from 26 semantic categories in 926 long videos.\nAdditionally, we propose a strong baseline model for this task. Our model first\nlocalizes sound source within each frame, and condenses object-specific\ncontexts into concise tokens. Then it builds long-range audio-visual\ndependencies between these tokens using window-based attention, and tracks\nsounding objects among the entire video sequences. Extensive experiments reveal\nthat our method performs best on AVISeg, surpassing the existing methods from\nrelated tasks. We further conduct the evaluation on several multi-modal large\nmodels; however, they exhibits subpar performance on instance-level sound\nsource localization and temporal perception. We expect that AVIS will inspire\nthe community towards a more comprehensive multi-modal understanding.\n","authors":["Ruohao Guo","Xianghua Ying","Yaru Chen","Dantong Niu","Guangyao Li","Liao Qu","Yanyu Qi","Bowei Xing","Wenzhen Yue","Ji Shi","Qixun Wang","Peiliang Zhang","Buwen Liang"],"pdf_url":"https://arxiv.org/pdf/2310.18709v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06171v3","updated":"2024-10-28T12:11:12Z","published":"2023-12-11T07:20:42Z","title":"Joint Explicit and Implicit Cross-Modal Interaction Network for Anterior\n Chamber Inflammation Diagnosis","summary":" Uveitis demands the precise diagnosis of anterior chamber inflammation (ACI)\nfor optimal treatment. However, current diagnostic methods only rely on a\nlimited single-modal disease perspective, which leads to poor performance. In\nthis paper, we investigate a promising yet challenging way to fuse multimodal\ndata for ACI diagnosis. 
Notably, existing fusion paradigms focus on empowering\nimplicit modality interactions (i.e., self-attention and its variants), but\nneglect to inject explicit modality interactions, especially from clinical\nknowledge and imaging property. To this end, we propose a jointly Explicit and\nimplicit Cross-Modal Interaction Network (EiCI-Net) for Anterior Chamber\nInflammation Diagnosis that uses anterior segment optical coherence tomography\n(AS-OCT) images, slit-lamp images, and clinical data jointly. Specifically, we\nfirst develop CNN-Based Encoders and Tabular Processing Module (TPM) to extract\nefficient feature representations in different modalities. Then, we devise an\nExplicit Cross-Modal Interaction Module (ECIM) to generate attention maps as a\nkind of explicit clinical knowledge based on the tabular feature maps, then\nintegrated them into the slit-lamp feature maps, allowing the CNN-Based Encoder\nto focus on more effective informativeness of the slit-lamp images. After that,\nthe Implicit Cross-Modal Interaction Module (ICIM), a transformer-based\nnetwork, further implicitly enhances modality interactions. Finally, we\nconstruct a considerable real-world dataset from our collaborative hospital and\nconduct sufficient experiments to demonstrate the superior performance of our\nproposed EiCI-Net compared with the state-of-the-art classification methods in\nvarious metrics.\n","authors":["Qian Shao","Ye Dai","Haochao Ying","Kan Xu","Jinhong Wang","Wei Chi","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2312.06171v3.pdf","comment":"IEEE MedAI 2024"},{"id":"http://arxiv.org/abs/2410.20898v1","updated":"2024-10-28T10:26:19Z","published":"2024-10-28T10:26:19Z","title":"Diff-Instruct*: Towards Human-Preferred One-step Text-to-image\n Generative Models","summary":" In this paper, we introduce the Diff-Instruct*(DI*), a data-free approach for\nbuilding one-step text-to-image generative models that align with human\npreference while maintaining the ability to generate highly realistic images.\nWe frame human preference alignment as online reinforcement learning using\nhuman feedback (RLHF), where the goal is to maximize the reward function while\nregularizing the generator distribution to remain close to a reference\ndiffusion process. Unlike traditional RLHF approaches, which rely on the KL\ndivergence for regularization, we introduce a novel score-based divergence\nregularization, which leads to significantly better performances. Although the\ndirect calculation of this divergence remains intractable, we demonstrate that\nwe can efficiently compute its \\emph{gradient} by deriving an equivalent yet\ntractable loss function. Remarkably, with Stable Diffusion V1.5 as the\nreference diffusion model, DI* outperforms \\emph{all} previously leading models\nby a large margin. When using the 0.6B PixelArt-$\\alpha$ model as the reference\ndiffusion, DI* achieves a new record Aesthetic Score of 6.30 and an Image\nReward of 1.31 with only a single generation step, almost doubling the scores\nof the rest of the models with similar sizes. It also achieves an HPSv2 score\nof 28.70, establishing a new state-of-the-art benchmark. 
We also observe that\nDI* can improve the layout and enrich the colors of generated images.\n","authors":["Weijian Luo","Colin Zhang","Debing Zhang","Zhengyang Geng"],"pdf_url":"https://arxiv.org/pdf/2410.20898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20855v1","updated":"2024-10-28T09:19:28Z","published":"2024-10-28T09:19:28Z","title":"ByteNet: Rethinking Multimedia File Fragment Classification through\n Visual Perspectives","summary":" Multimedia file fragment classification (MFFC) aims to identify file fragment\ntypes, e.g., image/video, audio, and text without system metadata. It is of\nvital importance in multimedia storage and communication. Existing MFFC methods\ntypically treat fragments as 1D byte sequences and emphasize the relations\nbetween separate bytes (interbytes) for classification. However, the more\ninformative relations inside bytes (intrabytes) are overlooked and seldom\ninvestigated. By looking inside bytes, the bit-level details of file fragments\ncan be accessed, enabling a more accurate classification. Motivated by this, we\nfirst propose Byte2Image, a novel visual representation model that incorporates\npreviously overlooked intrabyte information into file fragments and\nreinterprets these fragments as 2D grayscale images. This model involves a\nsliding byte window to reveal the intrabyte information and a rowwise stacking\nof intrabyte ngrams for embedding fragments into a 2D space. Thus, complex\ninterbyte and intrabyte correlations can be mined simultaneously using powerful\nvision networks. Additionally, we propose an end-to-end dual-branch network\nByteNet to enhance robust correlation mining and feature representation.\nByteNet makes full use of the raw 1D byte sequence and the converted 2D image\nthrough a shallow byte branch feature extraction (BBFE) and a deep image branch\nfeature extraction (IBFE) network. In particular, the BBFE, composed of a\nsingle fully-connected layer, adaptively recognizes the co-occurrence of\nseveral some specific bytes within the raw byte sequence, while the IBFE, built\non a vision Transformer, effectively mines the complex interbyte and intrabyte\ncorrelations from the converted image. Experiments on the two representative\nbenchmarks, including 14 cases, validate that our proposed method outperforms\nstate-of-the-art approaches on different cases by up to 12.2%.\n","authors":["Wenyang Liu","Kejun Wu","Tianyi Liu","Yi Wang","Kim-Hui Yap","Lap-Pui Chau"],"pdf_url":"https://arxiv.org/pdf/2410.20855v1.pdf","comment":"Accepted in TMM"},{"id":"http://arxiv.org/abs/2404.13289v2","updated":"2024-10-28T03:35:22Z","published":"2024-04-20T06:32:00Z","title":"Double Mixture: Towards Continual Event Detection from Speech","summary":" Speech event detection is crucial for multimedia retrieval, involving the\ntagging of both semantic and acoustic events. Traditional ASR systems often\noverlook the interplay between these events, focusing solely on content, even\nthough the interpretation of dialogue can vary with environmental context. This\npaper tackles two primary challenges in speech event detection: the continual\nintegration of new events without forgetting previous ones, and the\ndisentanglement of semantic from acoustic events. We introduce a new task,\ncontinual event detection from speech, for which we also provide two benchmark\ndatasets. To address the challenges of catastrophic forgetting and effective\ndisentanglement, we propose a novel method, 'Double Mixture.' 
This method\nmerges speech expertise with robust memory mechanisms to enhance adaptability\nand prevent forgetting. Our comprehensive experiments show that this task\npresents significant challenges that are not effectively addressed by current\nstate-of-the-art methods in either computer vision or natural language\nprocessing. Our approach achieves the lowest rates of forgetting and the\nhighest levels of generalization, proving robust across various continual\nlearning sequences. Our code and data are available at\nhttps://anonymous.4open.science/status/Continual-SpeechED-6461.\n","authors":["Jingqi Kang","Tongtong Wu","Jinming Zhao","Guitao Wang","Yinwei Wei","Hao Yang","Guilin Qi","Yuan-Fang Li","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2404.13289v2.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.14703v2","updated":"2024-10-28T03:32:28Z","published":"2024-09-23T04:49:08Z","title":"MemeCLIP: Leveraging CLIP Representations for Multimodal Meme\n Classification","summary":" The complexity of text-embedded images presents a formidable challenge in\nmachine learning given the need for multimodal understanding of multiple\naspects of expression conveyed by them. While previous research in multimodal\nanalysis has primarily focused on singular aspects such as hate speech and its\nsubclasses, this study expands this focus to encompass multiple aspects of\nlinguistics: hate, targets of hate, stance, and humor. We introduce a novel\ndataset PrideMM comprising 5,063 text-embedded images associated with the\nLGBTQ+ Pride movement, thereby addressing a serious gap in existing resources.\nWe conduct extensive experimentation on PrideMM by using unimodal and\nmultimodal baseline methods to establish benchmarks for each task.\nAdditionally, we propose a novel framework MemeCLIP for efficient downstream\nlearning while preserving the knowledge of the pre-trained CLIP model. The\nresults of our experiments show that MemeCLIP achieves superior performance\ncompared to previously proposed frameworks on two real-world datasets. We\nfurther compare the performance of MemeCLIP and zero-shot GPT-4 on the hate\nclassification task. Finally, we discuss the shortcomings of our model by\nqualitatively analyzing misclassified samples. Our code and dataset are\npublicly available at: https://github.com/SiddhantBikram/MemeCLIP.\n","authors":["Siddhant Bikram Shah","Shuvam Shiwakoti","Maheep Chaudhary","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.14703v2.pdf","comment":"Accepted to EMNLP 2024 (Main)"},{"id":"http://arxiv.org/abs/2410.20670v1","updated":"2024-10-28T02:05:10Z","published":"2024-10-28T02:05:10Z","title":"Segmenting Watermarked Texts From Language Models","summary":" Watermarking is a technique that involves embedding nearly unnoticeable\nstatistical signals within generated content to help trace its source. This\nwork focuses on a scenario where an untrusted third-party user sends prompts to\na trusted language model (LLM) provider, who then generates a text from their\nLLM with a watermark. This setup makes it possible for a detector to later\nidentify the source of the text if the user publishes it. The user can modify\nthe generated text by substitutions, insertions, or deletions. Our objective is\nto develop a statistical method to detect if a published text is LLM-generated\nfrom the perspective of a detector. 
We further propose a methodology to segment\nthe published text into watermarked and non-watermarked sub-strings. The\nproposed approach is built upon randomization tests and change point detection\ntechniques. We demonstrate that our method ensures Type I and Type II error\ncontrol and can accurately identify watermarked sub-strings by finding the\ncorresponding change point locations. To validate our technique, we apply it to\ntexts generated by several language models with prompts extracted from Google's\nC4 dataset and obtain encouraging numerical results. We release all code\npublicly at https://github.com/doccstat/llm-watermark-cpd.\n","authors":["Xingchi Li","Guanxun Li","Xianyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.20670v1.pdf","comment":"25 pages, 12 figures, 2 tables, NeurIPS 2024"}]},"2024-10-27T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.03064v2","updated":"2024-10-27T22:56:17Z","published":"2024-10-04T01:04:41Z","title":"Geometric Collaborative Filtering with Convergence","summary":" Latent variable collaborative filtering methods have been a standard approach\nto modelling user-click interactions due to their simplicity and effectiveness.\nHowever, there is limited work on analyzing the mathematical properties of\nthese methods in particular on preventing the overfitting towards the identity,\nand such methods typically utilize loss functions that overlook the geometry\nbetween items. In this work, we introduce a notion of generalization gap in\ncollaborative filtering and analyze this with respect to latent collaborative\nfiltering models. We present a geometric upper bound that gives rise to loss\nfunctions, and a way to meaningfully utilize the geometry of item-metadata to\nimprove recommendations. We show how these losses can be minimized and gives\nthe recipe to a new latent collaborative filtering algorithm, which we refer to\nas GeoCF, due to the geometric nature of our results. We then show\nexperimentally that our proposed GeoCF algorithm can outperform other all\nexisting methods on the Movielens20M and Netflix datasets, as well as two\nlarge-scale internal datasets. In summary, our work proposes a theoretically\nsound method which paves a way to better understand generalization of\ncollaborative filtering at large.\n","authors":["Hisham Husain","Julien Monteil"],"pdf_url":"https://arxiv.org/pdf/2410.03064v2.pdf","comment":"13 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2410.20598v1","updated":"2024-10-27T21:12:12Z","published":"2024-10-27T21:12:12Z","title":"R^3AG: First Workshop on Refined and Reliable Retrieval Augmented\n Generation","summary":" Retrieval-augmented generation (RAG) has gained wide attention as the key\ncomponent to improve generative models with external knowledge augmentation\nfrom information retrieval. It has shown great prominence in enhancing the\nfunctionality and performance of large language model (LLM)-based applications.\nHowever, with the comprehensive application of RAG, more and more problems and\nlimitations have been identified, thus urgently requiring further fundamental\nexploration to improve current RAG frameworks. This workshop aims to explore in\ndepth how to conduct refined and reliable RAG for downstream AI tasks.\n To this end, we propose to organize the first R3AG workshop at SIGIR-AP 2024\nto call for participants to re-examine and formulate the basic principles and\npractical implementation of refined and reliable RAG. 
The workshop serves as a\nplatform for both academia and industry researchers to conduct discussions,\nshare insights, and foster research to build the next generation of RAG\nsystems. Participants will engage in discussions and presentations focusing on\nfundamental challenges, cutting-edge research, and potential pathways to\nimprove RAG. At the end of the workshop, we aim to have a clearer understanding\nof how to improve the reliability and applicability of RAG with more robust\ninformation retrieval and language generation.\n","authors":["Zihan Wang","Xuri Ge","Joemon M. Jose","Haitao Yu","Weizhi Ma","Zhaochun Ren","Xin Xin"],"pdf_url":"https://arxiv.org/pdf/2410.20598v1.pdf","comment":"R^3AG workshop overview at SIGIR-AP 2024"},{"id":"http://arxiv.org/abs/2410.20580v1","updated":"2024-10-27T20:21:14Z","published":"2024-10-27T20:21:14Z","title":"Coherence-guided Preference Disentanglement for Cross-domain\n Recommendations","summary":" Discovering user preferences across different domains is pivotal in\ncross-domain recommendation systems, particularly when platforms lack\ncomprehensive user-item interactive data. The limited presence of shared users\noften hampers the effective modeling of common preferences. While leveraging\nshared items' attributes, such as category and popularity, can enhance\ncross-domain recommendation performance, the scarcity of shared items between\ndomains has limited research in this area. To address this, we propose a\nCoherence-guided Preference Disentanglement (CoPD) method aimed at improving\ncross-domain recommendation by i) explicitly extracting shared item attributes\nto guide the learning of shared user preferences and ii) disentangling these\npreferences to identify specific user interests transferred between domains.\nCoPD introduces coherence constraints on item embeddings of shared and specific\ndomains, aiding in extracting shared attributes. Moreover, it utilizes these\nattributes to guide the disentanglement of user preferences into separate\nembeddings for interest and conformity through a popularity-weighted loss.\nExperiments conducted on real-world datasets demonstrate the superior\nperformance of our proposed CoPD over existing competitive baselines,\nhighlighting its effectiveness in enhancing cross-domain recommendation\nperformance.\n","authors":["Zongyi Xiang","Yan Zhang","Lixin Duan","Hongzhi Yin","Ivor W. Tsang"],"pdf_url":"https://arxiv.org/pdf/2410.20580v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2410.20540v1","updated":"2024-10-27T18:15:18Z","published":"2024-10-27T18:15:18Z","title":"Automatic Estimation of Singing Voice Musical Dynamics","summary":" Musical dynamics form a core part of expressive singing voice performances.\nHowever, automatic analysis of musical dynamics for singing voice has received\nlimited attention partly due to the scarcity of suitable datasets and a lack of\nclear evaluation frameworks. To address this challenge, we propose a\nmethodology for dataset curation. Employing the proposed methodology, we\ncompile a dataset comprising 509 musical dynamics annotated singing voice\nperformances, aligned with 163 score files, leveraging state-of-the-art source\nseparation and alignment techniques. The scores are sourced from the OpenScore\nLieder corpus of romantic-era compositions, widely known for its wealth of\nexpressive annotations. 
Utilizing the curated dataset, we train a multi-head\nattention based CNN model with varying window sizes to evaluate the\neffectiveness of estimating musical dynamics. We explored two distinct\nperceptually motivated input representations for the model training: log-Mel\nspectrum and bark-scale based features. For testing, we manually curate another\ndataset of 25 musical dynamics annotated performances in collaboration with a\nprofessional vocalist. We conclude through our experiments that bark-scale\nbased features outperform log-Mel-features for the task of singing voice\ndynamics prediction. The dataset along with the code is shared publicly for\nfurther research on the topic.\n","authors":["Jyoti Narang","Nazif Can Tamer","Viviana De La Vega","Xavier Serra"],"pdf_url":"https://arxiv.org/pdf/2410.20540v1.pdf","comment":"To be published in ISMIR 2024, 6 pages"},{"id":"http://arxiv.org/abs/2306.09604v2","updated":"2024-10-27T15:25:59Z","published":"2023-06-16T03:16:48Z","title":"Personalized Summarization of Scientific Scholarly Texts","summary":" In this paper, we present a proposal for an unsupervised algorithm, P-Summ,\nthat generates an extractive summary of scientific scholarly text to meet the\npersonal knowledge needs of the user. The method delves into the latent\nsemantic space of the document exposed by Weighted Non-negative Matrix\nFactorization, and scores sentences in consonance with the knowledge needs of\nthe user. The novelty of the algorithm lies in its ability to include desired\nknowledge and eliminate unwanted knowledge in the personal summary.\n We also propose a multi-granular evaluation framework, which assesses the\nquality of generated personal summaries at three levels of granularity -\nsentence, terms and semantic. The framework uses system generated generic\nsummary instead of human generated summary as gold standard for evaluating the\nquality of personal summary generated by the algorithm. The effectiveness of\nthe algorithm at the semantic level is evaluated by taking into account the\nreference summary and the knowledge signals. We evaluate the performance of\nP-Summ algorithm over four data-sets consisting of scientific articles. Our\nempirical investigations reveal that the proposed method has the capability to\nmeet negative (or positive) knowledge preferences of the user.\n","authors":["Alka Khurana","Vasudha Bhatnagar","Vikas Kumar"],"pdf_url":"https://arxiv.org/pdf/2306.09604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20401v1","updated":"2024-10-27T10:24:23Z","published":"2024-10-27T10:24:23Z","title":"Prototypical Extreme Multi-label Classification with a Dynamic Margin\n Loss","summary":" Extreme Multi-label Classification (XMC) methods predict relevant labels for\na given query in an extremely large label space. Recent works in XMC address\nthis problem using deep encoders that project text descriptions to an embedding\nspace suitable for recovering the closest labels. However, learning deep models\ncan be computationally expensive in large output spaces, resulting in a\ntrade-off between high performing brute-force approaches and efficient\nsolutions. In this paper, we propose PRIME, a XMC method that employs a novel\nprototypical contrastive learning technique to reconcile efficiency and\nperformance surpassing brute-force approaches. We frame XMC as a\ndata-to-prototype prediction task where label prototypes aggregate information\nfrom related queries. 
More precisely, we use a shallow transformer encoder that\nwe coin as Label Prototype Network, which enriches label representations by\naggregating text-based embeddings, label centroids and learnable free vectors.\nWe jointly train a deep encoder and the Label Prototype Network using an\nadaptive triplet loss objective that better adapts to the high granularity and\nambiguity of extreme label spaces. PRIME achieves state-of-the-art results in\nseveral public benchmarks of different sizes and domains, while keeping the\nmodel efficient.\n","authors":["Kunal Dahiya","Diego Ortego","David Jiménez"],"pdf_url":"https://arxiv.org/pdf/2410.20401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20381v1","updated":"2024-10-27T09:12:51Z","published":"2024-10-27T09:12:51Z","title":"Efficient and Effective Retrieval of Dense-Sparse Hybrid Vectors using\n Graph-based Approximate Nearest Neighbor Search","summary":" ANNS for embedded vector representations of texts is commonly used in\ninformation retrieval, with two important information representations being\nsparse and dense vectors. While it has been shown that combining these\nrepresentations improves accuracy, the current method of conducting sparse and\ndense vector searches separately suffers from low scalability and high system\ncomplexity. Alternatively, building a unified index faces challenges with\naccuracy and efficiency. To address these issues, we propose a graph-based ANNS\nalgorithm for dense-sparse hybrid vectors. Firstly, we propose a distribution\nalignment method to improve accuracy, which pre-samples dense and sparse\nvectors to analyze their distance distribution statistic, resulting in a\n1%$\\sim$9% increase in accuracy. Secondly, to improve efficiency, we design an\nadaptive two-stage computation strategy that initially computes dense distances\nonly and later computes hybrid distances. Further, we prune the sparse vectors\nto speed up the calculation. Compared to naive implementation, we achieve\n$\\sim2.1\\times$ acceleration. Thorough experiments show that our algorithm\nachieves 8.9x$\\sim$11.7x throughput at equal accuracy compared to existing\nhybrid vector search algorithms.\n","authors":["Haoyu Zhang","Jun Liu","Zhenhua Zhu","Shulin Zeng","Maojia Sheng","Tao Yang","Guohao Dai","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2410.20381v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2410.20352v1","updated":"2024-10-27T06:50:43Z","published":"2024-10-27T06:50:43Z","title":"An approach to hummed-tune and song sequences matching","summary":" Melody stuck in your head, also known as \"earworm\", is tough to get rid of,\nunless you listen to it again or sing it out loud. But what if you can not find\nthe name of that song? It must be an intolerable feeling. Recognizing a song\nname base on humming sound is not an easy task for a human being and should be\ndone by machines. However, there is no research paper published about hum tune\nrecognition. Adapting from Hum2Song Zalo AI Challenge 2021 - a competition\nabout querying the name of a song by user's giving humming tune, which is\nsimilar to Google's Hum to Search. This paper covers details about the\npre-processed data from the original type (mp3) to usable form for training and\ninference. In training an embedding model for the feature extraction phase, we\nran experiments with some states of the art, such as ResNet, VGG, AlexNet,\nMobileNetV2. 
And for the inference phase, we use the Faiss module to\neffectively search for a song that matched the sequence of humming sound. The\nresult comes at nearly 94\\% in MRR@10 metric on the public test set, along with\nthe top 1 result on the public leaderboard.\n","authors":["Loc Bao Pham","Huong Hoang Luong","Phu Thien Tran","Phuc Hoang Ngo","Vi Hoang Nguyen","Thinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2410.20352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20301v1","updated":"2024-10-27T00:49:52Z","published":"2024-10-27T00:49:52Z","title":"WindTunnel -- A Framework for Community Aware Sampling of Large Corpora","summary":" Conducting comprehensive information retrieval experiments, such as in search\nor retrieval augmented generation, often comes with high computational costs.\nThis is because evaluating a retrieval algorithm requires indexing the entire\ncorpus, which is significantly larger than the set of (query, result) pairs\nunder evaluation. This issue is especially pronounced in big data and neural\nretrieval, where indexing becomes increasingly time-consuming and complex. In\nthis paper, we present WindTunnel, a novel framework developed at Yext to\ngenerate representative samples of large corpora, enabling efficient end-to-end\ninformation retrieval experiments. By preserving the community structure of the\ndataset, WindTunnel overcomes limitations in current sampling methods,\nproviding more accurate evaluations.\n","authors":["Michael Iannelli"],"pdf_url":"https://arxiv.org/pdf/2410.20301v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2405.12221v2","updated":"2024-10-27T21:47:14Z","published":"2024-05-20T17:59:59Z","title":"Images that Sound: Composing Images and Sounds on a Single Canvas","summary":" Spectrograms are 2D representations of sound that look very different from\nthe images found in our visual world. And natural images, when played as\nspectrograms, make unnatural sounds. In this paper, we show that it is possible\nto synthesize spectrograms that simultaneously look like natural images and\nsound like natural audio. We call these visual spectrograms images that sound.\nOur approach is simple and zero-shot, and it leverages pre-trained\ntext-to-image and text-to-spectrogram diffusion models that operate in a shared\nlatent space. During the reverse process, we denoise noisy latents with both\nthe audio and image diffusion models in parallel, resulting in a sample that is\nlikely under both models. Through quantitative evaluations and perceptual\nstudies, we find that our method successfully generates spectrograms that align\nwith a desired audio prompt while also taking the visual appearance of a\ndesired image prompt. Please see our project page for video results:\nhttps://ificl.github.io/images-that-sound/\n","authors":["Ziyang Chen","Daniel Geng","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2405.12221v2.pdf","comment":"Accepted to NeurIPS 2024. Project site:\n https://ificl.github.io/images-that-sound/"},{"id":"http://arxiv.org/abs/2410.20518v1","updated":"2024-10-27T17:00:55Z","published":"2024-10-27T17:00:55Z","title":"MidiTok Visualizer: a tool for visualization and analysis of tokenized\n MIDI symbolic music","summary":" Symbolic music research plays a crucial role in music-related machine\nlearning, but MIDI data can be complex for those without musical expertise. 
To\naddress this issue, we present MidiTok Visualizer, a web application designed\nto facilitate the exploration and visualization of various MIDI tokenization\nmethods from the MidiTok Python package. MidiTok Visualizer offers numerous\ncustomizable parameters, enabling users to upload MIDI files to visualize\ntokenized data alongside an interactive piano roll.\n","authors":["Michał Wiszenko","Kacper Stefański","Piotr Malesa","Łukasz Pokorzyński","Mateusz Modrzejewski"],"pdf_url":"https://arxiv.org/pdf/2410.20518v1.pdf","comment":"in Extended Abstracts for the Late-Breaking Demo Sessionof the 25th\n Int. Society for Music Information Retrieval Conf., San Francisco, United\n States, 2024"},{"id":"http://arxiv.org/abs/2403.08773v2","updated":"2024-10-27T06:01:49Z","published":"2024-01-18T12:45:25Z","title":"Veagle: Advancements in Multimodal Representation Learning","summary":" Lately, researchers in artificial intelligence have been really interested in\nhow language and vision come together, giving rise to the development of\nmultimodal models that aim to seamlessly integrate textual and visual\ninformation. Multimodal models, an extension of Large Language Models (LLMs),\nhave exhibited remarkable capabilities in addressing a diverse array of tasks,\nranging from image captioning and visual question answering (VQA) to visual\ngrounding. While these models have showcased significant advancements,\nchallenges persist in accurately interpreting images and answering the\nquestion, a common occurrence in real-world scenarios. This paper introduces a\nnovel approach to enhance the multimodal capabilities of existing models. In\nresponse to the limitations observed in current Vision Language Models (VLMs)\nand Multimodal Large Language Models (MLLMs), our proposed model Veagle,\nincorporates a unique mechanism inspired by the successes and insights of\nprevious works. Veagle leverages a dynamic mechanism to project encoded visual\ninformation directly into the language model. This dynamic approach allows for\na more nuanced understanding of intricate details present in visual contexts.\nTo validate the effectiveness of Veagle, we conduct comprehensive experiments\non benchmark datasets, emphasizing tasks such as visual question answering and\nimage understanding. Our results indicate a improvement of 5-6 \\% in\nperformance, with Veagle outperforming existing models by a notable margin. The\noutcomes underscore the model's versatility and applicability beyond\ntraditional benchmarks.\n","authors":["Rajat Chawla","Arkajit Datta","Tushar Verma","Adarsh Jha","Anmol Gautam","Ayush Vatsal","Sukrit Chaterjee","Mukunda NS","Ishaan Bhola"],"pdf_url":"https://arxiv.org/pdf/2403.08773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00320v3","updated":"2024-10-27T03:52:29Z","published":"2024-06-01T06:40:22Z","title":"Frieren: Efficient Video-to-Audio Generation Network with Rectified Flow\n Matching","summary":" Video-to-audio (V2A) generation aims to synthesize content-matching audio\nfrom silent video, and it remains challenging to build V2A models with high\ngeneration quality, efficiency, and visual-audio temporal synchrony. We propose\nFrieren, a V2A model based on rectified flow matching. Frieren regresses the\nconditional transport vector field from noise to spectrogram latent with\nstraight paths and conducts sampling by solving ODE, outperforming\nautoregressive and score-based models in terms of audio quality. 
By employing a\nnon-autoregressive vector field estimator based on a feed-forward transformer\nand channel-level cross-modal feature fusion with strong temporal alignment,\nour model generates audio that is highly synchronized with the input video.\nFurthermore, through reflow and one-step distillation with guided vector field,\nour model can generate decent audio in a few, or even only one sampling step.\nExperiments indicate that Frieren achieves state-of-the-art performance in both\ngeneration quality and temporal alignment on VGGSound, with alignment accuracy\nreaching 97.22%, and 6.2% improvement in inception score over the strong\ndiffusion-based baseline. Audio samples are available at\nhttp://frieren-v2a.github.io.\n","authors":["Yongqi Wang","Wenxiang Guo","Rongjie Huang","Jiawei Huang","Zehan Wang","Fuming You","Ruiqi Li","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.00320v3.pdf","comment":"accepted by NeurIPS 2024"}]}}
\ No newline at end of file
diff --git a/favicon.ico b/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5
GIT binary patch
literal 15086
[15086 bytes of binary icon data omitted]

diff --git a/index.html b/index.html
+ MyArxiv
+ Computation and Language 51
+ ♻ ☆ Regression-aware Inference with LLMs EMNLP
+ Large language models (LLMs) have shown strong results on a range of applications, including regression and scoring tasks. Typically, one obtains outputs from an LLM via autoregressive sampling from the model's output distribution. We show that this inference strategy can be sub-optimal for common regression and scoring evaluation metrics. As a remedy, we build on prior work on Minimum Bayes Risk decoding, and propose alternate inference strategies that estimate the Bayes-optimal solution for regression and scoring metrics in closed-form from sampled responses. We show that our proposal significantly improves over baselines across datasets and models.
+ comment: EMNLP Findings 2024
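To make the Minimum Bayes Risk idea concrete, here is a minimal sketch (an illustration, not the paper's code) of closed-form inference from samples: draw several responses, parse the numeric scores, and return the sample mean for squared-error metrics or the sample median for absolute-error metrics, which are the Bayes-optimal point estimates under the empirical sample distribution. `sample_fn` is a hypothetical one-call wrapper around the model.

    import statistics

    def mbr_regression_decode(sample_fn, prompt, n=32, metric="squared"):
        # Draw n responses and keep the ones that parse as numeric scores.
        scores = []
        for _ in range(n):
            text = sample_fn(prompt)  # hypothetical: one sampled LLM response (str)
            try:
                scores.append(float(text.strip()))
            except ValueError:
                continue
        if not scores:
            raise ValueError("no parseable scores sampled")
        # Mean minimizes expected squared error over the sampled distribution;
        # median minimizes expected absolute error.
        return statistics.mean(scores) if metric == "squared" else statistics.median(scores)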
+ ♻ ☆ HDLCopilot: Natural Language Exploration of Hardware Designs and Libraries
+ Hardware design workflows often involve working with Process Design Kits (PDKs) from various fabrication labs, each containing its own set of standard cell libraries optimized for metrics such as speed, power, or density. These libraries include multiple views for information on timing and electrical properties of cells, cell layout details, and process design rules. Engineers typically navigate between the design and the target technology to make informed decisions on different design scenarios, such as selecting specific gates for area optimization or enhancing critical path speed. Navigating this complex landscape to retrieve specific information about gates or design rules is often time-consuming and error-prone. To address this, we present HDLCopilot, a multi-agent collaborative framework powered by large language models that enables engineers to streamline interactions with hardware design and PDKs through natural language queries. HDLCopilot enables engineers to quickly access relevant information on gates and design rules, evaluate tradeoffs related to area, speed, and power in order to make informed decisions efficiently and accurately. The framework achieves an execution accuracy of 96.33% on a diverse set of complex natural language queries. HDLCopilot positions itself as a powerful assistant in hardware design workflows, enhancing productivity and reducing potential human errors.
+ comment: 7 pages, 8 figures
+ ♻ ☆ DuQuant: Distributing Outliers via Dual Transformation Makes Stronger Quantized LLMs NeurIPS 2024
+ Quantization of large language models (LLMs) faces significant challenges, particularly due to the presence of outlier activations that impede efficient low-bit representation. Traditional approaches predominantly address Normal Outliers, which are activations across all tokens with relatively large magnitudes. However, these methods struggle with smoothing Massive Outliers that display significantly larger values, which leads to severe performance degradation in low-bit quantization. In this paper, we introduce DuQuant, a novel approach that utilizes rotation and permutation transformations to more effectively mitigate both massive and normal outliers. First, DuQuant constructs a rotation matrix, using specific outlier dimensions as prior knowledge, to redistribute outliers to adjacent channels by block-wise rotation. Second, we employ a zigzag permutation to balance the distribution of outliers across blocks, thereby reducing block-wise variance. A subsequent rotation further smooths the activation landscape, enhancing model performance. DuQuant simplifies the quantization process and excels in managing outliers, outperforming state-of-the-art baselines across various sizes and types of LLMs on multiple tasks, even with 4-bit weight-activation quantization. Our code is available at https://github.com/Hsu1023/DuQuant.
+ comment: NeurIPS 2024 Oral, Website at https://duquant.github.io
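The two transformations are easy to picture with a toy sketch. The version below is an illustration, not DuQuant's actual construction (the paper builds its rotation from known outlier dimensions, while this sketch uses a random orthogonal matrix): it rotates activations within fixed-size channel blocks, then assigns channels to blocks in a zigzag over descending per-channel magnitude so high-magnitude channels are spread evenly.

    import numpy as np

    def blockwise_rotate(x, block=16, seed=0):
        # Rotate each contiguous block of channels by an orthogonal matrix,
        # spreading a large outlier channel's mass over its block neighbours.
        rng = np.random.default_rng(seed)
        out = x.copy()
        for s in range(0, x.shape[-1] - block + 1, block):
            q, _ = np.linalg.qr(rng.standard_normal((block, block)))
            out[..., s:s + block] = out[..., s:s + block] @ q
        return out

    def zigzag_blocks(channel_max, n_blocks):
        # Walk channels from largest to smallest magnitude, dealing them to
        # blocks 0..n-1 and back again, so every block receives a similar
        # share of high-magnitude channels (reducing block-wise variance).
        order = np.argsort(channel_max)[::-1]
        blocks = [[] for _ in range(n_blocks)]
        for i, ch in enumerate(order):
            b = i % (2 * n_blocks)
            blocks[b if b < n_blocks else 2 * n_blocks - 1 - b].append(ch)
        return np.concatenate([np.array(b, dtype=int) for b in blocks])

The random matrix only stands in for the paper's prior-informed rotation, but the illustrated effect is the same: rotation mixes an outlier into its block, and the zigzag deal equalizes the blocks.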
+ ♻ ☆ Introducing MAPO: Momentum-Aided Gradient Descent Prompt Optimization
+ Momentum-Aided Prompt Optimization (MAPO) enhances the efficiency and efficacy of prompt optimization for Large Language Models (LLMs). Building on ProTeGi, MAPO uses positive natural language "gradients" and a momentum-based extension to refine prompts effectively. By tracking gradient history, MAPO avoids local minima and oscillations. It also utilizes beam search and an Upper Confidence Bound (UCB) algorithm for balanced candidate expansion and selection. Benchmark testing shows that MAPO converges faster, with fewer API calls and higher F1 scores than ProTeGi, proving it to be a robust and scalable solution for automated prompt engineering in LLMs.
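The UCB selection step is standard bandit machinery and can be sketched directly. In this hedged illustration (not MAPO's exact rule), `stats`, the round counter `t`, and the exploration constant `c` are assumptions: each candidate prompt is scored by its mean reward plus an exploration bonus, and untried prompts are explored first.

    import math

    def ucb_select(candidates, stats, t, c=1.4):
        # stats: candidate -> (mean_reward, times_evaluated); t: 1-based
        # round counter used in the exploration term.
        best, best_score = None, float("-inf")
        for cand in candidates:
            mean, n = stats.get(cand, (0.0, 0))
            if n == 0:
                return cand  # explore untried prompts first
            score = mean + c * math.sqrt(math.log(t) / n)  # UCB1 bonus
            if score > best_score:
                best, best_score = cand, score
        return best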
+ ♻ ☆ Can Large Language Model Agents Simulate Human Trust Behavior? NeurIPS 2024
+ Large Language Model (LLM) agents have been increasingly adopted as simulation tools to model humans in social science and role-playing applications. However, one fundamental question remains: can LLM agents really simulate human behavior? In this paper, we focus on one critical and elemental behavior in human interactions, trust, and investigate whether LLM agents can simulate human trust behavior. We first find that LLM agents generally exhibit trust behavior, referred to as agent trust, under the framework of Trust Games, which are widely recognized in behavioral economics. Then, we discover that GPT-4 agents manifest high behavioral alignment with humans in terms of trust behavior, indicating the feasibility of simulating human trust behavior with LLM agents. In addition, we probe the biases of agent trust and differences in agent trust towards other LLM agents and humans. We also explore the intrinsic properties of agent trust under conditions including external manipulations and advanced reasoning strategies. Our study provides new insights into the behaviors of LLM agents and the fundamental analogy between LLMs and humans beyond value alignment. We further illustrate broader implications of our discoveries for applications where trust is paramount.
+ comment: Accepted to Proceedings of NeurIPS 2024. The first two authors contributed equally. 10 pages for main paper, 56 pages including appendix. Project website: https://agent-trust.camel-ai.org
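As background, the Trust Game the study instruments has a very small payoff structure, sketched below with illustrative parameters (a 10-unit endowment and a 3x multiplier are conventional in the behavioral economics literature, not necessarily the paper's exact protocol). The amount the trustor chooses to send is the standard operational measure of trust.

    def trust_game(sent, returned_frac, endowment=10.0, multiplier=3.0):
        # The trustor sends `sent` out of `endowment`; the amount is
        # multiplied on the way to the trustee, who returns a fraction
        # of the enlarged pot.
        assert 0 <= sent <= endowment and 0 <= returned_frac <= 1
        pot = sent * multiplier
        trustor_payoff = endowment - sent + returned_frac * pot
        trustee_payoff = pot - returned_frac * pot
        return trustor_payoff, trustee_payoff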
+ ♻ ☆ SelfCodeAlign: Self-Alignment for Code Generation NeurIPS 2024
+ Instruction tuning is a supervised fine-tuning approach that significantly improves the ability of large language models (LLMs) to follow human instructions. We propose SelfCodeAlign, the first fully transparent and permissive pipeline for self-aligning code LLMs without extensive human annotations or distillation. SelfCodeAlign employs the same base model for inference throughout the data generation process. It first extracts diverse coding concepts from high-quality seed snippets to generate new tasks. It then samples multiple responses per task, pairs each with test cases, and validates them in a sandbox environment. Finally, passing examples are selected for instruction tuning. In our primary experiments, we use SelfCodeAlign with CodeQwen1.5-7B to generate a dataset of 74k instruction-response pairs. Finetuning on this dataset leads to a model that achieves a 67.1 pass@1 on HumanEval+, surpassing CodeLlama-70B-Instruct despite being ten times smaller. Across all benchmarks, this finetuned model consistently outperforms the original version trained with OctoPack, the previous state-of-the-art method for instruction tuning without human annotations or distillation. Additionally, we show that SelfCodeAlign is effective across LLMs of various sizes, from 3B to 33B, and that the base models can benefit more from alignment with their own data distribution. We further validate each component's effectiveness in our pipeline, showing that SelfCodeAlign outperforms both direct distillation from GPT-4o and leading GPT-3.5-based distillation methods, such as OSS-Instruct and Evol-Instruct. SelfCodeAlign has also led to the creation of StarCoder2-Instruct, the first fully transparent, permissively licensed, and self-aligned code LLM that achieves state-of-the-art coding performance.
+ comment: Accepted to NeurIPS 2024
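The data-generation pipeline reads naturally as a loop. The sketch below is a toy outline under assumed interfaces: `model.generate` is a hypothetical text-in/text-out wrapper, and the subprocess runner is a naive stand-in for a real isolation sandbox.

    import subprocess, sys, tempfile

    def run_in_sandbox(code, timeout=10):
        # Naive stand-in for an isolated sandbox: execute the candidate
        # solution (with its test cases) in a subprocess, report success.
        with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
            f.write(code)
            path = f.name
        try:
            return subprocess.run([sys.executable, path], timeout=timeout,
                                  capture_output=True).returncode == 0
        except subprocess.TimeoutExpired:
            return False

    def self_code_align(model, seed_snippets, n_responses=4):
        # One base model drives every step: concept extraction, task
        # generation, response sampling; only sandbox-passing pairs survive.
        dataset = []
        for seed in seed_snippets:
            concepts = model.generate(f"List the coding concepts used in:\n{seed}")
            task = model.generate(f"Write a new coding task exercising: {concepts}")
            for _ in range(n_responses):
                resp = model.generate(f"Solve with code plus unit tests:\n{task}")
                if run_in_sandbox(resp):
                    dataset.append({"instruction": task, "response": resp})
                    break  # keep one validated response per task
        return dataset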
+ ♻ ☆ Language Imbalance Driven Rewarding for Multilingual Self-improving
+ Large Language Models (LLMs) have achieved state-of-the-art performance across numerous tasks. However, these advancements have predominantly benefited "first-class" languages such as English and Chinese, leaving many other languages underrepresented. This imbalance, while limiting broader applications, generates a natural preference ranking between languages, offering an opportunity to bootstrap the multilingual capabilities of LLMs in a self-improving manner. Thus, we propose Language Imbalance Driven Rewarding, where the inherent imbalance between dominant and non-dominant languages within LLMs is leveraged as a reward signal. Iterative DPO training demonstrates that this approach not only enhances LLM performance in non-dominant languages but also improves the dominant language's capacity, thereby yielding an iterative reward signal. Fine-tuning Meta-Llama-3-8B-Instruct over two iterations of this approach results in continuous improvements in multilingual performance across instruction-following and arithmetic reasoning tasks, evidenced by an average improvement of 7.46% win rate on the X-AlpacaEval leaderboard and 13.9% accuracy on the MGSM benchmark. This work serves as an initial exploration, paving the way for multilingual self-improvement of LLMs.
+ comment: Work in progress
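One hedged reading of how the imbalance becomes a reward signal, as a sketch (the `model.generate(prompt, lang=...)` and `translate` helpers are hypothetical interfaces, and the paper's exact pairing may differ): the model's stronger dominant-language answer, translated into the target language, serves as the preferred response of a DPO pair, with the model's native target-language answer as the rejected one.

    def imbalance_preference_pairs(model, translate, prompts,
                                   dominant="en", target="de"):
        # Build DPO-style pairs from the model's own language gap: the
        # dominant-language answer (translated) is "chosen", the weaker
        # native target-language answer is "rejected".
        pairs = []
        for p in prompts:
            strong = model.generate(p, lang=dominant)
            weak = model.generate(p, lang=target)
            pairs.append({
                "prompt": p,
                "chosen": translate(strong, src=dominant, dst=target),
                "rejected": weak,
            })
        return pairs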
+ ♻ ☆ LongRAG: A Dual-Perspective Retrieval-Augmented Generation Paradigm for Long-Context Question Answering EMNLP 2024
+ Long-Context Question Answering (LCQA), a challenging task, aims to reason over long-context documents to yield accurate answers to questions. Existing long-context Large Language Models (LLMs) for LCQA often struggle with the "lost in the middle" issue. Retrieval-Augmented Generation (RAG) mitigates this issue by providing external factual evidence. However, its chunking strategy disrupts the global long-context information, and its low-quality retrieval in long contexts hinders LLMs from identifying effective factual details due to substantial noise. To this end, we propose LongRAG, a general, dual-perspective, and robust LLM-based RAG system paradigm for LCQA to enhance RAG's understanding of complex long-context knowledge (i.e., global information and factual details). We design LongRAG as a plug-and-play paradigm, facilitating adaptation to various domains and LLMs. Extensive experiments on three multi-hop datasets demonstrate that LongRAG significantly outperforms long-context LLMs (up by 6.94%), advanced RAG (up by 6.16%), and Vanilla RAG (up by 17.25%). Furthermore, we conduct quantitative ablation studies and multi-dimensional analyses, highlighting the effectiveness of the system's components and fine-tuning strategies. Data and code are available at https://github.com/QingFei1/LongRAG.
+ comment: EMNLP 2024 Main, Final
+ ♻ ☆ XC-Cache: Cross-Attending to Cached Context for Efficient LLM Inference
+ In-context learning (ICL) approaches typically leverage prompting to condition decoder-only language model generation on reference information. Just-in-time processing of a context is inefficient due to the quadratic cost of self-attention operations, and caching is desirable. However, caching transformer states can easily require almost as much space as the model parameters. When the right context isn't known in advance, caching ICL can be challenging. This work addresses these limitations by introducing models that, inspired by the encoder-decoder architecture, use cross-attention to condition generation on reference text without the prompt. More precisely, we leverage pre-trained decoder-only models and only train a small number of added layers. We use Question-Answering (QA) as a testbed to evaluate the ability of our models to perform conditional generation and observe that they outperform ICL, are comparable to fine-tuned prompted LLMs, and drastically reduce the space footprint relative to standard KV caching by two orders of magnitude.
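A minimal PyTorch sketch of the core mechanism, with illustrative sizes rather than the paper's architecture: a small, newly trained cross-attention block lets frozen decoder states condition on a context representation that was encoded once and cached, so the reference text never re-enters the prompt.

    import torch
    import torch.nn as nn

    class CrossAttendToCache(nn.Module):
        # One added cross-attention layer: decoder hidden states attend to
        # a context encoding computed once offline and reused across queries.
        def __init__(self, d_model=768, n_heads=8):
            super().__init__()
            self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
            self.norm = nn.LayerNorm(d_model)

        def forward(self, hidden, cached_ctx):
            # hidden: (B, T, D) decoder states; cached_ctx: (B, S, D) cache.
            out, _ = self.attn(query=hidden, key=cached_ctx, value=cached_ctx)
            return self.norm(hidden + out)  # residual connection + layer norm

    layer = CrossAttendToCache()
    states = torch.randn(2, 5, 768)    # current decoder states
    cache = torch.randn(2, 100, 768)   # pre-encoded reference text
    fused = layer(states, cache)       # (2, 5, 768)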
+ ♻ ☆ INC-Math: Integrating Natural Language and Code for Enhanced Mathematical Reasoning in Large Language Models
+ Large Language Models (LLMs) are commonly used to generate solutions for mathematical reasoning problems in the following formats: natural language, code, or a combination of both. In this paper, we explore fundamental questions related to solving mathematical reasoning problems using natural language and code with state-of-the-art LLMs, including GPT-4o-mini and LLama-3.1-8b-Turbo. Our findings show that LLMs are better at reasoning in natural language compared to code. Additionally, although natural language and code serve as complementary forms of reasoning, they can affect each other in a negative way in certain scenarios. These insights motivate our development of a new prompting method, INC-Math, which leverages an LLM to dynamically select the most appropriate reasoning form, resulting in improved performance over comparable baselines with GPT-4o-mini.
+ ♻ ☆ Nova: A Practical and Advanced Alignment
+ We introduce Nova, a suite of practical alignment techniques employed in a series of empirically validated high-performing models. This represents the first comprehensive account of alignment methodologies, offering valuable insights for advancing AI research. We investigate the critical components that enhance model performance during the alignment process, including optimization methods, data strategies, capability enhancements, and evaluation processes. The process spans three key stages: Prompt Augmentation System (PAS), Supervised Fine-Tuning (SFT), and Preference Alignment. The problems encountered, the solutions applied, and the improvements made are thoroughly recorded. Through comparisons across well-established benchmarks, we highlight the technological advancements enabled by Nova Alignment. Importantly, Qwen2-Nova-72B and Llama3-PBM-Nova-70B are instruct versions of the Qwen2-72B and Llama-3-70B base models, optimized through Nova. The Nova models show significant core improvements, with user experience gains of 17% to 28%, and excel on specialized benchmarks. In open-source benchmark evaluations, both Qwen2-Nova-72B and Llama3-PBM-Nova-70B consistently outperform their respective official instruct versions across nearly all datasets. This report aims to clarify the key technologies behind the alignment process, fostering a deeper understanding within the community. The Llama3-PBM-Nova-70B model is available at https://huggingface.co/PKU-Baichuan-MLSystemLab/Llama3-PBM-Nova-70B.
+ ♻ ☆ TaskBench: Benchmarking Large Language Models for Task Automation NeurIPS 2024
+ In recent years, the remarkable progress of large language models (LLMs) has sparked interest in task automation, which involves decomposing complex tasks described by user instructions into sub-tasks and invoking external tools to execute them, playing a central role in autonomous agents. However, there is a lack of systematic and standardized benchmarks to promote the development of LLMs in task automation. To address this, we introduce TaskBench, a comprehensive framework to evaluate the capability of LLMs in task automation. Specifically, task automation can be divided into three critical stages: task decomposition, tool selection, and parameter prediction. To tackle the complexities inherent in these stages, we introduce the concept of Tool Graph to represent decomposed tasks and adopt a back-instruct method to generate high-quality user instructions. We propose TaskEval, a multi-faceted evaluation methodology that assesses LLM performance across these three stages. Our approach combines automated construction with rigorous human verification, ensuring high consistency with human evaluation. Experimental results demonstrate that TaskBench effectively reflects the capabilities of various LLMs in task automation. It provides insights into model performance across different task complexities and domains, pushing the boundaries of what current models can achieve. TaskBench offers a scalable, adaptable, and reliable benchmark for advancing LLM-based autonomous agents.
+ comment: Accepted to NeurIPS 2024
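A toy sketch of the back-instruct direction, under assumed shapes (`tool_graph` maps a tool to the tools its output can feed, and `model.generate` is a hypothetical wrapper): sample a small tool chain first, then ask the model for an instruction that requires exactly that chain, which keeps the ground-truth decomposition known by construction.

    import random

    def back_instruct(tool_graph, model, max_len=3):
        # Sample a connected chain of tools, then generate an instruction
        # whose solution requires calling exactly these tools in order.
        node = random.choice(list(tool_graph))
        chain = [node]
        while len(chain) < max_len and tool_graph.get(chain[-1]):
            chain.append(random.choice(tool_graph[chain[-1]]))
        prompt = ("Write a realistic user request that can only be solved by "
                  f"calling these tools in order: {' -> '.join(chain)}")
        return chain, model.generate(prompt)  # model.generate: assumed API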
+ ♻ ☆ Large Language Models for Patient Comments Multi-Label Classification
+ Patient experience and care quality are crucial for a hospital's sustainability and reputation. The analysis of patient feedback offers valuable insight into patient satisfaction and outcomes. However, the unstructured nature of these comments poses challenges for traditional machine learning methods following a supervised learning paradigm, due to the unavailability of labeled data and the nuances these texts encompass. This research explores leveraging Large Language Models (LLMs) to conduct Multi-label Text Classification (MLTC) of inpatient comments shared after a stay in the hospital. GPT-4 Turbo was leveraged to conduct the classification. However, given the sensitive nature of patients' comments, a security layer is introduced before feeding the data to the LLM through a Protected Health Information (PHI) detection framework, which ensures patients' de-identification. Additionally, within a prompt engineering framework, zero-shot learning, in-context learning, and chain-of-thought prompting were evaluated. Results demonstrate that GPT-4 Turbo, whether following a zero-shot or few-shot setting, outperforms traditional methods and Pre-trained Language Models (PLMs), achieving the highest overall performance with an F1-score of 76.12% and a weighted F1-score of 73.61%, followed closely by the few-shot learning results. Subsequently, the results' association with other structured patient experience variables (e.g., rating) was analyzed. The study enhances MLTC through the application of LLMs, offering healthcare practitioners an efficient method to gain deeper insights into patient feedback and deliver prompt, appropriate responses.
+ + ♻ ☆ $\texttt{MixGR}$: Enhancing Retriever Generalization for Scientific + Domain through Complementary Granularity EMNLP 2024 + + +
+ Recent studies show the growing significance of document retrieval in LLM
+generation, i.e., RAG, within the scientific domain, where it bridges the
+models' knowledge gaps. However, dense retrievers often struggle with
+domain-specific retrieval and complex query-document relationships,
+particularly when query segments correspond to various parts of a document. To
+alleviate such prevalent challenges, this paper introduces $\texttt{MixGR}$,
+which improves dense retrievers' awareness of query-document matching across
+various levels of granularity in queries and documents using a zero-shot
+approach. $\texttt{MixGR}$ fuses various metrics based on these granularities
+into a unified score that reflects a comprehensive query-document similarity.
+Our experiments demonstrate that $\texttt{MixGR}$ outperforms previous document
+retrieval by 24.7%, 9.8%, and 6.9% on nDCG@5 with unsupervised, supervised, and
+LLM-based retrievers, respectively, averaged on queries containing multiple
+subqueries from five scientific retrieval datasets. Moreover, the efficacy of
+two downstream scientific question-answering tasks highlights the advantage of
+$\texttt{MixGR}$ to boost the application of LLMs in the scientific domain. The
+code and experimental datasets are available.
+
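+
+ A minimal sketch of the underlying idea: score a query-document pair at
+several granularities (whole query vs. whole document, each subquery vs. its
+best-matching section) and fuse the scores. The equal-weight mean fusion here
+is an illustrative assumption, not the paper's exact fusion rule:
+
+import numpy as np
+
+def cos(a, b):
+    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+def mixed_granularity_score(query_vec, subquery_vecs, doc_vec, section_vecs):
+    coarse = cos(query_vec, doc_vec)
+    # each subquery is matched to its best document section
+    fine = np.mean([max(cos(s, sec) for sec in section_vecs)
+                    for s in subquery_vecs])
+    return 0.5 * coarse + 0.5 * fine  # fused, unified score
+
+rng = np.random.default_rng(0)
+q, d = rng.normal(size=64), rng.normal(size=64)
+subs = [rng.normal(size=64) for _ in range(2)]
+secs = [rng.normal(size=64) for _ in range(3)]
+print(mixed_granularity_score(q, subs, d, secs))
+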
+
+ comment: EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Generative Pre-trained Speech Language Model with Efficient Hierarchical + Transformer ACL2024 + + +
+ While recent advancements in speech language models have achieved significant
+progress, they face remarkable challenges in modeling the long acoustic
+sequences of neural audio codecs. In this paper, we introduce
+\textbf{G}enerative \textbf{P}re-trained \textbf{S}peech \textbf{T}ransformer
+(GPST), a hierarchical transformer designed for efficient speech language
+modeling. GPST quantizes audio waveforms into two distinct types of discrete
+speech representations and integrates them within a hierarchical transformer
+architecture, allowing for a unified one-stage generation process and enhancing
+Hi-Res audio generation capabilities. By training on large corpora of speech
+in an end-to-end unsupervised manner, GPST can generate syntactically
+consistent speech with diverse speaker identities. Given a brief 3-second
+prompt, GPST can produce natural and coherent personalized speech,
+demonstrating in-context learning abilities. Moreover, our approach can be
+easily extended to cross-lingual speech generation by incorporating
+multi-lingual semantic tokens and universal acoustic tokens. Experimental
+results indicate that GPST significantly outperforms the existing speech
+language models in terms of word error rate, speech quality, and speaker
+similarity. The code is available at \url{https://github.com/youngsheen/GPST}.
+
+
+
 comment: Accepted to ACL 2024 (main conference)
+
+
+
+
+ + ♻ ☆ Semantic Density: Uncertainty Quantification for Large Language Models + through Confidence Measurement in Semantic Space + + +
+ With the widespread application of Large Language Models (LLMs) to various
+domains, concerns regarding the trustworthiness of LLMs in safety-critical
+scenarios have been raised, due to their unpredictable tendency to hallucinate
+and generate misinformation. Existing LLMs do not have an inherent
+functionality to provide the users with an uncertainty/confidence metric for
+each response they generate, making it difficult to evaluate trustworthiness.
+Although several studies aim to develop uncertainty quantification methods for
+LLMs, they have fundamental limitations, such as being restricted to
+classification tasks, requiring additional training and data, considering only
+lexical instead of semantic information, and being prompt-wise but not
+response-wise. A new framework is proposed in this paper to address these
+issues. Semantic density extracts uncertainty/confidence information for each
+response from a probability distribution perspective in semantic space. It has
+no restriction on task types and is "off-the-shelf" for new models and tasks.
+Experiments on seven state-of-the-art LLMs, including the latest Llama 3 and
+Mixtral-8x22B models, on four free-form question-answering benchmarks
+demonstrate the superior performance and robustness of semantic density
+compared to prior approaches.
+
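+
+ The intuition can be sketched in a few lines: embed sampled responses, then
+estimate how much probability mass lies near the target response with a kernel
+density estimate. The Gaussian kernel and fixed bandwidth below are
+illustrative assumptions, not the paper's exact estimator:
+
+import numpy as np
+
+def semantic_density(target_emb, sample_embs, bandwidth=1.0):
+    diffs = sample_embs - target_emb           # (n_samples, dim)
+    sq_dists = np.sum(diffs ** 2, axis=1)
+    kernel = np.exp(-sq_dists / (2 * bandwidth ** 2))
+    return float(kernel.mean())                # high = semantically dense region
+
+rng = np.random.default_rng(0)
+samples = rng.normal(size=(20, 128))           # embeddings of sampled answers
+confident = semantic_density(samples.mean(axis=0), samples)
+outlier = semantic_density(samples.mean(axis=0) + 5.0, samples)
+print(confident > outlier)                     # True: outliers score lower
+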
+
+
 comment: Accepted to NeurIPS 2024
+
+
+
+
+ + ♻ ☆ LaCour!: Enabling Research on Argumentation in Hearings of the European + Court of Human Rights + + +
+ Why does an argument end up in the final court decision? Was it deliberated +or questioned during the oral hearings? Was there something in the hearings +that triggered a particular judge to write a dissenting opinion? Despite the +availability of the final judgments of the European Court of Human Rights +(ECHR), none of these legal research questions can currently be answered as the +ECHR's multilingual oral hearings are not transcribed, structured, or +speaker-attributed. We address this fundamental gap by presenting LaCour!, the +first corpus of textual oral arguments of the ECHR, consisting of 154 full +hearings (2.1 million tokens from over 267 hours of video footage) in English, +French, and other court languages, each linked to the corresponding final +judgment documents. In addition to the transcribed and partially manually +corrected text from the video, we provide sentence-level timestamps and +manually annotated role and language labels. We also showcase LaCour! in a set +of preliminary experiments that explore the interplay between questions and +dissenting opinions. Apart from the use cases in legal NLP, we hope that law +students or other interested parties will also use LaCour! as a learning +resource, as it is freely available in various formats at +https://huggingface.co/datasets/TrustHLT/LaCour. + +
+
+
+
+
+ + ♻ ☆ Multi-Agent Large Language Models for Conversational Task-Solving + + +
+ In an era where single large language models have dominated the landscape of
+artificial intelligence for years, multi-agent systems arise as new
+protagonists in conversational task-solving. While previous studies have
+showcased their potential in reasoning tasks and creative endeavors, an
+analysis of their limitations concerning conversational paradigms and the
+impact of individual agents is missing. It remains unclear how
+multi-agent discussions perform across tasks of varying complexity and how the
+structure of these conversations influences the process. To fill that gap, this
+work systematically evaluates multi-agent systems across various discussion
+paradigms, assessing their strengths and weaknesses in both generative tasks
+and question-answering tasks. Alongside the experiments, I propose a taxonomy
+of 20 multi-agent research studies from 2022 to 2024, followed by the
+introduction of a framework for deploying multi-agent LLMs in conversational
+task-solving. I demonstrate that while multi-agent systems excel in complex
+reasoning tasks, outperforming a single model by leveraging expert personas,
+they fail on basic tasks. Concretely, I identify three challenges that arise:
+1) While longer discussions enhance reasoning, agents fail to maintain
+conformity to strict task requirements, which leads to problem drift, making
+shorter conversations more effective for basic tasks. 2) Prolonged discussions
+risk alignment collapse, raising new safety concerns for these systems. 3) I
+showcase discussion monopolization through long generations, posing the problem
+of fairness in decision-making for tasks like summarization. This work uncovers
+both the potential and challenges that arise with multi-agent interaction and
+varying conversational paradigms, providing insights into how future research
+could improve the efficiency, performance, and safety of multi-agent LLMs.
+
+
+
+
+
+ + ♻ ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models, +especially for long video understanding. We introduce LongVILA, a full-stack +solution for long-context visual-language models by co-designing the algorithm +and system. For model training, we upgrade existing VLMs to support long video +understanding by incorporating two additional stages, i.e., long context +extension and long video supervised fine-tuning. However, training on long +video is computationally and memory intensive. We introduce the long-context +Multi-Modal Sequence Parallelism (MM-SP) system that efficiently parallelizes +long video training and inference, enabling 2M context length training on 256 +GPUs without any gradient checkpointing. LongVILA efficiently extends the +number of video frames of VILA from 8 to 2048, improving the long video +captioning score from 2.00 to 3.26 (out of 5), achieving 99.8% accuracy in +6,000-frame (more than 1 million tokens) video needle-in-a-haystack. +LongVILA-7B demonstrates strong accuracy on the VideoMME benchmark, i.e., 61.8% +with subtitle. Besides, MM-SP is 2.1x - 5.7x faster than ring style sequence +parallelism and 1.1x - 1.4x faster than Megatron with a hybrid context and +tensor parallelism. Moreover, it seamlessly integrates with Hugging Face +Transformers. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ♻ ☆ Aligning Large Language Models with Human Opinions through Persona + Selection and Value--Belief--Norm Reasoning + + +
+ Reasoning and predicting human opinions with large language models (LLMs) is
+essential yet challenging. Current methods employ role-playing with personae
+but face two major issues: LLMs are sensitive to even a single irrelevant
+persona, skewing predictions by up to 30%, and LLMs fail to reason
+strategically over personae. We propose Chain-of-Opinion (COO), a simple
+four-step solution that models which personae to reason with and how, inspired
+by the Value--Belief--Norm (VBN) theory. COO differentiates between explicit
+personae (demographics and ideology) and implicit personae (historical
+opinions), and involves: (1) filtering irrelevant attributes from explicit
+personae, (2) ranking implicit personae into a preferential list for selecting
+top-k, (3) applying novel VBN reasoning to extract user environmental and
+personal value, belief, and norm variables for accurate and reliable
+predictions, and (4) iterating VBN reasoning with progressively larger lists of
+implicit personae to handle potential persona insufficiency. COO efficiently
+achieves new state-of-the-art opinion prediction via prompting with only 5
+inference calls, improving prior techniques by up to 4%. Notably, fine-tuning
+LMs with COO data results in significantly better opinion-aligned models, by up
+to 23%.
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Du-IN: Discrete units-guided mask modeling for decoding speech from + Intracranial Neural signals + + +
+ Invasive brain-computer interfaces with Electrocorticography (ECoG) have +shown promise for high-performance speech decoding in medical applications, but +less damaging methods like intracranial stereo-electroencephalography (sEEG) +remain underexplored. With rapid advances in representation learning, +leveraging abundant recordings to enhance speech decoding is increasingly +attractive. However, popular methods often pre-train temporal models based on +brain-level tokens, overlooking that brain activities in different regions are +highly desynchronized during tasks. Alternatively, they pre-train +spatial-temporal models based on channel-level tokens but fail to evaluate them +on challenging tasks like speech decoding, which requires intricate processing +in specific language-related areas. To address this issue, we collected a +well-annotated Chinese word-reading sEEG dataset targeting language-related +brain networks from 12 subjects. Using this benchmark, we developed the Du-IN +model, which extracts contextual embeddings based on region-level tokens +through discrete codex-guided mask modeling. Our model achieves +state-of-the-art performance on the 61-word classification task, surpassing all +baselines. Model comparisons and ablation studies reveal that our design +choices, including (i) temporal modeling based on region-level tokens by +utilizing 1D depthwise convolution to fuse channels in the ventral sensorimotor +cortex (vSMC) and superior temporal gyrus (STG) and (ii) self-supervision +through discrete codex-guided mask modeling, significantly contribute to this +performance. Overall, our approach -- inspired by neuroscience findings and +capitalizing on region-level representations from specific brain regions -- is +suitable for invasive brain modeling and represents a promising neuro-inspired +AI approach in brain-computer interfaces. + +
+
+
+
+
+ + ♻ ☆ A Systematic Survey on Large Language Models for Algorithm Design + + +
+ Algorithm Design (AD) is crucial for effective problem-solving across various
+domains. The advent of Large Language Models (LLMs) has notably enhanced the
+automation and innovation within this field, offering new perspectives and
+promising solutions. Over the past three years, the integration of LLMs into AD
+(LLM4AD) has seen substantial progress, with applications spanning
+optimization, machine learning, mathematical reasoning, and scientific
+discovery. Given the rapid advancements and expanding scope of this field, a
+systematic review is both timely and necessary. This paper provides a
+systematic review of LLM4AD. First, we offer an overview and summary of
+existing studies. Then, we introduce a taxonomy and review the literature
+across four dimensions: the roles of LLMs, search methods, prompt methods, and
+application domains, with a discussion of the potential and achievements of
+LLMs in AD. Finally, we identify current challenges and highlight several
+promising directions for future research.
+
+
+
+
+
+ + ♻ ☆ Shortcut-connected Expert Parallelism for Accelerating + Mixture-of-Experts + + +
+ Expert parallelism has been introduced as a strategy to distribute the +computational workload of sparsely-gated mixture-of-experts (MoE) models across +multiple computing devices, facilitating the execution of these increasingly +large-scale models. However, the All-to-All communication intrinsic to expert +parallelism constitutes a significant overhead, diminishing the MoE models' +efficiency. Current optimization approaches offer some relief, yet they are +constrained by the sequential interdependence of communication and computation +operations. To address this limitation, we present a novel shortcut-connected +MoE (ScMoE) architecture with an overlapping parallel strategy, which +effectively decouples communication from its conventional sequence, allowing +for a substantial overlap of 70% to 100% with computation. When compared with +the prevalent top-2 MoE architecture, ScMoE demonstrates training speed +improvements of 30% and 11%, and inference improvements of 40% and 15%, in our +distributed environments with PCIe and NVLink hardware, respectively, where +communication constitutes 60% and 15% of the total MoE time consumption. +Building on the ScMoE architecture, we further implement an expert offloading +strategy to facilitate memory-limited inference, optimizing latency through the +overlap of expert migration. Additionally, extensive experiments and +theoretical analyses indicate that ScMoE not only achieves comparable but in +some instances surpasses the model quality of existing approaches. + +
+
+
+
+
+ + ♻ ☆ Block Transformer: Global-to-Local Language Modeling for Fast Inference + + +
+ We introduce the Block Transformer which adopts hierarchical global-to-local +modeling to autoregressive transformers to mitigate the inference bottlenecks +associated with self-attention. Self-attention requires the key-value (KV) +cache of all previous sequences to be retrieved from memory at every decoding +step to retrieve context information, leading to two primary bottlenecks during +batch inference. First, there is a significant delay in obtaining the first +token, as the information of the entire prompt must first be processed to +prefill the KV cache. Second, computation of subsequent tokens is bottlenecked +by the high memory I/O demand of fetching the entire KV cache, which grows +linearly with sequence length, incurring quadratic memory reads overall. We +design the Block Transformer to strategically mitigate these costs, by +incorporating coarsity and locality into an integrated global-to-local +architecture. At the lower layers, we aggregate tokens into fixed size blocks +to apply attention across the entire sequence at coarse-grained detail, to +capture the global context while minimizing KV cache overhead. At upper layers, +we apply attention within each block to decode individual tokens, to model +fine-grained details with a lightweight local KV cache. We pretrain vanilla and +Block Transformers from scratch and demonstrate that Block Transformers reach +10--20x inference throughput compared to vanilla transformers with equivalent +perplexity and zero-shot task performance. Code is available at +https://github.com/itsnamgyu/block-transformer. + +
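+
+ A toy PyTorch sketch of the global-to-local split described above: pool
+tokens into fixed-size blocks, run coarse attention over block embeddings,
+then attend only within each block at the token level. Dimensions, mean
+pooling, and the omission of causal masking are illustrative simplifications,
+not the paper's architecture:
+
+import torch
+import torch.nn as nn
+
+class ToyBlockTransformer(nn.Module):
+    def __init__(self, d=64, block=4, heads=4):
+        super().__init__()
+        self.block = block
+        self.global_layer = nn.TransformerEncoderLayer(d, heads, batch_first=True)
+        self.local_layer = nn.TransformerEncoderLayer(d, heads, batch_first=True)
+
+    def forward(self, x):                      # x: (B, T, d), T % block == 0
+        B, T, d = x.shape
+        # coarse stage: one embedding per block, global attention over blocks
+        blocks = x.view(B, T // self.block, self.block, d).mean(dim=2)
+        ctx = self.global_layer(blocks)
+        # broadcast each block's global context back to its tokens
+        ctx_tokens = ctx.repeat_interleave(self.block, dim=1)
+        # fine stage: attention restricted to tokens inside one block
+        local_in = (x + ctx_tokens).view(B * T // self.block, self.block, d)
+        out = self.local_layer(local_in)
+        return out.view(B, T, d)
+
+y = ToyBlockTransformer()(torch.randn(2, 16, 64))
+print(y.shape)  # torch.Size([2, 16, 64])
+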
+
+ comment: 37 pages, 24 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Towards Robust Multimodal Sentiment Analysis with Incomplete Data NeurIPS 2024 + + +
+ The field of Multimodal Sentiment Analysis (MSA) has recently witnessed an +emerging direction seeking to tackle the issue of data incompleteness. +Recognizing that the language modality typically contains dense sentiment +information, we consider it as the dominant modality and present an innovative +Language-dominated Noise-resistant Learning Network (LNLN) to achieve robust +MSA. The proposed LNLN features a dominant modality correction (DMC) module and +dominant modality based multimodal learning (DMML) module, which enhances the +model's robustness across various noise scenarios by ensuring the quality of +dominant modality representations. Aside from the methodical design, we perform +comprehensive experiments under random data missing scenarios, utilizing +diverse and meaningful settings on several popular datasets (\textit{e.g.,} +MOSI, MOSEI, and SIMS), providing additional uniformity, transparency, and +fairness compared to existing evaluations in the literature. Empirically, LNLN +consistently outperforms existing baselines, demonstrating superior performance +across these challenging and extensive evaluation metrics. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ MAPLE: Mobile App Prediction Leveraging Large Language Model Embeddings + + +
+ In recent years, predicting mobile app usage has become increasingly +important for areas like app recommendation, user behaviour analysis, and +mobile resource management. Existing models, however, struggle with the +heterogeneous nature of contextual data and the user cold start problem. This +study introduces a novel prediction model, Mobile App Prediction Leveraging +Large Language Model Embeddings (MAPLE), which employs Large Language Models +(LLMs) and installed app similarity to overcome these challenges. MAPLE +utilises the power of LLMs to process contextual data and discern intricate +relationships within it effectively. Additionally, we explore the use of +installed app similarity to address the cold start problem, facilitating the +modelling of user preferences and habits, even for new users with limited +historical data. In essence, our research presents MAPLE as a novel, potent, +and practical approach to app usage prediction, making significant strides in +resolving issues faced by existing models. MAPLE stands out as a comprehensive +and effective solution, setting a new benchmark for more precise and +personalised app usage predictions. In tests on two real-world datasets, MAPLE +surpasses contemporary models in both standard and cold start scenarios. These +outcomes validate MAPLE's capacity for precise app usage predictions and its +resilience against the cold start problem. This enhanced performance stems from +the model's proficiency in capturing complex temporal patterns and leveraging +contextual information. As a result, MAPLE can potentially improve personalised +mobile app usage predictions and user experiences markedly. + +
+
+
+
+
+ + ♻ ☆ $FastDoc$: Domain-Specific Fast Continual Pre-training Technique using + Document-Level Metadata and Taxonomy + + +
+ In this paper, we propose $FastDoc$ (Fast Continual Pre-training Technique
+using Document Level Metadata and Taxonomy), a novel, compute-efficient
+framework that utilizes Document metadata and Domain-Specific Taxonomy as
+supervision signals to continually pre-train transformer encoder on a
+domain-specific corpus. The main innovation is that during domain-specific
+pretraining, an open-domain encoder is continually pre-trained using
+sentence-level embeddings as inputs (to accommodate long documents), however,
+fine-tuning is done with token-level embeddings as inputs to this encoder. We
+perform such domain-specific pre-training on three different domains namely
+customer support, scientific, and legal domains, and compare performance on 6
+different downstream tasks and 9 different datasets. The novel use of
+document-level supervision along with sentence-level embedding input for
+pre-training reduces pre-training compute by around $1,000$, $4,500$, and $500$
+times compared to MLM and/or NSP in Customer Support, Scientific, and Legal
+Domains, respectively. The reduced training time does not lead to a
+deterioration in performance. In fact, we show that $FastDoc$ either outperforms
+or performs on par with several competitive transformer-based baselines in
+terms of character-level F1 scores and other automated metrics in the Customer
+Support, Scientific, and Legal Domains. Moreover, reduced training aids in
+mitigating the risk of catastrophic forgetting. Thus, unlike baselines,
+$FastDoc$ shows a negligible drop in performance on the open domain.
+
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR), 36 + pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Adversarial Representation Engineering: A General Model Editing + Framework for Large Language Models NeurIPS 2024 + + +
+ Since the rapid development of Large Language Models (LLMs) has achieved +remarkable success, understanding and rectifying their internal complex +mechanisms has become an urgent issue. Recent research has attempted to +interpret their behaviors through the lens of inner representation. However, +developing practical and efficient methods for applying these representations +for general and flexible model editing remains challenging. In this work, we +explore how to leverage insights from representation engineering to guide the +editing of LLMs by deploying a representation sensor as an editing oracle. We +first identify the importance of a robust and reliable sensor during editing, +then propose an Adversarial Representation Engineering (ARE) framework to +provide a unified and interpretable approach for conceptual model editing +without compromising baseline performance. Experiments on multiple tasks +demonstrate the effectiveness of ARE in various model editing scenarios. Our +code and data are available at +https://github.com/Zhang-Yihao/Adversarial-Representation-Engineering. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors.
+Existing methods discover error slices by clustering the error-prone samples
+with similar patterns or assigning discrete attributes to each sample for
+post-hoc analysis. While these methods aim for interpretability and easier
+mitigation through reweighting or rebalancing, they may not capture the full
+complexity of error patterns due to incomplete or missing attributes. Contrary
+to these existing approaches, this paper utilizes the reasoning capabilities of
+the Large Language Model (LLM) to analyze complex error patterns and generate
+testable hypotheses. This paper proposes LADDER: Language Driven slice
+Discovery and Error Rectification. It first projects the model's representation
+into a language-aligned feature space (e.g., CLIP) to preserve semantics in the
+original model feature space. This ensures the accurate retrieval of sentences
+that highlight the model's errors. Next, the LLM utilizes the sentences and
+generates hypotheses to discover error slices. Finally, we mitigate the error
+by fine-tuning the classification head by creating a group-balanced dataset
+using the hypotheses. Our entire method does not require any attribute
+annotation, either explicitly or through external tagging models. We validate
+our method with \textbf{five} image classification datasets. The code is
+available (https://github.com/batmanlab/Ladder).
+
+
+
+
+
+ + ♻ ☆ Revisiting the Impact of Pursuing Modularity for Code Generation EMNLP 2024 + + +
+ Modular programming, which aims to construct the final program by integrating +smaller, independent building blocks, has been regarded as a desirable practice +in software development. However, with the rise of recent code generation +agents built upon large language models (LLMs), a question emerges: is this +traditional practice equally effective for these new tools? In this work, we +assess the impact of modularity in code generation by introducing a novel +metric for its quantitative measurement. Surprisingly, unlike conventional +wisdom on the topic, we find that modularity is not a core factor for improving +the performance of code generation models. We also explore potential +explanations for why LLMs do not exhibit a preference for modular code compared +to non-modular code. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Towards Faithful Natural Language Explanations: A Study Using Activation + Patching in Large Language Models + + +
+ Large Language Models (LLMs) are capable of generating persuasive Natural
+Language Explanations (NLEs) to justify their answers. However, the
+faithfulness of these explanations should not be readily trusted at face value.
+Recent studies have proposed various methods to measure the faithfulness of
+NLEs, typically by inserting perturbations at the explanation or feature level.
+We argue that these approaches are neither comprehensive nor correctly designed
+according to the established definition of faithfulness. Moreover, we highlight
+the risks of grounding faithfulness findings on out-of-distribution samples. In
+this work, we leverage a causal mediation technique called activation patching,
+to measure the faithfulness of an explanation towards supporting the explained
+answer. Our proposed metric, Causal Faithfulness, quantifies the consistency of
+causal attributions between explanations and the corresponding model outputs as
+the indicator of faithfulness. We experimented across models varying from 2B to
+27B parameters and found that models that underwent alignment tuning tend to
+produce more faithful and plausible explanations. We find that Causal
+Faithfulness is a promising improvement over existing faithfulness tests by
+taking into account the model's internal computations and avoiding out of
+distribution concerns that could otherwise undermine the validity of
+faithfulness assessments. We release the code at
+\url{https://github.com/wj210/Causal-Faithfulness}.
+
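+
+ For readers unfamiliar with activation patching, here is a minimal,
+self-contained PyTorch illustration of the mechanic: cache an activation from
+a "clean" run, patch it into a "corrupted" run via a forward hook, and check
+how much of the clean output is restored. The tiny MLP is a stand-in for an
+LLM layer, not the paper's setup:
+
+import torch
+import torch.nn as nn
+
+model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 1))
+clean, corrupted = torch.randn(1, 8), torch.randn(1, 8)
+
+cache = {}
+h = model[0].register_forward_hook(lambda m, i, o: cache.update(act=o.detach()))
+clean_out = model(clean)                       # caching hook returns None
+h.remove()
+
+def patch(module, inputs, output):
+    return cache["act"]                        # replace corrupted activation
+
+h = model[0].register_forward_hook(patch)
+patched_out = model(corrupted)
+h.remove()
+
+# recovery toward the clean output indicates the patched site mediates it
+print(clean_out.item(), model(corrupted).item(), patched_out.item())
+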
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ AutoManual: Generating Instruction Manuals by LLM Agents via Interactive + Environmental Learning NeurIPS 2024 + + +
+ Large Language Models (LLM) based agents have shown promise in autonomously +completing tasks across various domains, e.g., robotics, games, and web +navigation. However, these agents typically require elaborate design and expert +prompts to solve tasks in specific domains, which limits their adaptability. We +introduce AutoManual, a framework enabling LLM agents to autonomously build +their understanding through interaction and adapt to new environments. +AutoManual categorizes environmental knowledge into diverse rules and optimizes +them in an online fashion by two agents: 1) The Planner codes actionable plans +based on current rules for interacting with the environment. 2) The Builder +updates the rules through a well-structured rule system that facilitates online +rule management and essential detail retention. To mitigate hallucinations in +managing rules, we introduce a *case-conditioned prompting* strategy for the +Builder. Finally, the Formulator agent compiles these rules into a +comprehensive manual. The self-generated manual can not only improve the +adaptability but also guide the planning of smaller LLMs while being +human-readable. Given only one simple demonstration, AutoManual significantly +improves task success rates, achieving 97.4\% with GPT-4-turbo and 86.2\% with +GPT-3.5-turbo on ALFWorld benchmark tasks. The code is available at +https://github.com/minghchen/automanual. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ In-Context Transfer Learning: Demonstration Synthesis by Transferring + Similar Tasks + + +
+ In-context learning (ICL) is an effective approach to help large language +models (LLMs) adapt to various tasks by providing demonstrations of the target +task. Considering the high cost of labeling demonstrations, many methods +propose synthesizing demonstrations from scratch using LLMs. However, the +quality of the demonstrations synthesized from scratch is limited by the +capabilities and knowledge of LLMs. To address this, inspired by transfer +learning, we propose In-Context Transfer Learning (ICTL), which synthesizes +target task demonstrations by transferring labeled demonstrations from similar +source tasks. ICTL consists of two steps: source sampling and target transfer. +First, we define an optimization objective, which minimizes transfer error to +sample source demonstrations similar to the target task. Then, we employ LLMs +to transfer the sampled source demonstrations to the target task, matching the +definition and format of the target task. Experiments on Super-NI show that +ICTL outperforms synthesis from scratch by 2.0% on average, demonstrating the +effectiveness of our method. + +
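+
+ A minimal sketch of the source-sampling step: embed task definitions, pick
+the source tasks closest to the target, then hand their demonstrations to an
+LLM for rewriting into the target format. The random embeddings and the
+transfer instruction are illustrative stand-ins, not the paper's exact
+objective:
+
+import numpy as np
+
+def sample_sources(target_emb, source_embs, k=2):
+    sims = source_embs @ target_emb / (
+        np.linalg.norm(source_embs, axis=1) * np.linalg.norm(target_emb))
+    return np.argsort(-sims)[:k]               # indices of most similar tasks
+
+rng = np.random.default_rng(0)
+sources = ["sentiment classification", "topic classification", "translation"]
+source_embs = rng.normal(size=(3, 32))
+target_emb = source_embs[0] + 0.1 * rng.normal(size=32)  # resembles task 0
+
+for i in sample_sources(target_emb, source_embs):
+    print(f"Transfer demonstrations from '{sources[i]}' to the target task, "
+          f"matching the target task's definition and output format.")
+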
+
+
+
+
+ + ♻ ☆ Lightweight Transducer Based on Frame-Level Criterion + + +
+ The transducer model trained with a sequence-level criterion requires a lot
+of memory due to the generation of the large probability matrix. We propose a
+lightweight transducer model based on a frame-level criterion, which uses the
+results of the CTC forced alignment algorithm to determine the label for each
+frame. Then the encoder output can be combined with the decoder output at the
+corresponding time, rather than adding each element output by the encoder to
+each element output by the decoder as in the transducer. This significantly
+reduces memory and computation requirements. To address the problem of
+imbalanced classification caused by excessive blanks in the label, we decouple
+the blank and non-blank probabilities and truncate the gradient of the blank
+classifier to the main network. Experiments on AISHELL-1 demonstrate that
+this enables the lightweight transducer to achieve results similar to the
+standard transducer. Additionally, we use richer information to predict the
+probability of blank, achieving results superior to the transducer.
+
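+
+ The memory saving comes from scoring one (frame, label) pair per time step
+instead of a full (frames x labels) lattice. A toy PyTorch sketch, with shapes
+and the additive encoder-decoder combination as illustrative assumptions:
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+T, V, d = 6, 10, 16                            # frames, vocab (0 = blank), dim
+enc = torch.randn(T, d)                        # encoder output per frame
+dec = torch.randn(T, d)                        # decoder state per frame, given
+                                               # the aligned label history
+frame_labels = torch.tensor([0, 3, 3, 0, 7, 0])  # from CTC forced alignment
+
+proj = nn.Linear(d, V)
+logits = proj(enc + dec)                       # (T, V): one joint per frame,
+                                               # not a (T, U, V) lattice
+loss = F.cross_entropy(logits, frame_labels)   # frame-level criterion
+print(loss.item())
+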
+
+ comment: Accepted by Interspeech 2024, code repository: + https://github.com/wangmengzhi/Lightweight-Transducer +
+
+
+
+
+ + ♻ ☆ CRAG -- Comprehensive RAG Benchmark NeurIPS 2024 + + +
+ Retrieval-Augmented Generation (RAG) has recently emerged as a promising +solution to alleviate Large Language Model (LLM)'s deficiency in lack of +knowledge. Existing RAG datasets, however, do not adequately represent the +diverse and dynamic nature of real-world Question Answering (QA) tasks. To +bridge this gap, we introduce the Comprehensive RAG Benchmark (CRAG), a factual +question answering benchmark of 4,409 question-answer pairs and mock APIs to +simulate web and Knowledge Graph (KG) search. CRAG is designed to encapsulate a +diverse array of questions across five domains and eight question categories, +reflecting varied entity popularity from popular to long-tail, and temporal +dynamisms ranging from years to seconds. Our evaluation of this benchmark +highlights the gap to fully trustworthy QA. Whereas most advanced LLMs achieve +<=34% accuracy on CRAG, adding RAG in a straightforward manner improves the +accuracy only to 44%. State-of-the-art industry RAG solutions only answer 63% +of questions without any hallucination. CRAG also reveals much lower accuracy +in answering questions regarding facts with higher dynamism, lower popularity, +or higher complexity, suggesting future research directions. The CRAG benchmark +laid the groundwork for a KDD Cup 2024 challenge and attracted thousands of +participants and submissions. We commit to maintaining CRAG to serve research +communities in advancing RAG solutions and general QA solutions. CRAG is +available at https://github.com/facebookresearch/CRAG/. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ TART: An Open-Source Tool-Augmented Framework for Explainable + Table-based Reasoning + + +
+ Current Large Language Models (LLMs) exhibit limited ability to understand +table structures and to apply precise numerical reasoning, which is crucial for +tasks such as table question answering (TQA) and table-based fact verification +(TFV). To address these challenges, we introduce our Tool-Augmented Reasoning +framework for Tables (TART), which integrates LLMs with specialized tools. TART +contains three key components: a table formatter to ensure accurate data +representation, a tool maker to develop specific computational tools, and an +explanation generator to maintain explainability. We also present the TOOLTAB +dataset, a new benchmark designed specifically for training LLMs in table-tool +integration. Our experiments indicate that TART achieves substantial +improvements over existing methods (e.g., Chain-of-Thought) by improving both +the precision of data processing and the clarity of the reasoning process. +Notably, TART paired with CodeLlama achieves 90.0% of the accuracy of the +closed-sourced LLM GPT-3.5-turbo, highlighting its robustness in diverse +real-world scenarios. All the code and data are available at +https://github.com/XinyuanLu00/TART. + +
+
+ comment: technical report +
+
+
+
+
+ + ♻ ☆ When Large Language Models Meet Vector Databases: A Survey + + +
+ This survey explores the synergistic potential of Large Language Models +(LLMs) and Vector Databases (VecDBs), a burgeoning but rapidly evolving +research area. With the proliferation of LLMs comes a host of challenges, +including hallucinations, outdated knowledge, prohibitive commercial +application costs, and memory issues. VecDBs emerge as a compelling solution to +these issues by offering an efficient means to store, retrieve, and manage the +high-dimensional vector representations intrinsic to LLM operations. Through +this nuanced review, we delineate the foundational principles of LLMs and +VecDBs and critically analyze their integration's impact on enhancing LLM +functionalities. This discourse extends into a discussion on the speculative +future developments in this domain, aiming to catalyze further research into +optimizing the confluence of LLMs and VecDBs for advanced data handling and +knowledge extraction capabilities. + +
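+
+ The core LLM+VecDB pattern the survey covers fits in a few lines: store
+chunk embeddings, retrieve nearest neighbors for a query, and prepend them to
+the prompt to ground the model. The hash-based embedding and in-memory list
+below are toy stand-ins for a real embedding model and a real vector database:
+
+import numpy as np
+
+def embed(text: str, dim: int = 64) -> np.ndarray:
+    rng = np.random.default_rng(abs(hash(text)) % (2**32))
+    v = rng.normal(size=dim)
+    return v / np.linalg.norm(v)
+
+store = []                                     # [(vector, chunk), ...]
+for chunk in ["LLMs hallucinate without grounding.",
+              "Vector databases index high-dimensional embeddings.",
+              "RAG retrieves context before generation."]:
+    store.append((embed(chunk), chunk))
+
+def retrieve(query: str, k: int = 2):
+    q = embed(query)
+    ranked = sorted(store, key=lambda pair: -float(pair[0] @ q))
+    return [chunk for _, chunk in ranked[:k]]
+
+context = "\n".join(retrieve("Why pair an LLM with a vector database?"))
+print(f"Context:\n{context}\n\nQuestion: Why pair an LLM with a VecDB?")
+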
+
+
+
+
+ + ♻ ☆ Large Language Models as Efficient Reward Function Searchers for + Custom-Environment Multi-Objective Reinforcement Learning + + +
+ Achieving the effective design and improvement of reward functions in
+reinforcement learning (RL) tasks with complex custom environments and multiple
+requirements presents considerable challenges. In this paper, we propose ERFSL,
+an efficient reward function searcher using LLMs, which enables LLMs to be
+effective white-box searchers and highlights their advanced semantic
+understanding capabilities. Specifically, we generate reward components for
+each numerically explicit user requirement and employ a reward critic to
+identify the correct code form. Then, LLMs assign weights to the reward
+components to balance their values and iteratively adjust the weights without
+ambiguity and redundant adjustments by flexibly adopting directional mutation
+and crossover strategies, similar to genetic algorithms, based on the context
+provided by the training log analyzer. We applied the framework to an
+underwater data collection RL task without direct human feedback or reward
+examples (zero-shot learning). The reward critic successfully corrects the
+reward code with only one feedback instance for each requirement, effectively
+preventing unrectifiable errors. The initialization of weights enables the
+acquisition of different reward functions within the Pareto solution set
+without the need for weight search. Even in cases where a weight is 500 times
+off, on average, only 5.2 iterations are needed to meet user requirements. The
+ERFSL also works well with most prompts utilizing GPT-4o mini, as we decompose
+the weight searching process to reduce the requirement for numerical and
+long-context understanding capabilities.
+
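+
+ The decomposition this relies on is easy to picture: one reward component
+per explicit requirement, combined through searchable weights that the LLM
+mutates between iterations. The underwater-task components and numbers below
+are illustrative assumptions:
+
+def r_data_collected(state):                   # maximize throughput
+    return state["data_rate"]
+
+def r_energy(state):                           # penalize energy use
+    return -state["energy_used"]
+
+def r_collision(state):                        # hard safety requirement
+    return -10.0 if state["collided"] else 0.0
+
+COMPONENTS = [r_data_collected, r_energy, r_collision]
+
+def reward(state, weights):
+    return sum(w * c(state) for w, c in zip(weights, COMPONENTS))
+
+state = {"data_rate": 3.2, "energy_used": 1.1, "collided": False}
+# the LLM searcher would mutate/crossover these weights between iterations
+print(reward(state, weights=[1.0, 0.5, 2.0]))
+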
+
+
+
+
+ + ♻ ☆ Leveraging Large Language Models for Suicide Detection on Social Media + with Limited Labels + + +
+ The increasing frequency of suicidal thoughts highlights the importance of
+early detection and intervention. Social media platforms, where users often
+share personal experiences and seek help, could be utilized to identify
+individuals at risk. However, the large volume of daily posts makes manual
+review impractical. This paper explores the use of Large Language Models (LLMs)
+to automatically detect suicidal content in text-based social media posts. We
+propose a novel method for generating pseudo-labels for unlabeled data by
+prompting LLMs, along with traditional classification fine-tuning techniques to
+enhance label accuracy. To create a strong suicide detection model, we develop
+an ensemble approach involving prompting with Qwen2-72B-Instruct, and using
+fine-tuned models such as Llama3-8B, Llama3.1-8B, and Gemma2-9B. We evaluate
+our approach on the dataset of the Suicide Ideation Detection on Social Media
+Challenge, a track of the IEEE Big Data 2024 Big Data Cup. Additionally, we
+conduct a comprehensive analysis to assess the impact of different models and
+fine-tuning strategies on detection performance. Experimental results show that
+the ensemble model significantly improves detection accuracy, by 5 percentage
+points compared with the individual models. It achieves a weighted F1 score of
+0.770 on the public test set, and 0.731 on the private test set, providing a
+promising solution for identifying suicidal content in social media. Our
+analysis shows that the choice of LLMs affects the prompting performance, with
+larger models providing better accuracy. Our code and checkpoints are publicly
+available at https://github.com/khanhvynguyen/Suicide_Detection_LLMs.
+
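+
+ Both ideas in the abstract reduce to small building blocks: an LLM prompt
+that produces pseudo-labels for unlabeled posts, and a majority vote over the
+fine-tuned models. The prompt wording and hard-coded votes below are
+illustrative stand-ins for real model outputs:
+
+from collections import Counter
+
+def pseudo_label_prompt(post):
+    return ("Does the following post express suicidal ideation? "
+            f"Answer 'yes' or 'no'.\n\nPost: {post}")
+
+def majority_vote(predictions):
+    return Counter(predictions).most_common(1)[0][0]
+
+post = "I don't see the point in going on."
+print(pseudo_label_prompt(post))               # sent to Qwen2-72B-Instruct
+# e.g. votes from the fine-tuned Llama3-8B, Llama3.1-8B, Gemma2-9B models:
+print(majority_vote(["yes", "yes", "no"]))     # -> "yes"
+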
+
+ comment: Accepted at IEEE International Conference on Big Data 2024 +
+
+
+
+
+ + ♻ ☆ Parameter-Efficient Fine-Tuning in Large Models: A Survey of + Methodologies + + +
+ Large models, as predicted by scaling laws, have made
+groundbreaking progress in many fields, particularly in natural language
+generation tasks, where they have approached or even surpassed human levels.
+However, the unprecedented scale of their parameters brings significant
+computational and storage costs. These large models require substantial
+computational resources and GPU memory to operate. When adapting large models
+to specific downstream tasks, their massive parameter scale poses a significant
+challenge in fine-tuning on hardware platforms with limited computational power
+and GPU memory. To address this issue, Parameter-Efficient Fine-Tuning (PEFT)
+offers a practical solution by efficiently adjusting the parameters of large
+pre-trained models to suit various downstream tasks. Specifically, PEFT adjusts
+the parameters of pre-trained large models to adapt to specific tasks or
+domains, minimizing the introduction of additional parameters and the
+computational resources required. This review mainly introduces the preliminary
+knowledge of PEFT, the core ideas and principles of various PEFT algorithms,
+the applications of PEFT, and potential future research directions. By reading
+this review, we believe that interested parties can quickly grasp the PEFT
+methodology, thereby accelerating its development and innovation.
+
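+
+ One concrete PEFT instance helps fix the intuition: LoRA freezes the
+pre-trained weight W and learns a low-rank update BA, so only r*(in+out)
+parameters train instead of in*out. This generic sketch is for illustration,
+not any specific library's implementation:
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    def __init__(self, base, r=8, alpha=16.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():       # freeze pre-trained weights
+            p.requires_grad_(False)
+        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, r))
+        self.scale = alpha / r
+
+    def forward(self, x):
+        # frozen path plus scaled low-rank update
+        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
+
+layer = LoRALinear(nn.Linear(512, 512))
+trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+print(trainable)  # 8192 trainable vs. 262656 frozen base parameters
+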
+
+
+
+
+ + ♻ ☆ Channel-Wise Mixed-Precision Quantization for Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable success across a +wide range of language tasks, but their deployment on edge devices remains +challenging due to the substantial memory requirements imposed by their large +parameter sizes. Weight-only quantization presents a promising solution to +reduce the memory footprint of LLMs. However, existing approaches primarily +focus on integer-bit quantization, limiting their adaptability to +fractional-bit quantization tasks and preventing the full utilization of +available storage space on devices. In this paper, we introduce Channel-Wise +Mixed-Precision Quantization (CMPQ), a novel mixed-precision quantization +method that allocates quantization precision in a channel-wise pattern based on +activation distributions. By assigning different precision levels to different +weight channels, CMPQ can adapt to any bit-width constraint. CMPQ employs a +non-uniform quantization strategy and incorporates two outlier extraction +techniques that collaboratively preserve the critical information, thereby +minimizing the quantization loss. Experiments on different sizes of LLMs +demonstrate that CMPQ not only enhances performance in integer-bit quantization +tasks but also achieves significant performance gains with a modest increase in +memory usage. CMPQ thus represents an adaptive and effective approach to LLM +quantization, offering substantial benefits across diverse device capabilities. + +
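+
+ The channel-wise idea can be sketched simply: give more bits to weight
+channels whose activations are large (more quantization-sensitive) and fewer
+to the rest, while meeting an average (possibly fractional) bit budget. The
+salience rule and uniform per-channel quantizer below are illustrative
+assumptions, not CMPQ's exact algorithm:
+
+import numpy as np
+
+def allocate_bits(act_magnitude, avg_bits=3.5, low=2, high=4):
+    order = np.argsort(-act_magnitude)          # most salient channels first
+    bits = np.full(len(act_magnitude), low)
+    n_high = int(round((avg_bits - low) / (high - low) * len(bits)))
+    bits[order[:n_high]] = high
+    return bits
+
+def quantize_channel(w, n_bits):
+    scale = np.abs(w).max() / (2 ** (n_bits - 1) - 1)
+    return np.round(w / scale) * scale
+
+rng = np.random.default_rng(0)
+W = rng.normal(size=(8, 16))                    # 8 output channels
+act = np.abs(rng.normal(size=8)) * np.array([5, 1, 1, 1, 5, 1, 1, 1])
+bits = allocate_bits(act)
+W_q = np.stack([quantize_channel(W[i], bits[i]) for i in range(8)])
+print(bits, float(np.abs(W - W_q).mean()))
+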
+
+
+
+
+ + ♻ ☆ LLM-ESR: Large Language Models Enhancement for Long-tailed Sequential + Recommendation + + +
+ Sequential recommender systems (SRS) aim to predict users' subsequent choices +based on their historical interactions and have found applications in diverse +fields such as e-commerce and social media. However, in real-world systems, +most users interact with only a handful of items, while the majority of items +are seldom consumed. These two issues, known as the long-tail user and +long-tail item challenges, often pose difficulties for existing SRS. These +challenges can adversely affect user experience and seller benefits, making +them crucial to address. Though a few works have addressed the challenges, they +still struggle with the seesaw or noisy issues due to the intrinsic scarcity of +interactions. The advancements in large language models (LLMs) present a +promising solution to these problems from a semantic perspective. As one of the +pioneers in this field, we propose the Large Language Models Enhancement +framework for Sequential Recommendation (LLM-ESR). This framework utilizes +semantic embeddings derived from LLMs to enhance SRS without adding extra +inference load from LLMs. To address the long-tail item challenge, we design a +dual-view modeling framework that combines semantics from LLMs and +collaborative signals from conventional SRS. For the long-tail user challenge, +we propose a retrieval augmented self-distillation method to enhance user +preference representation using more informative interactions from similar +users. To verify the effectiveness and versatility of our proposed enhancement +framework, we conduct extensive experiments on three real-world datasets using +three popular SRS models. The results show that our method surpasses existing +baselines consistently, and benefits long-tail users and items especially. The +implementation code is available at +https://github.com/Applied-Machine-Learning-Lab/LLM-ESR. + +
+
+
 comment: accepted by NeurIPS'24 (Spotlight)
+
+
+
+
+ + ♻ ☆ The Re-Label Method For Data-Centric Machine Learning + + +
+ In industrial deep learning applications, manually labeled data contains a
+certain amount of noise. To solve this problem and achieve a score above 90 on
+the dev dataset, we present a simple method to find the noisy data and have
+humans re-label it, with model predictions provided as references during
+labeling. In this paper, we illustrate our idea for a broad set of deep
+learning tasks, including classification, sequence tagging, object detection,
+sequence generation, and click-through rate prediction. The dev dataset
+evaluation results and human evaluation results verify our idea.
+
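+
+ The re-label loop reduces to flagging samples where the model confidently
+disagrees with the human label and queuing them for review with the model's
+prediction as a reference. A minimal sketch; the confidence threshold is an
+illustrative assumption:
+
+def find_noisy(samples, threshold=0.9):
+    queue = []
+    for s in samples:
+        if s["pred_label"] != s["human_label"] and s["pred_conf"] >= threshold:
+            # model prediction shown to the annotator as a reference
+            queue.append({**s, "reference_for_annotator": s["pred_label"]})
+    return queue
+
+data = [
+    {"text": "great product", "human_label": "neg",
+     "pred_label": "pos", "pred_conf": 0.97},
+    {"text": "meh", "human_label": "neg",
+     "pred_label": "pos", "pred_conf": 0.55},
+]
+for item in find_noisy(data):
+    print("re-label:", item["text"], "->", item["reference_for_annotator"])
+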
+
+
+
+
+ + ♻ ☆ ProSwitch: Knowledge-Guided Instruction Tuning to Switch Between + Professional and Non-Professional Answers + + +
+ Large Language Models (LLMs) have demonstrated efficacy in various linguistic +applications, including text summarization and controlled text generation. +However, studies into their capacity of switching between styles via +instruction tuning remain underexplored. This study concentrates on the +style-switching abilities of LLMs and introduces a novel approach, named +ProSwitch, which enables a language model to switch between professional and +non-professional answers, by tuning and evaluating through the guidance of +domain and style knowledge. ProSwitch unfolds across three phases: +LLM-augmented preparation to collect domain knowledge and QA pairs, instruction +tuning to optimize LLMs with multiple levels of knowledge, and comprehensive +evaluation to assess both style discrimination and reference-based quality of +generated text. Comparative analysis of ProSwitch against general and +specialized LLMs reveals that our approach outperforms baselines in switching +between professional and non-professional answers. + +
+
+ comment: 8 pages main body, 16 pages total +
+
+
+
+
+ + ♻ ☆ Scaling Laws with Vocabulary: Larger Models Deserve Larger Vocabularies NeurIPS 2024 + + +
+ Research on scaling large language models (LLMs) has primarily focused on +model parameters and training data size, overlooking the role of vocabulary +size. We investigate how vocabulary size impacts LLM scaling laws by training +models ranging from 33M to 3B parameters on up to 500B characters with various +vocabulary configurations. We propose three complementary approaches for +predicting the compute-optimal vocabulary size: IsoFLOPs analysis, derivative +estimation, and parametric fit of the loss function. Our approaches converge on +the conclusion that the optimal vocabulary size depends on the compute budget, +with larger models requiring larger vocabularies. Most LLMs, however, use +insufficient vocabulary sizes. For example, we predict that the optimal +vocabulary size of Llama2-70B should have been at least 216K, 7 times larger +than its vocabulary of 32K. We validate our predictions empirically by training +models with 3B parameters across different FLOPs budgets. Adopting our +predicted optimal vocabulary size consistently improves downstream performance +over commonly used vocabulary sizes. By increasing the vocabulary size from the +conventional 32K to 43K, we improve performance on ARC-Challenge from 29.1 to +32.0 with the same 2.3e21 FLOPs. Our work highlights the importance of jointly +considering tokenization and model scaling for efficient pre-training. The code +and demo are available at https://github.com/sail-sg/scaling-with-vocab and +https://hf.co/spaces/sail/scaling-with-vocab-demo. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ MoA: Mixture of Sparse Attention for Automatic Large Language Model + Compression + + +
+ Sparse attention can effectively mitigate the significant memory and +throughput demands of Large Language Models (LLMs) in long contexts. Existing +methods typically employ a uniform sparse attention mask, applying the same +sparse pattern across different attention heads and input lengths. However, +this uniform approach fails to capture the diverse attention patterns inherent +in LLMs, ignoring their distinct accuracy-latency trade-offs. To address this +challenge, we propose the Mixture of Attention (MoA), which automatically +tailors distinct sparse attention configurations to different heads and layers. +MoA constructs and navigates a search space of various attention patterns and +their scaling rules relative to input sequence lengths. It profiles the model, +evaluates potential configurations, and pinpoints the optimal sparse attention +compression plan. MoA adapts to varying input sizes, revealing that some +attention heads expand their focus to accommodate longer sequences, while other +heads consistently concentrate on fixed-length local contexts. Experiments show +that MoA increases the effective context length by $3.9\times$ with the same +average attention span, boosting retrieval accuracy by $1.5-7.1\times$ over the +uniform-attention baseline across Vicuna-{7B,13B}, and Llama3-{8B,70B} models. +Moreover, MoA narrows the capability gaps between sparse and dense models, +reducing the maximum relative performance drop from $9\%-36\%$ to within $5\%$ +across two long-context understanding benchmarks. MoA achieves a +$1.2-1.4\times$ GPU memory reduction, boosting decode throughput by +$6.6-8.2\times$ and $1.7-1.9\times$ compared to FlashAttention2 and vLLM, with +minimal impact on performance. Our code is available at +\url{https://github.com/thu-nics/MoA}. + +
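+
+ A toy illustration of heterogeneous sparse attention: instead of one
+uniform mask, each head gets its own local window, and a head's window may
+grow with sequence length (its "scaling rule"). The window sizes and linear
+rule here are arbitrary; MoA searches for them per head and layer via
+profiling:
+
+import torch
+
+def banded_mask(seq_len, window):
+    # True where attention is allowed (causal part omitted for brevity)
+    i = torch.arange(seq_len)
+    return (i[:, None] - i[None, :]).abs() <= window
+
+def heads_masks(seq_len, base_windows, length_slope=0.0):
+    return [banded_mask(seq_len, int(w + length_slope * seq_len))
+            for w in base_windows]
+
+masks = heads_masks(seq_len=16, base_windows=[2, 4, 8], length_slope=0.25)
+for h, m in enumerate(masks):
+    density = m.float().mean().item()
+    print(f"head {h}: attention density {density:.2f}")
+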
+
+
+
+
+ + ♻ ☆ CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the + Mathematics Reasoning of Large Multimodal Models + + +
+ Large language models (LLMs) have obtained promising results in mathematical
+reasoning, which is a foundational skill for human intelligence. Most previous
+studies focus on improving and measuring the performance of LLMs based on
+textual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few
+researchers have released English multimodal math datasets (e.g., MATHVISTA and
+MATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In
+this paper, we release a Chinese multimodal math (CMM-Math) dataset, including
+benchmark and training parts, to evaluate and enhance the mathematical
+reasoning of LMMs. CMM-Math contains over 28,000 high-quality samples,
+featuring a variety of problem types (e.g., multiple-choice, fill-in-the-blank,
+and so on) with detailed solutions across 12 grade levels from elementary to
+high school in China. Specifically, the visual context may be present in the
+questions or options, which makes this dataset more challenging. Through
+comprehensive analysis, we discover that state-of-the-art LMMs on the CMM-Math
+dataset face challenges, emphasizing the necessity for further improvements in
+LMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to
+handle the problems with mixed input of multiple images and text segments. We
+train our model using three stages, including foundational pre-training,
+foundational fine-tuning, and mathematical fine-tuning. The extensive
+experiments indicate that our model effectively improves math reasoning
+performance by comparing it with the SOTA LMMs over three multimodal
+mathematical datasets.
+
+
+
+
+
+ + ♻ ☆ Plurals: A System for Guiding LLMs Via Simulated Social Ensembles + + +
+ Recent debates raised concerns that language models may favor certain +viewpoints. But what if the solution is not to aim for a 'view from nowhere' +but rather to leverage different viewpoints? We introduce Plurals, a system and +Python library for pluralistic AI deliberation. Plurals consists of Agents +(LLMs, optionally with personas) which deliberate within customizable +Structures, with Moderators overseeing deliberation. Plurals is a generator of +simulated social ensembles. Plurals integrates with government datasets to +create nationally representative personas, includes deliberation templates +inspired by democratic deliberation theory, and allows users to customize both +information-sharing structures and deliberation behavior within Structures. Six +case studies demonstrate fidelity to theoretical constructs and efficacy. Three +randomized experiments show simulated focus groups produced output resonant +with an online sample of the relevant audiences (chosen over zero-shot +generation in 75% of trials). Plurals is both a paradigm and a concrete system +for pluralistic AI. The Plurals library is available at +https://github.com/josh-ashkinaze/plurals and will be continually updated. + +
+
+
+
+
+ + ♻ ☆ Multi-environment Topic Models + + +
+ Probabilistic topic models are a powerful tool for extracting latent themes
+from large text datasets. In many text datasets, we also observe per-document
+covariates (e.g., source, style, political affiliation) that act as
+environments that modulate a "global" (environment-agnostic) topic
+representation. Accurately learning these representations is important for
+prediction on new documents in unseen environments and for estimating the
+causal effect of topics on real-world outcomes. To this end, we introduce the
+Multi-environment Topic Model (MTM), an unsupervised probabilistic model that
+separates global and environment-specific terms. Through experimentation on
+various political content, from ads to tweets and speeches, we show that the
+MTM produces interpretable global topics with distinct environment-specific
+words. On multi-environment data, the MTM outperforms strong baselines both in-
+and out-of-distribution. It also enables the discovery of accurate causal
+effects.
+
+
+
+
+
+ + ♻ ☆ Is your benchmark truly adversarial? AdvScore: Evaluating Human-Grounded + Adversarialness + + +
+ Adversarial datasets should ensure AI robustness that matches human +performance. However, as models evolve, datasets can become obsolete. Thus, +adversarial datasets should be periodically updated based on their degradation +in adversarialness. Given the lack of a standardized metric for measuring +adversarialness, we propose AdvScore, a human-grounded evaluation metric. +AdvScore assesses a dataset's true adversarialness by capturing models' and +humans' varying abilities, while also identifying poor examples. AdvScore then +motivates a new dataset creation pipeline for realistic and high-quality +adversarial samples, enabling us to collect an adversarial question answering +(QA) dataset, AdvQA. We apply AdvScore using 9,347 human responses and ten +language model predictions to track the models' improvement over five years +(from 2020 to 2024). AdvScore assesses whether adversarial datasets remain +suitable for model evaluation, measures model improvements, and provides +guidance for better alignment with human capabilities. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2401.11185 +
+
+
+
+
+ + ♻ ☆ Table Transformers for Imputing Textual Attributes + + +
+ Missing data in tabular datasets is a common issue, as the performance of
+downstream tasks usually depends on the completeness of the training dataset.
+Previous missing data imputation methods focus on numeric and categorical
+columns, but we propose a novel end-to-end approach called Table Transformers
+for Imputing Textual Attributes (TTITA) based on the transformer to impute
+unstructured textual columns using other columns in the table. We conduct
+extensive experiments on three datasets, and our approach shows competitive
+performance outperforming baseline models such as recurrent neural networks and
+Llama2. The performance improvement is more significant when the target
+sequence has a longer length. Additionally, we incorporate multi-task learning
+to simultaneously impute for heterogeneous columns, boosting the performance
+for text imputation. We also qualitatively compare with ChatGPT for realistic
+applications.
+
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 46 + +
+
+
+ + ♻ ☆ VascX Models: Model Ensembles for Retinal Vascular Analysis from Color + Fundus Images + + +
+ We introduce VascX models, a comprehensive set of model ensembles for
+analyzing retinal vasculature from color fundus images (CFIs). Annotated CFIs
+were aggregated from public datasets. Additional CFIs, mainly from the
+population-based Rotterdam Study, were annotated by graders for arteries and
+veins at pixel level, resulting in a dataset diverse in patient demographics
+and imaging conditions. VascX models demonstrated superior segmentation
+performance across datasets, image quality levels, and anatomic regions when
+compared to existing, publicly available models, likely due to the increased
+size and variety of our training set. Important improvements were observed in
+artery-vein and disc segmentation performance, particularly in segmentations of
+these structures on CFIs of intermediate quality, common in large cohorts and
+clinical datasets. Importantly, these improvements translated into
+significantly more accurate vascular features when we compared features
+extracted from VascX segmentation masks with features extracted from
+segmentation masks generated by previous models. With VascX models we provide a
+robust, ready-to-use set of model ensembles and inference code aimed at
+simplifying the implementation and enhancing the quality of automated retinal
+vasculature analyses. The precise vessel parameters generated by the model can
+serve as starting points for the identification of disease patterns in and
+outside of the eye.
+
+
+
+
+
+ + ♻ ☆ DELTA: Dense Efficient Long-range 3D Tracking for any video + + +
+ Tracking dense 3D motion from monocular videos remains challenging, +particularly when aiming for pixel-level precision over long sequences. We +introduce DELTA, a novel method that efficiently tracks every pixel in 3D +space, enabling accurate motion estimation across entire videos. Our approach +leverages a joint global-local attention mechanism for reduced-resolution +tracking, followed by a transformer-based upsampler to achieve high-resolution +predictions. Unlike existing methods, which are limited by computational +inefficiency or sparse tracking, DELTA delivers dense 3D tracking at scale, +running over 8x faster than previous methods while achieving state-of-the-art +accuracy. Furthermore, we explore the impact of depth representation on +tracking performance and identify log-depth as the optimal choice. Extensive +experiments demonstrate the superiority of DELTA on multiple benchmarks, +achieving new state-of-the-art results in both 2D and 3D dense tracking tasks. +Our method provides a robust solution for applications requiring fine-grained, +long-term motion tracking in 3D space. + +
+
+ comment: Project Page: https://snap-research.github.io/DELTA/ +
+
+
+
+
+ + ♻ ☆ BehAVE: Behaviour Alignment of Video Game Encodings + + +
+ Domain randomisation enhances the transferability of vision models across +visually distinct domains with similar content. However, current methods +heavily depend on intricate simulation engines, hampering feasibility and +scalability. This paper introduces BehAVE, a video understanding framework that +utilises existing commercial video games for domain randomisation without +accessing their simulation engines. BehAVE taps into the visual diversity of +video games for randomisation and uses textual descriptions of player actions +to align videos with similar content. We evaluate BehAVE across 25 first-person +shooter (FPS) games using various video and text foundation models, +demonstrating its robustness in domain randomisation. BehAVE effectively aligns +player behavioural patterns and achieves zero-shot transfer to multiple unseen +FPS games when trained on just one game. In a more challenging scenario, BehAVE +enhances the zero-shot transferability of foundation models to unseen FPS +games, even when trained on a game of a different genre, with improvements of +up to 22%. BehAVE is available online at https://github.com/nrasajski/BehAVE. + +
+
+
+
+
+ + ♻ ☆ Aligning Motion-Blurred Images Using Contrastive Learning on + Overcomplete Pixels + + +
+ We propose a new contrastive objective for learning overcomplete pixel-level +features that are invariant to motion blur. Other invariances (e.g., pose, +illumination, or weather) can be learned by applying the corresponding +transformations on unlabeled images during self-supervised training. We +showcase that a simple U-Net trained with our objective can produce local +features useful for aligning the frames of an unseen video captured with a +moving camera under realistic and challenging conditions. Using a carefully +designed toy example, we also show that the overcomplete pixels can encode the +identity of objects in an image and the pixel coordinates relative to these +objects. + +
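+ A minimal sketch of such a pixel-level contrastive objective (an assumed
+InfoNCE form with random location sampling; the paper's exact loss may
+differ): features at the same location in a clean view and its motion-blurred
+view form the positive pair, and other locations serve as negatives.
+
+    import torch
+    import torch.nn.functional as F
+
+    def pixel_info_nce(f_clean, f_blur, n_pairs=256, tau=0.1):
+        """InfoNCE over matching pixel locations of two views of one image.
+        f_clean, f_blur: (B, C, H, W) per-pixel feature maps from the same
+        network applied to a clean image and its motion-blurred copy."""
+        B, C, H, W = f_clean.shape
+        idx = torch.randint(0, H * W, (n_pairs,))
+        a = f_clean.flatten(2)[:, :, idx].permute(0, 2, 1)   # (B, n_pairs, C)
+        b = f_blur.flatten(2)[:, :, idx].permute(0, 2, 1)
+        a, b = F.normalize(a, dim=-1), F.normalize(b, dim=-1)
+        logits = torch.einsum('bic,bjc->bij', a, b) / tau    # pairwise similarities
+        target = torch.arange(n_pairs).expand(B, -1)         # positives on diagonal
+        return F.cross_entropy(logits.flatten(0, 1), target.flatten())
+
+    loss = pixel_info_nce(torch.randn(2, 64, 32, 32), torch.randn(2, 64, 32, 32))
+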
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ GeoSplatting: Towards Geometry Guided Gaussian Splatting for + Physically-based Inverse Rendering + + +
+ We consider the problem of physically-based inverse rendering using 3D +Gaussian Splatting (3DGS) representations. While recent 3DGS methods have +achieved remarkable results in novel view synthesis (NVS), accurately capturing +high-fidelity geometry, physically interpretable materials and lighting remains +challenging, as it requires precise geometry modeling to provide accurate +surface normals, along with physically-based rendering (PBR) techniques to +ensure correct material and lighting disentanglement. Previous 3DGS methods +resort to approximating surface normals, but often struggle with noisy local +geometry, leading to inaccurate normal estimation and suboptimal +material-lighting decomposition. In this paper, we introduce GeoSplatting, a +novel hybrid representation that augments 3DGS with explicit geometric guidance +and differentiable PBR equations. Specifically, we bridge isosurface and 3DGS +together, where we first extract isosurface mesh from a scalar field, then +convert it into 3DGS points and formulate PBR equations for them in a fully +differentiable manner. In GeoSplatting, 3DGS is grounded on the mesh geometry, +enabling precise surface normal modeling, which facilitates the use of PBR +frameworks for material decomposition. This approach further maintains the +efficiency and quality of NVS from 3DGS while ensuring accurate geometry from +the isosurface. Comprehensive evaluations across diverse datasets demonstrate +the superiority of GeoSplatting, consistently outperforming existing methods +both quantitatively and qualitatively. + +
+
+ comment: Project page: https://pku-vcl-geometry.github.io/GeoSplatting/ +
+
+
+
+
+ + ♻ ☆ HENASY: Learning to Assemble Scene-Entities for Egocentric + Video-Language Model NeurIPS 2024 + + +
+ Current video-language models (VLMs) rely extensively on instance-level
+alignment between video and language modalities, which presents two major
+limitations: (1) visual reasoning departs from the natural first-person
+perception of humans, leading to a lack of reasoning interpretability; and (2)
+learning is limited in capturing inherent fine-grained relationships between
+the two modalities.
+ In this paper, we take inspiration from human perception and explore a
+compositional approach for egocentric video representation. We introduce HENASY
+(Hierarchical ENtities ASsemblY), which includes a spatiotemporal token
+grouping mechanism to explicitly assemble dynamically evolving scene entities
+through time and model their relationship for video representation. By
+leveraging compositional structure understanding, HENASY possesses strong
+interpretability via visual grounding with free-form text queries. We further
+explore a suite of multi-grained contrastive losses to facilitate
+entity-centric understanding. This comprises three alignment types:
+video-narration, noun-entity, and verb-entity alignments.
+ Our method demonstrates strong interpretability in both quantitative and
+qualitative experiments, while maintaining competitive performance on five
+downstream tasks via zero-shot transfer or as video/text representation,
+including video/text retrieval, action recognition, multi-choice query, natural
+language query, and moments query.
+
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ CaptainCook4D: A Dataset for Understanding Errors in Procedural + Activities + + +
+ Following step-by-step procedures is an essential component of various
+activities carried out by individuals in their daily lives. These procedures
+serve as a guiding framework that helps to achieve goals efficiently, whether
+it is assembling furniture or preparing a recipe. However, the complexity and
+duration of procedural activities inherently increase the likelihood of making
+errors. Understanding such procedural activities from a sequence of frames is a
+challenging task that demands an accurate interpretation of visual information
+and the ability to reason about the structure of the activity. To this end, we
+collect a new egocentric 4D dataset, CaptainCook4D, comprising 384 recordings
+(94.5 hours) of people performing recipes in real kitchen environments. This
+dataset consists of two distinct types of activity: one in which participants
+adhere to the provided recipe instructions and another in which they deviate
+and induce errors. We provide 5.3K step annotations and 10K fine-grained action
+annotations and benchmark the dataset for the following tasks: supervised error
+recognition, multistep localization, and procedure learning.
+
+
+ comment: Accepted to the 2024 Neural Information Processing Systems Datasets + and Benchmarks Track, Project Page: + https://captaincook4d.github.io/captain-cook/ +
+
+
+
+
+ + ♻ ☆ Comparing YOLO11 and YOLOv8 for instance segmentation of occluded and + non-occluded immature green fruits in complex orchard environment + + +
+ This study conducted a comprehensive performance evaluation of YOLO11 and
+YOLOv8, the latest in the "You Only Look Once" (YOLO) series, focusing on their
+instance segmentation capabilities for immature green apples in orchard
+environments. YOLO11n-seg achieved the highest mask precision across all
+categories with a notable score of 0.831, highlighting its effectiveness in
+fruit detection. YOLO11m-seg and YOLO11l-seg excelled in non-occluded and
+occluded fruitlet segmentation with scores of 0.851 and 0.829, respectively.
+Additionally, YOLO11x-seg led in mask recall for all categories, achieving a
+score of 0.815, with YOLO11m-seg performing best for non-occluded immature
+green fruitlets at 0.858 and YOLOv8x-seg leading the occluded category with
+0.800. In terms of mean average precision at a 50% intersection over union
+(mAP@50), YOLO11m-seg consistently outperformed its counterparts, registering
+the highest scores for both box and mask segmentation, at 0.876 and 0.860 for
+the "All" class and 0.908 and 0.909 for non-occluded immature fruitlets,
+respectively. YOLO11l-seg and YOLOv8l-seg shared the top box mAP@50 for
+occluded immature fruitlets at 0.847, while YOLO11m-seg achieved the highest
+mask mAP@50 of 0.810. Despite the advancements in YOLO11, YOLOv8n surpassed its
+counterparts in image processing speed, with an impressive inference speed of
+3.3 milliseconds, compared to the fastest YOLO11 series model at 4.8
+milliseconds, underscoring its suitability for real-time agricultural
+applications related to complex green fruit environments.
+
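+ Reproducing this kind of head-to-head comparison is straightforward with the
+Ultralytics API; a sketch (the dataset YAML is a placeholder for your own
+orchard annotations, and the checkpoint names follow Ultralytics' released
+weights):
+
+    # pip install ultralytics
+    from ultralytics import YOLO
+
+    for name in ["yolo11n-seg.pt", "yolo11m-seg.pt", "yolov8n-seg.pt"]:
+        model = YOLO(name)                                      # fetches checkpoint
+        metrics = model.val(data="immature_green_apples.yaml")  # hypothetical YAML
+        print(name, "mask mAP@50:", metrics.seg.map50)
+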
+
+ comment: 16 Pages, 10 Figures, 3 Tables +
+
+
+
+
+ + ♻ ☆ Digital Twins in Additive Manufacturing: A Systematic Review + + +
+ Digital Twins (DTs) are becoming popular in Additive Manufacturing (AM) due +to their ability to create virtual replicas of physical components of AM +machines, which helps in real-time production monitoring. Advanced techniques +such as Machine Learning (ML), Augmented Reality (AR), and simulation-based +models play key roles in developing intelligent and adaptable DTs in +manufacturing processes. However, questions remain regarding scalability, the +integration of high-quality data, and the computational power required for +real-time applications in developing DTs. Understanding the current state of +DTs in AM is essential to address these challenges and fully utilize their +potential in advancing AM processes. Considering this opportunity, this work +aims to provide a comprehensive overview of DTs in AM by addressing the +following four research questions: (1) What are the key types of DTs used in AM +and their specific applications? (2) What are the recent developments and +implementations of DTs? (3) How are DTs employed in process improvement and +hybrid manufacturing? (4) How are DTs integrated with Industry 4.0 +technologies? By discussing current applications and techniques, we aim to +offer a better understanding and potential future research directions for +researchers and practitioners in AM and DTs. + +
+
+
+
+
+ + ♻ ☆ A survey on deep learning in medical image registration: new + technologies, uncertainty, evaluation metrics, and beyond + + +
+ Deep learning technologies have dramatically reshaped the field of medical +image registration over the past decade. The initial developments, such as +regression-based and U-Net-based networks, established the foundation for deep +learning in image registration. Subsequent progress has been made in various +aspects of deep learning-based registration, including similarity measures, +deformation regularizations, network architectures, and uncertainty estimation. +These advancements have not only enriched the field of image registration but +have also facilitated its application in a wide range of tasks, including atlas +construction, multi-atlas segmentation, motion estimation, and 2D-3D +registration. In this paper, we present a comprehensive overview of the most +recent advancements in deep learning-based image registration. We begin with a +concise introduction to the core concepts of deep learning-based image +registration. Then, we delve into innovative network architectures, loss +functions specific to registration, and methods for estimating registration +uncertainty. Additionally, this paper explores appropriate evaluation metrics +for assessing the performance of deep learning models in registration tasks. +Finally, we highlight the practical applications of these novel techniques in +medical imaging and discuss the future prospects of deep learning-based image +registration. + +
+
+ comment: Accepted to Medical Image Analysis ((c) MedIA). A list of + open-sourced code from the papers reviewed has been organized and is + available at https://bit.ly/3QgFJ9z +
+
+
+
+
+ + ♻ ☆ DenoiseRep: Denoising Model for Representation Learning NeurIPS 2024 + + +
+ The denoising model has been proven a powerful generative model but has seen
+little exploration in discriminative tasks. Representation learning is
+important in discriminative tasks, which is defined as "learning
+representations (or features) of the data that make it easier to extract useful
+information when building classifiers or other predictors". In this paper, we
+propose a novel Denoising Model for Representation Learning (DenoiseRep) to
+improve feature discrimination with joint feature extraction and denoising.
+DenoiseRep views each embedding layer in a backbone as a denoising layer,
+processing the cascaded embedding layers as if recursively denoising features
+step-by-step. This unifies the frameworks of feature extraction and denoising,
+where the former progressively embeds features from low-level to high-level,
+and the latter recursively denoises features step-by-step. After that,
+DenoiseRep fuses the parameters of the feature extraction and denoising
+layers, and theoretically demonstrates its equivalence before and after the
+fusion, thus making feature denoising computation-free. DenoiseRep is a
+label-free algorithm that incrementally improves features and is also
+complementary to labels when available. Experimental results on various
+discriminative vision tasks, including re-identification (Market-1501,
+DukeMTMC-reID, MSMT17, CUHK-03, VehicleID), image classification (ImageNet,
+CUB200, Oxford-Pet, Flowers), object detection (COCO), and image segmentation
+(ADE20K) show stable and impressive improvements. We also validate its
+effectiveness on CNN (ResNet) and Transformer (ViT, Swin, VMamba)
+architectures.
+
+
+ comment: Accepted by NeurIPS 2024, Oral
+
+
+
+
+ + ♻ ☆ Return of Unconditional Generation: A Self-supervised Representation + Generation Method + + +
+ Unconditional generation -- the problem of modeling data distribution without
+relying on human-annotated labels -- is a long-standing and fundamental
+challenge in generative models, creating the potential to learn from
+large-scale unlabeled data. In the literature, the generation quality of an
+unconditional method has been much worse than that of its conditional
+counterpart. This gap can be attributed to the lack of semantic information
+provided by labels. In this work, we show that one can close this gap by
+generating semantic representations in the representation space produced by a
+self-supervised encoder. These representations can be used to condition the
+image generator. This framework, called Representation-Conditioned Generation
+(RCG), provides an effective solution to the unconditional generation problem
+without using labels. Through comprehensive experiments, we observe that RCG
+significantly improves unconditional generation quality: e.g., it achieves a
+new state-of-the-art FID of 2.15 on ImageNet 256x256, largely reducing the
+previous best of 5.91 by a relative 64%. Our unconditional results are situated
+in the same tier as the leading class-conditional ones. We hope these
+encouraging observations will attract the community's attention to the
+fundamental problem of unconditional generation. Code is available at
+https://github.com/LTH14/rcg.
+
+
+ comment: NeurIPS 2024 (Oral)
+
+
+
+
+ + ♻ ☆ Erasing Self-Supervised Learning Backdoor by Cluster Activation Masking + + +
+ Self-Supervised Learning (SSL) is an effective paradigm for learning
+representations from unlabeled data, such as text, images, and videos. However,
+researchers have recently found that SSL is vulnerable to backdoor attacks. The
+attacker can embed hidden SSL backdoors via a few poisoned examples in the
+training dataset and maliciously manipulate the behavior of downstream models.
+To defend against SSL backdoor attacks, a feasible route is to detect and
+remove the poisonous samples in the training set. However, existing SSL
+backdoor defense methods fail to detect the poisonous samples precisely. In
+this paper, we propose PoisonCAM, a novel method that erases the SSL backdoor
+by cluster activation masking. After obtaining the threat model trained on the
+poisoned dataset, our method can precisely detect poisonous samples based on
+the assumption that masking the backdoor trigger can effectively change the
+activation of a downstream clustering model. In experiments, our PoisonCAM
+achieves 96% accuracy for backdoor trigger detection, compared to 3% for the
+state-of-the-art method on poisoned ImageNet-100. Moreover, our proposed
+PoisonCAM significantly improves the performance of the trained SSL model under
+backdoor attacks compared to the state-of-the-art method. Our code, data, and
+trained models will be released once this paper is accepted.
+
+
+
+
+
+ + ♻ ☆ Autoregressive Image Generation without Vector Quantization + + +
+ Conventional wisdom holds that autoregressive models for image generation are +typically accompanied by vector-quantized tokens. We observe that while a +discrete-valued space can facilitate representing a categorical distribution, +it is not a necessity for autoregressive modeling. In this work, we propose to +model the per-token probability distribution using a diffusion procedure, which +allows us to apply autoregressive models in a continuous-valued space. Rather +than using categorical cross-entropy loss, we define a Diffusion Loss function +to model the per-token probability. This approach eliminates the need for +discrete-valued tokenizers. We evaluate its effectiveness across a wide range +of cases, including standard autoregressive models and generalized masked +autoregressive (MAR) variants. By removing vector quantization, our image +generator achieves strong results while enjoying the speed advantage of +sequence modeling. We hope this work will motivate the use of autoregressive +generation in other continuous-valued domains and applications. Code is +available at: https://github.com/LTH14/mar. + +
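+ The core idea -- swapping categorical cross-entropy for a per-token denoising
+objective on continuous tokens -- can be sketched as a simplified DDPM-style
+loss (the MLP size, noise schedule, and conditioning below are illustrative,
+not the paper's exact design):
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class DiffusionLoss(nn.Module):
+        """A small MLP predicts the noise added to a continuous token,
+        conditioned on the autoregressive backbone's output z."""
+        def __init__(self, token_dim, cond_dim, T=1000):
+            super().__init__()
+            self.T = T
+            betas = torch.linspace(1e-4, 0.02, T)
+            self.register_buffer('abar', torch.cumprod(1.0 - betas, dim=0))
+            self.net = nn.Sequential(
+                nn.Linear(token_dim + cond_dim + 1, 256), nn.SiLU(),
+                nn.Linear(256, 256), nn.SiLU(),
+                nn.Linear(256, token_dim))
+
+        def forward(self, x0, z):
+            t = torch.randint(0, self.T, (x0.size(0),), device=x0.device)
+            a = self.abar[t].unsqueeze(-1)
+            eps = torch.randn_like(x0)
+            xt = a.sqrt() * x0 + (1 - a).sqrt() * eps    # forward diffusion
+            t_feat = (t.float() / self.T).unsqueeze(-1)  # crude timestep embedding
+            pred = self.net(torch.cat([xt, z, t_feat], dim=-1))
+            return F.mse_loss(pred, eps)                 # noise-prediction loss
+
+    loss_fn = DiffusionLoss(token_dim=16, cond_dim=32)
+    loss = loss_fn(torch.randn(8, 16), torch.randn(8, 32))
+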
+
+ comment: NeurIPS 2024 (Spotlight). Code: https://github.com/LTH14/mar
+
+
+
+
+ + ♻ ☆ Disentangling spatio-temporal knowledge for weakly supervised object + detection and segmentation in surgical video WACV + + +
+ Weakly supervised video object segmentation (WSVOS) enables the
+identification of segmentation maps without requiring an extensive training
+dataset of object masks, relying instead on coarse video labels indicating
+object presence. Current state-of-the-art methods either require multiple
+independent stages of processing that employ motion cues or, in the case of
+end-to-end trainable networks, fall short in segmentation accuracy, in part due
+to the difficulty of learning segmentation maps from videos with transient
+object presence. This limits the application of WSVOS for semantic annotation
+of surgical videos where multiple surgical tools frequently move in and out of
+the field of view, a problem that is more difficult than is typically
+encountered in WSVOS. This paper introduces Video Spatio-Temporal
+Disentanglement Networks (VDST-Net), a framework to disentangle spatiotemporal
+information using semi-decoupled knowledge distillation to predict high-quality
+class activation maps (CAMs). A teacher network designed to resolve temporal
+conflicts when specifics about object location and timing in the video are not
+provided works with a student network that integrates information over time by
+leveraging temporal dependencies. We demonstrate the efficacy of our framework
+on a public reference dataset and on a more challenging surgical video dataset
+where objects are, on average, present in less than 60% of annotated frames.
+Our method outperforms state-of-the-art techniques and generates superior
+segmentation masks under video-level weak supervision.
+
+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) +
+
+
+
+
+ + ♻ ☆ ConvBKI: Real-Time Probabilistic Semantic Mapping Network with + Quantifiable Uncertainty + + +
+ In this paper, we develop a modular neural network for real-time (>10 Hz)
+semantic mapping in uncertain environments, which explicitly updates per-voxel
+probabilistic distributions within a neural network layer. Our approach
+combines the reliability of classical probabilistic algorithms with the
+performance and efficiency of modern neural networks. Although robotic
+perception is often divided between modern differentiable methods and classical
+explicit methods, a union of both is necessary for real-time and trustworthy
+performance. We introduce a novel Convolutional Bayesian Kernel Inference
+(ConvBKI) layer which incorporates semantic segmentation predictions online
+into a 3D map through a depthwise convolution layer by leveraging conjugate
+priors. We compare ConvBKI against state-of-the-art deep learning approaches
+and probabilistic algorithms for mapping to evaluate reliability and
+performance. We also create a Robot Operating System (ROS) package of ConvBKI
+and test it on real-world perceptually challenging off-road driving data.
+
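+ The conjugate-prior update at the heart of this design can be sketched as a
+Dirichlet concentration update applied through a depthwise convolution (shown
+on a 2D grid with a fixed kernel for brevity; the actual layer operates on 3D
+voxel maps with learned kernels):
+
+    import torch
+    import torch.nn.functional as F
+
+    def convbki_update(alpha, probs, kernel):
+        """alpha:  (C, H, W) Dirichlet concentrations per class per cell
+        probs:  (C, H, W) semantic probabilities from the current frame
+        kernel: (C, 1, k, k) depthwise spatial kernel, one filter per class.
+        Conjugacy: posterior concentration = prior + kernel-weighted evidence."""
+        evidence = F.conv2d(probs.unsqueeze(0), kernel,
+                            padding=kernel.shape[-1] // 2, groups=probs.shape[0])
+        return alpha + evidence.squeeze(0)
+
+    C, H, W, k = 4, 16, 16, 3
+    alpha = torch.ones(C, H, W)                       # uniform prior
+    kernel = torch.ones(C, 1, k, k) / (k * k)
+    probs = torch.softmax(torch.randn(C, H, W), dim=0)
+    alpha = convbki_update(alpha, probs, kernel)
+    expected = alpha / alpha.sum(0, keepdim=True)     # posterior class probabilities
+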
+
+ comment: arXiv admin note: text overlap with arXiv:2209.10663 +
+
+
+
+
+ + ♻ ☆ Kuro Siwo: 33 billion $m^2$ under the water. A global multi-temporal + satellite dataset for rapid flood mapping NeurIPS 2024 + + +
+ Global floods, exacerbated by climate change, pose severe threats to human +life, infrastructure, and the environment. Recent catastrophic events in +Pakistan and New Zealand underscore the urgent need for precise flood mapping +to guide restoration efforts, understand vulnerabilities, and prepare for +future occurrences. While Synthetic Aperture Radar (SAR) remote sensing offers +day-and-night, all-weather imaging capabilities, its application in deep +learning for flood segmentation is limited by the lack of large annotated +datasets. To address this, we introduce Kuro Siwo, a manually annotated +multi-temporal dataset, spanning 43 flood events globally. Our dataset maps +more than 338 billion $m^2$ of land, with 33 billion designated as either +flooded areas or permanent water bodies. Kuro Siwo includes a highly processed +product optimized for flood mapping based on SAR Ground Range Detected, and a +primal SAR Single Look Complex product with minimal preprocessing, designed to +promote research on the exploitation of both the phase and amplitude +information and to offer maximum flexibility for downstream task preprocessing. +To leverage advances in large scale self-supervised pretraining methods for +remote sensing data, we augment Kuro Siwo with a large unlabeled set of SAR +samples. Finally, we provide an extensive benchmark, namely BlackBench, +offering strong baselines for a diverse set of flood events from Europe, +America, Africa, Asia and Australia. + +
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems are crucial for cartography, disaster +surveillance, and resource administration. Nonetheless, they encounter +considerable obstacles in the processing and transmission of extensive data, +especially in specialized domains such as precision agriculture and real-time +disaster response. Earth observation satellites, outfitted with remote sensing +technology, gather data from onboard sensors and IoT-enabled terrestrial +objects, delivering important information remotely. Domain-adapted Large +Language Models (LLMs) provide a solution by enabling the integration of raw +and processed EO data. Through domain adaptation, LLMs improve the assimilation +and analysis of many data sources, tackling the intricacies of specialized +datasets in agriculture and disaster response. This data synthesis, directed by +LLMs, enhances the precision and pertinence of conveyed information. This study +provides a thorough examination of using semantic inference and deep learning +for sophisticated EO systems. It presents an innovative architecture for +semantic communication in EO satellite networks, designed to improve data +transmission efficiency using semantic processing methodologies. Recent +advancements in onboard processing technologies enable dependable, adaptable, +and energy-efficient data management in orbit. These improvements guarantee +reliable performance in adverse space circumstances using radiation-hardened +and reconfigurable technology. Collectively, these advancements enable +next-generation satellite missions with improved processing capabilities, +crucial for operational flexibility and real-time decision-making in 6G +satellite communication. + +
+
+ comment: 17 pages, 7 figures, Journal +
+
+
+
+
+ + ♻ ☆ Video Diffusion Models are Training-free Motion Interpreter and + Controller + + +
+ Video generation primarily aims to model authentic and customized motion
+across frames, making understanding and controlling the motion a crucial topic.
+Most diffusion-based studies on video motion focus on motion customization with
+training-based paradigms, which, however, demand substantial training
+resources and necessitate retraining for diverse models. Crucially, these
+approaches do not explore how video diffusion models encode cross-frame motion
+information in their features, lacking interpretability and transparency in
+their effectiveness. To address this gap, this paper introduces a novel
+perspective to understand, localize, and manipulate motion-aware features in
+video diffusion models. Through analysis using Principal Component Analysis
+(PCA), our work discloses that robust motion-aware features already exist in
+video diffusion models. We present a new MOtion FeaTure (MOFT) by eliminating
+content correlation information and filtering motion channels. MOFT provides a
+distinct set of benefits, including the ability to encode comprehensive motion
+information with clear interpretability, extraction without the need for
+training, and generalizability across diverse architectures. Leveraging MOFT,
+we propose a novel training-free video motion control framework. Our method
+demonstrates competitive performance in generating natural and faithful motion,
+providing architecture-agnostic insights and applicability in a variety of
+downstream tasks.
+
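+ A loose sketch of the two operations named above -- removing content
+correlation, then isolating motion directions with PCA (the paper's exact
+channel-filtering procedure differs; shapes are illustrative):
+
+    import torch
+
+    def motion_feature(feats, q=2):
+        """feats: (T, C, H, W) intermediate diffusion features for T frames.
+        Subtracting the temporal mean suppresses content shared across frames;
+        PCA on the residual keeps the directions that vary with motion."""
+        T, C, H, W = feats.shape
+        x = feats.permute(2, 3, 0, 1).reshape(H * W, T, C)
+        x = x - x.mean(dim=1, keepdim=True)        # drop content correlation
+        U, S, V = torch.pca_lowrank(x, q=q)        # per-location PCA over (T, C)
+        return x @ V                               # (H*W, T, q) motion coordinates
+
+    moft = motion_feature(torch.randn(8, 64, 16, 16))
+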
+
+ comment: Project Page: https://xizaoqu.github.io/moft/ +
+
+
+
+
+ + ♻ ☆ Improving Generalization in Visual Reasoning via Self-Ensemble + + +
+ The cognitive faculty of visual reasoning necessitates the integration of
+multimodal perceptual processing and commonsense and external knowledge of the
+world. In recent years, a plethora of large vision-language models (LVLMs) have
+been proposed, demonstrating outstanding power and exceptional proficiency in
+commonsense reasoning across diverse domains and tasks. Nevertheless, training
+such LVLMs requires costly resources. Recent approaches, instead of training
+LVLMs from scratch on various large datasets, focus on exploring ways to take
+advantage of the capabilities of many different LVLMs, such as ensemble
+methods. In this work, we propose self-ensemble, a novel training-free method
+that improves the generalization and visual reasoning of the model without
+updating any parameters. Our key insight is that an LVLM can ensemble with
+itself, without the need for any other LVLMs, which helps to unlock its
+internal capabilities. Extensive experiments on various benchmarks demonstrate
+the effectiveness of our method in achieving state-of-the-art (SOTA)
+performance on SketchyVQA, Outside Knowledge VQA, and out-of-distribution VQA
+tasks.
+
+
+
+
+
+ + ♻ ☆ FRoundation: Are Foundation Models Ready for Face Recognition? + + +
+ Foundation models are predominantly trained in an unsupervised or +self-supervised manner on highly diverse and large-scale datasets, making them +broadly applicable to various downstream tasks. In this work, we investigate +for the first time whether such models are suitable for the specific domain of +face recognition. We further propose and demonstrate the adaptation of these +models for face recognition across different levels of data availability. +Extensive experiments are conducted on multiple foundation models and datasets +of varying scales for training and fine-tuning, with evaluation on a wide range +of benchmarks. Our results indicate that, despite their versatility, +pre-trained foundation models underperform in face recognition compared to +similar architectures trained specifically for this task. However, fine-tuning +foundation models yields promising results, often surpassing models trained +from scratch when training data is limited. Even with access to large-scale +face recognition training datasets, fine-tuned foundation models perform +comparably to models trained from scratch, but with lower training +computational costs and without relying on the assumption of extensive data +availability. Our analysis also explores bias in face recognition, with +slightly higher bias observed in some settings when using foundation models. + +
+
+
+
+
+ + ♻ ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models, +especially for long video understanding. We introduce LongVILA, a full-stack +solution for long-context visual-language models by co-designing the algorithm +and system. For model training, we upgrade existing VLMs to support long video +understanding by incorporating two additional stages, i.e., long context +extension and long video supervised fine-tuning. However, training on long +video is computationally and memory intensive. We introduce the long-context +Multi-Modal Sequence Parallelism (MM-SP) system that efficiently parallelizes +long video training and inference, enabling 2M context length training on 256 +GPUs without any gradient checkpointing. LongVILA efficiently extends the +number of video frames of VILA from 8 to 2048, improving the long video +captioning score from 2.00 to 3.26 (out of 5), achieving 99.8% accuracy in +6,000-frame (more than 1 million tokens) video needle-in-a-haystack. +LongVILA-7B demonstrates strong accuracy on the VideoMME benchmark, i.e., 61.8% +with subtitle. Besides, MM-SP is 2.1x - 5.7x faster than ring style sequence +parallelism and 1.1x - 1.4x faster than Megatron with a hybrid context and +tensor parallelism. Moreover, it seamlessly integrates with Hugging Face +Transformers. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ♻ ☆ Conditional GAN for Enhancing Diffusion Models in Efficient and + Authentic Global Gesture Generation from Audios WACV 2025 + + +
+ Audio-driven simultaneous gesture generation is vital for human-computer
+communication, AI games, and film production. While previous research has shown
+promise, there are still limitations. Methods based on VAEs are accompanied by
+issues of local jitter and global instability, whereas methods based on
+diffusion models are hampered by low generation efficiency. This is because the
+denoising process of DDPM in the latter relies on the assumption that the noise
+added at each step is sampled from a unimodal distribution, and the noise
+values are small. DDIM borrows the idea from the Euler method for solving
+differential equations, disrupts the Markov chain process, and increases the
+noise step size to reduce the number of denoising steps, thereby accelerating
+generation. However, simply increasing the step size during the step-by-step
+denoising process causes the results to gradually deviate from the original
+data distribution, leading to a significant drop in the quality of the
+generated actions and the emergence of unnatural artifacts. In this paper, we
+break the assumptions of DDPM and achieve breakthrough progress in denoising
+speed and fidelity. Specifically, we introduce a conditional GAN to capture
+audio control signals and implicitly match the multimodal denoising
+distribution between the diffusion and denoising steps within the same sampling
+step, aiming to sample larger noise values and apply fewer denoising steps for
+high-speed generation.
+
+
+ comment: Accepted by WACV 2025 (Round 1) +
+
+
+
+
+ + ♻ ☆ RopeTP: Global Human Motion Recovery via Integrating Robust Pose + Estimation with Diffusion Trajectory Prior WACV 2025 + + +
+ We present RopeTP, a novel framework that combines Robust pose estimation +with a diffusion Trajectory Prior to reconstruct global human motion from +videos. At the heart of RopeTP is a hierarchical attention mechanism that +significantly improves context awareness, which is essential for accurately +inferring the posture of occluded body parts. This is achieved by exploiting +the relationships with visible anatomical structures, enhancing the accuracy of +local pose estimations. The improved robustness of these local estimations +allows for the reconstruction of precise and stable global trajectories. +Additionally, RopeTP incorporates a diffusion trajectory model that predicts +realistic human motion from local pose sequences. This model ensures that the +generated trajectories are not only consistent with observed local actions but +also unfold naturally over time, thereby improving the realism and stability of +3D human motion reconstruction. Extensive experimental validation shows that +RopeTP surpasses current methods on two benchmark datasets, particularly +excelling in scenarios with occlusions. It also outperforms methods that rely +on SLAM for initial camera estimates and extensive optimization, delivering +more accurate and realistic trajectories. + +
+
+ comment: Accepted by WACV 2025 (Round 1) +
+
+
+
+
+ + ♻ ☆ Adversarial Purification and Fine-tuning for Robust UDC Image + Restoration + + +
+ This study delves into the enhancement of Under-Display Camera (UDC) image +restoration models, focusing on their robustness against adversarial attacks. +Despite its innovative approach to seamless display integration, UDC technology +faces unique image degradation challenges exacerbated by the susceptibility to +adversarial perturbations. Our research initially conducts an in-depth +robustness evaluation of deep-learning-based UDC image restoration models by +employing several white-box and black-box attacking methods. This evaluation is +pivotal in understanding the vulnerabilities of current UDC image restoration +techniques. Following the assessment, we introduce a defense framework +integrating adversarial purification with subsequent fine-tuning processes. +First, our approach employs diffusion-based adversarial purification, +effectively neutralizing adversarial perturbations. Then, we apply the +fine-tuning methodologies to refine the image restoration models further, +ensuring that the quality and fidelity of the restored images are maintained. +The effectiveness of our proposed approach is validated through extensive +experiments, showing marked improvements in resilience against typical +adversarial attacks. + +
+
+ comment: Failure to meet expectations +
+
+
+
+
+ + ♻ ☆ Posture-Informed Muscular Force Learning for Robust Hand Pressure + Estimation NeurIPS 2024 + + +
+ We present PiMForce, a novel framework that enhances hand pressure estimation +by leveraging 3D hand posture information to augment forearm surface +electromyography (sEMG) signals. Our approach utilizes detailed spatial +information from 3D hand poses in conjunction with dynamic muscle activity from +sEMG to enable accurate and robust whole-hand pressure measurements under +diverse hand-object interactions. We also developed a multimodal data +collection system that combines a pressure glove, an sEMG armband, and a +markerless finger-tracking module. We created a comprehensive dataset from 21 +participants, capturing synchronized data of hand posture, sEMG signals, and +exerted hand pressure across various hand postures and hand-object interaction +scenarios using our collection system. Our framework enables precise hand +pressure estimation in complex and natural interaction scenarios. Our approach +substantially mitigates the limitations of traditional sEMG-based or +vision-based methods by integrating 3D hand posture information with sEMG +signals. Video demos, data, and code are available online. + +
+
+ comment: Accepted to NeurIPS 2024. Project Page Link: + https://pimforce.hcitech.org/ +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors.
+Existing methods discover error slices by clustering the error-prone samples
+with similar patterns or assigning discrete attributes to each sample for
+post-hoc analysis. While these methods aim for interpretability and easier
+mitigation through reweighting or rebalancing, they may not capture the full
+complexity of error patterns due to incomplete or missing attributes. Contrary
+to existing approaches, this paper utilizes the reasoning capabilities of the
+Large Language Model (LLM) to analyze complex error patterns and generate
+testable hypotheses. This paper proposes LADDER: Language Driven slice
+Discovery and Error Rectification. It first projects the model's representation
+into a language-aligned feature space (e.g., CLIP) to preserve semantics in the
+original model feature space. This ensures the accurate retrieval of sentences
+that highlight the model's errors. Next, the LLM utilizes the sentences and
+generates hypotheses to discover error slices. Finally, we mitigate the error
+by fine-tuning the classification head on a group-balanced dataset created
+using the hypotheses. Our entire method does not require any attribute
+annotation, either explicitly or through external tagging models. We validate
+our method with five image classification datasets. The code is available
+(https://github.com/batmanlab/Ladder).
+
+
+
+
+
+ + ♻ ☆ Human Action Recognition (HAR) Using Skeleton-based Spatial Temporal + Relative Transformer Network: ST-RTR + + +
+ Human Action Recognition (HAR) is an interesting research area in
+human-computer interaction used to monitor the activities of elderly and
+disabled individuals affected by physical and mental health conditions. In the
+recent era, skeleton-based HAR has received much attention because skeleton
+data has shown that it can handle changes in striking, body size, camera views,
+and complex backgrounds. One key characteristic of ST-GCN is automatically
+learning spatial and temporal patterns from skeleton sequences. However, it has
+limitations, as it only captures short-range correlations due to its limited
+receptive field, while understanding human action requires long-range
+interconnections. To address this issue, we developed a spatial-temporal
+relative transformer (ST-RTR) model. The ST-RTR includes joint and relay nodes,
+which allow efficient communication and data transmission within the network.
+These nodes help to break the inherent spatial and temporal skeleton
+topologies, which enables the model to understand long-range human action
+better. Furthermore, we combine ST-RTR with a fusion model for further
+performance improvements. To assess the performance of the ST-RTR method, we
+conducted experiments on three skeleton-based HAR benchmarks: NTU RGB+D 60, NTU
+RGB+D 120, and UAV-Human. It boosted CS and CV accuracy by 2.11% and 1.45% on
+NTU RGB+D 60, and by 1.25% and 1.05% on NTU RGB+D 120. On the UAV-Human
+dataset, accuracy improved by 2.54%. The experimental outcomes show that the
+proposed ST-RTR model significantly improves action recognition relative to the
+standard ST-GCN method.
+
+
+
+
+
+ + ♻ ☆ DQ-DETR: DETR with Dynamic Query for Tiny Object Detection ECCV 2024 + + +
+ Despite previous DETR-like methods having performed successfully in generic +object detection, tiny object detection is still a challenging task for them +since the positional information of object queries is not customized for +detecting tiny objects, whose scale is extraordinarily smaller than general +objects. Also, DETR-like methods using a fixed number of queries make them +unsuitable for aerial datasets, which only contain tiny objects, and the +numbers of instances are imbalanced between different images. Thus, we present +a simple yet effective model, named DQ-DETR, which consists of three different +components: categorical counting module, counting-guided feature enhancement, +and dynamic query selection to solve the above-mentioned problems. DQ-DETR uses +the prediction and density maps from the categorical counting module to +dynamically adjust the number of object queries and improve the positional +information of queries. Our model DQ-DETR outperforms previous CNN-based and +DETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2 +dataset, which mostly consists of tiny objects. Our code will be available at +https://github.com/hoiliu-0801/DQ-DETR. + +
+
+ comment: Accepted by ECCV 2024. Our code will be available at + https://github.com/hoiliu-0801/DQ-DETR +
+
+
+
+
+ + ♻ ☆ Fast Samplers for Inverse Problems in Iterative Refinement Models NeurIPS'24 + + +
+ Constructing fast samplers for unconditional diffusion and flow-matching +models has received much attention recently; however, existing methods for +solving inverse problems, such as super-resolution, inpainting, or deblurring, +still require hundreds to thousands of iterative steps to obtain high-quality +results. We propose a plug-and-play framework for constructing efficient +samplers for inverse problems, requiring only pre-trained diffusion or +flow-matching models. We present Conditional Conjugate Integrators, which +leverage the specific form of the inverse problem to project the respective +conditional diffusion/flow dynamics into a more amenable space for sampling. +Our method complements popular posterior approximation methods for solving +inverse problems using diffusion/flow models. We evaluate the proposed method's +performance on various linear image restoration tasks across multiple datasets, +employing diffusion and flow-matching models. Notably, on challenging inverse +problems like 4x super-resolution on the ImageNet dataset, our method can +generate high-quality samples in as few as 5 conditional sampling steps and +outperforms competing baselines requiring 20-1000 steps. Our code will be +publicly available at https://github.com/mandt-lab/c-pigdm + +
+
+ comment: 43 pages, NeurIPS'24 Camera Ready +
+
+
+
+
+ + ♻ ☆ SMART: Scalable Multi-agent Real-time Motion Generation via Next-token + Prediction NeurIPS 2024 + + +
+ Data-driven autonomous driving motion generation tasks are frequently
+impacted by the limitations of dataset size and the domain gap between
+datasets, which precludes their extensive application in real-world scenarios.
+To address this issue, we introduce SMART, a novel autonomous driving motion
+generation paradigm that models vectorized map and agent trajectory data as
+discrete sequence tokens. These tokens are then processed through a
+decoder-only transformer architecture to train for the next-token prediction
+task across spatial-temporal series. This GPT-style method allows the model to
+learn the motion distribution in real driving scenarios. SMART achieves
+state-of-the-art performance across most of the metrics on the generative Sim
+Agents challenge, ranking 1st on the leaderboards of the Waymo Open Motion
+Dataset (WOMD) and demonstrating remarkable inference speed. Moreover, SMART,
+as a generative model in the autonomous driving motion domain, exhibits
+zero-shot generalization capabilities: using only the NuPlan dataset for
+training and WOMD for validation, SMART achieved a competitive score of 0.72 on
+the Sim Agents challenge. Lastly, we have collected over 1 billion motion
+tokens from multiple datasets, validating the model's scalability. These
+results suggest that SMART already exhibits two important properties,
+scalability and zero-shot generalization, and preliminarily meets the needs of
+large-scale real-time simulation applications. We have released all the code to
+promote the exploration of models for motion generation in the autonomous
+driving field. The source code is available at
+https://github.com/rainmaker22/SMART.
+
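+ The GPT-style objective reduces to ordinary next-token cross-entropy over
+discretized motion tokens; a minimal sketch (the vocabulary, tokenizer, and
+architecture sizes are placeholders, not SMART's):
+
+    import torch
+    import torch.nn as nn
+
+    vocab, d, T = 1024, 256, 64
+    emb = nn.Embedding(vocab, d)
+    layer = nn.TransformerEncoderLayer(d, nhead=8, batch_first=True)
+    backbone = nn.TransformerEncoder(layer, num_layers=4)   # causal, decoder-only
+    head = nn.Linear(d, vocab)
+
+    tokens = torch.randint(0, vocab, (2, T))                # (batch, time)
+    mask = nn.Transformer.generate_square_subsequent_mask(T)
+    logits = head(backbone(emb(tokens), mask=mask))
+    loss = nn.functional.cross_entropy(                     # predict token t+1
+        logits[:, :-1].reshape(-1, vocab), tokens[:, 1:].reshape(-1))
+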
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ProvNeRF: Modeling per Point Provenance in NeRFs as a Stochastic Field NeurIPS + 2024 + + +
+ Neural radiance fields (NeRFs) have gained popularity with multiple works
+showing promising results across various applications. However, to the best of
+our knowledge, existing works do not explicitly model the distribution of
+training camera poses, or consequently the triangulation quality, a key factor
+affecting reconstruction quality dating back to classical vision literature. We
+close this gap with ProvNeRF, an approach that models the provenance for each
+point -- i.e., the locations where it is likely visible -- of NeRFs as a
+stochastic field. We achieve this by extending implicit maximum likelihood
+estimation (IMLE) to functional space with an optimizable objective. We show
+that modeling per-point provenance during the NeRF optimization enriches the
+model with information on triangulation, leading to improvements in novel view
+synthesis and uncertainty estimation under the challenging sparse,
+unconstrained view setting against competitive baselines.
+
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ♻ ☆ Adaptive Visual Scene Understanding: Incremental Scene Graph Generation + + +
+ Scene graph generation (SGG) analyzes images to extract meaningful
+information about objects and their relationships. In the dynamic visual world,
+it is crucial for AI systems to continuously detect new objects and establish
+their relationships with existing ones. Recently, numerous studies have focused
+on continual learning within the domains of object detection and image
+recognition. However, a limited amount of research focuses on a more
+challenging continual learning problem in SGG. This increased difficulty arises
+from the intricate interactions and dynamic relationships among objects, and
+their associated contexts. Thus, in continual learning, SGG models are often
+required to expand, modify, retain, and reason over scene graphs within the
+process of adaptive visual scene understanding. To systematically explore
+Continual Scene Graph Generation (CSEGG), we present a comprehensive benchmark
+comprising three learning regimes: relationship incremental, scene incremental,
+and relationship generalization. Moreover, we introduce a "Replays via Analysis
+by Synthesis" method named RAS. This approach leverages the scene graphs,
+decomposes and re-composes them to represent different scenes, and replays the
+synthesized scenes based on these compositional scene graphs. The replayed
+synthesized scenes act as a means to practice and refine proficiency in SGG in
+known and unknown environments. Our experimental results not only highlight the
+challenges of directly combining existing continual learning methods with SGG
+backbones but also demonstrate the effectiveness of our proposed approach,
+enhancing CSEGG efficiency while simultaneously preserving privacy and memory
+usage. All data and source code are publicly available online.
+
+
+
+
+
+ + ♻ ☆ STONE: A Submodular Optimization Framework for Active 3D Object + Detection + + +
+ 3D object detection is fundamentally important for various emerging
+applications, including autonomous driving and robotics. A key requirement for
+training an accurate 3D object detector is the availability of a large amount
+of LiDAR-based point cloud data. Unfortunately, labeling point cloud data is
+extremely challenging, as accurate 3D bounding boxes and semantic labels are
+required for each potential object. This paper proposes a unified active 3D
+object detection framework for greatly reducing the labeling cost of training
+3D object detectors. Our framework is based on a novel formulation of
+submodular optimization, specifically tailored to the problem of active 3D
+object detection. In particular, we address two fundamental challenges
+associated with active 3D object detection: data imbalance and the need to
+cover the distribution of the data, including LiDAR-based point cloud data of
+varying difficulty levels. Extensive experiments demonstrate that our method
+achieves state-of-the-art performance with high computational efficiency
+compared to existing active learning methods. The code is available at
+https://github.com/RuiyuM/STONE.
+
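+ The generic greedy step behind submodular selection can be sketched with a
+facility-location objective (STONE's actual formulation also addresses class
+imbalance; the cosine similarities here are illustrative):
+
+    import numpy as np
+
+    def greedy_facility_location(sim, budget):
+        """Greedy maximization of f(S) = sum_i max_{j in S} sim[j, i],
+        so the selected set 'covers' the data distribution.
+        sim: (N, N) nonnegative pairwise similarities between samples."""
+        N = sim.shape[0]
+        selected, cover = [], np.zeros(N)
+        for _ in range(budget):
+            # marginal gain of each candidate: improvement in total coverage
+            gains = np.maximum(sim, cover[None, :]).sum(axis=1) - cover.sum()
+            gains[selected] = -np.inf
+            j = int(np.argmax(gains))
+            selected.append(j)
+            cover = np.maximum(cover, sim[j])
+        return selected
+
+    rng = np.random.default_rng(0)
+    feats = rng.normal(size=(100, 32))
+    feats /= np.linalg.norm(feats, axis=1, keepdims=True)
+    sim = np.clip(feats @ feats.T, 0.0, None)      # nonnegative cosine similarity
+    picked = greedy_facility_location(sim, budget=10)
+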
+
+
+
+
+ + ♻ ☆ Make Continual Learning Stronger via C-Flat + + +
+ Model generalization ability upon incrementally acquiring dynamically
+updating knowledge from sequentially arriving tasks is crucial to tackling the
+sensitivity-stability dilemma in Continual Learning (CL). Sharpness
+minimization of the weight loss landscape, which seeks flat minima lying in
+neighborhoods with uniformly low loss or smooth gradients, has proven to be a
+strong training regime that improves model generalization compared with
+loss-minimization-based optimizers such as SGD. Yet only a few works have
+discussed this training regime for CL, showing that a dedicated, specially
+designed zeroth-order sharpness optimizer can improve CL performance. In this
+work, we propose a Continual Flatness (C-Flat) method featuring a flatter loss
+landscape tailored for CL. C-Flat can be invoked with only one line of code and
+is plug-and-play with any CL method. A general framework of C-Flat applied to
+all CL categories and a thorough comparison with loss-minima optimizers and
+flat-minima-based CL approaches are presented in this paper, showing that our
+method can boost CL performance in almost all cases. Code is available at
+https://github.com/WanNaa/C-Flat.
+
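+ For intuition, a generic sharpness-aware update of the kind this line of work
+builds on (a SAM-style two-step sketch; C-Flat's own update rule differs):
+
+    import torch
+
+    def sharpness_aware_step(model, loss_fn, opt, rho=0.05):
+        loss_fn(model).backward()                 # gradient at current weights
+        params = [p for p in model.parameters() if p.grad is not None]
+        with torch.no_grad():
+            norm = torch.norm(torch.stack([p.grad.norm() for p in params]))
+            eps = [rho * p.grad / (norm + 1e-12) for p in params]
+            for p, e in zip(params, eps):
+                p.add_(e)                         # climb to nearby worst point
+        model.zero_grad()
+        loss_fn(model).backward()                 # gradient at perturbed weights
+        with torch.no_grad():
+            for p, e in zip(params, eps):
+                p.sub_(e)                         # restore original weights
+        opt.step()                                # sharpness-aware descent
+        opt.zero_grad()
+
+    model = torch.nn.Linear(4, 1)
+    opt = torch.optim.SGD(model.parameters(), lr=0.1)
+    x, y = torch.randn(8, 4), torch.randn(8, 1)
+    sharpness_aware_step(model, lambda m: torch.nn.functional.mse_loss(m(x), y), opt)
+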
+
+
+
+
+ + ♻ ☆ HSIGene: A Foundation Model For Hyperspectral Image Generation + + +
+ Hyperspectral image (HSI) plays a vital role in various fields such as +agriculture and environmental monitoring. However, due to the expensive +acquisition cost, the number of hyperspectral images is limited, degenerating +the performance of downstream tasks. Although some recent studies have +attempted to employ diffusion models to synthesize HSIs, they still struggle +with the scarcity of HSIs, affecting the reliability and diversity of the +generated images. Some studies propose to incorporate multi-modal data to +enhance spatial diversity, but the spectral fidelity cannot be ensured. In +addition, existing HSI synthesis models are typically uncontrollable or only +support single-condition control, limiting their ability to generate accurate +and reliable HSIs. To alleviate these issues, we propose HSIGene, a novel HSI +generation foundation model which is based on latent diffusion and supports +multi-condition control, allowing for more precise and reliable HSI generation. +To enhance the spatial diversity of the training data while preserving spectral +fidelity, we propose a new data augmentation method based on spatial +super-resolution, in which HSIs are upscaled first, and thus abundant training +patches could be obtained by cropping the high-resolution HSIs. In addition, to +improve the perceptual quality of the augmented data, we introduce a novel +two-stage HSI super-resolution framework, which first applies RGB bands +super-resolution and then utilizes our proposed Rectangular Guided Attention +Network (RGAN) for guided HSI super-resolution. Experiments demonstrate that +the proposed model is capable of generating a vast quantity of realistic HSIs +for downstream tasks such as denoising and super-resolution. The code and +models are available at https://github.com/LiPang/HSIGene. + +
+
+
+
+
+ + ♻ ☆ GrounDiT: Grounding Diffusion Transformers via Noisy Patch + Transplantation NeurIPS 2024 + + +
+ We introduce GrounDiT, a novel training-free spatial grounding technique for +text-to-image generation using Diffusion Transformers (DiT). Spatial grounding +with bounding boxes has gained attention for its simplicity and versatility, +allowing for enhanced user control in image generation. However, prior +training-free approaches often rely on updating the noisy image during the +reverse diffusion process via backpropagation from custom loss functions, which +frequently struggle to provide precise control over individual bounding boxes. +In this work, we leverage the flexibility of the Transformer architecture, +demonstrating that DiT can generate noisy patches corresponding to each +bounding box, fully encoding the target object and allowing for fine-grained +control over each region. Our approach builds on an intriguing property of DiT, +which we refer to as semantic sharing. Due to semantic sharing, when a smaller +patch is jointly denoised alongside a generatable-size image, the two become +semantic clones. Each patch is denoised in its own branch of the generation +process and then transplanted into the corresponding region of the original +noisy image at each timestep, resulting in robust spatial grounding for each +bounding box. In our experiments on the HRS and DrawBench benchmarks, we +achieve state-of-the-art performance compared to previous training-free +approaches. + +
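+ The transplantation can be sketched as one reverse step per branch followed
+by pasting the patch latent back into its bounding-box region (denoise_step
+stands in for a real DiT sampler step; all names and shapes are illustrative):
+
+    import torch
+
+    def transplant_step(x_t, patch_t, box, denoise_step, t):
+        """x_t: (C, H, W) noisy image latent; patch_t: (C, h, w) noisy patch
+        latent for one bounding box; box: (top, left) placement."""
+        x_prev = denoise_step(x_t, t)                  # global branch
+        patch_prev = denoise_step(patch_t, t)          # per-box branch
+        top, left = box
+        c, h, w = patch_prev.shape
+        x_prev[:, top:top + h, left:left + w] = patch_prev   # transplant
+        return x_prev, patch_prev
+
+    dummy = lambda z, t: z * 0.99                      # placeholder sampler step
+    x, p = transplant_step(torch.randn(4, 64, 64),
+                           torch.randn(4, 16, 16), (8, 8), dummy, t=10)
+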
+
+ comment: Accepted to NeurIPS 2024. Project Page: + https://groundit-diffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ In-Context LoRA for Diffusion Transformers + + +
+ Recent research arXiv:2410.15027 has explored the use of diffusion +transformers (DiTs) for task-agnostic image generation by simply concatenating +attention tokens across images. However, despite substantial computational +resources, the fidelity of the generated images remains suboptimal. In this +study, we reevaluate and streamline this framework by hypothesizing that +text-to-image DiTs inherently possess in-context generation capabilities, +requiring only minimal tuning to activate them. Through diverse task +experiments, we qualitatively demonstrate that existing text-to-image DiTs can +effectively perform in-context generation without any tuning. Building on this +insight, we propose a remarkably simple pipeline to leverage the in-context +abilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint +captioning of multiple images, and (3) apply task-specific LoRA tuning using +small datasets (e.g., $20\sim 100$ samples) instead of full-parameter tuning +with large datasets. We name our models In-Context LoRA (IC-LoRA). This +approach requires no modifications to the original DiT models, only changes to +the training data. Remarkably, our pipeline generates high-fidelity image sets +that better adhere to prompts. While task-specific in terms of tuning data, our +framework remains task-agnostic in architecture and pipeline, offering a +powerful tool for the community and providing valuable insights for further +research on product-level task-agnostic generation systems. We release our +code, data, and models at https://github.com/ali-vilab/In-Context-LoRA + +
+
+ comment: Tech report. Project page: + https://ali-vilab.github.io/In-Context-LoRA-Page/ +
+
+
+
+
+ + ♻ ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization in MLLMs, recent advances primarily focus on improving +the LLM components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE that leverages a well-designed router with a mixture of +projection experts at the connector, Uni-Med achieves efficient solution to the +tug-of-war problem and can perform six different medical tasks including +question answering, visual question answering, report generation, referring +expression comprehension, referring expression generation and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector in MLLMs. Extensive ablation +experiments validate the effectiveness of introducing CMoE under any +configuration, with up to an average 8% performance gains. We further provide +interpretation analysis of the tug-of-war problem from the perspective of +gradient optimization and parameter statistics. Compared to previous +state-of-the-art medical MLLMs, Uni-Med achieves competitive or superior +evaluation metrics on diverse tasks. Code and resources are available at +https://github.com/tsinghua-msiip/Uni-Med. + +
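+ The connector-level mixture of projection experts can be sketched as a soft
+router over several visual-to-LLM projections (the expert count and dimensions
+are illustrative, not Uni-Med's configuration):
+
+    import torch
+    import torch.nn as nn
+
+    class ConnectorMoE(nn.Module):
+        """Route each visual token through a weighted mix of projection experts."""
+        def __init__(self, vis_dim=64, llm_dim=128, n_experts=4):
+            super().__init__()
+            self.router = nn.Linear(vis_dim, n_experts)
+            self.experts = nn.ModuleList(
+                nn.Linear(vis_dim, llm_dim) for _ in range(n_experts))
+
+        def forward(self, v):                  # v: (B, N, vis_dim) visual tokens
+            w = torch.softmax(self.router(v), dim=-1)                 # (B, N, E)
+            outs = torch.stack([e(v) for e in self.experts], dim=-1)  # (B, N, D, E)
+            return (outs * w.unsqueeze(2)).sum(-1)                    # (B, N, D)
+
+    tokens = ConnectorMoE()(torch.randn(1, 9, 64))
+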
+
+
+
+
+ + ♻ ☆ DPEC: Dual-Path Error Compensation Method for Enhanced Low-Light Image + Clarity + + +
+ For the task of low-light image enhancement, deep learning-based algorithms have demonstrated superiority and effectiveness compared to traditional methods. However, these methods, primarily based on Retinex theory, tend to overlook the noise and color distortions in input images, leading to significant noise amplification and local color distortions in enhanced results. To address these issues, we propose the Dual-Path Error Compensation (DPEC) method, designed to improve image quality under low-light conditions by preserving local texture details while restoring global image brightness without amplifying noise. DPEC incorporates precise pixel-level error estimation to capture subtle differences and an independent denoising mechanism to prevent noise amplification. We introduce the HIS-Retinex loss to guide DPEC's training, ensuring the brightness distribution of enhanced images closely aligns with real-world conditions. To balance computational speed and resource efficiency while giving DPEC a comprehensive understanding of the global context, we integrate the VMamba architecture into its backbone. Comprehensive quantitative and qualitative experimental results demonstrate that our algorithm significantly outperforms state-of-the-art methods in low-light image enhancement. The code is publicly available online at https://github.com/wangshuang233/DPEC.
+
+
+
+
+
+ + ♻ ☆ Foodfusion: A Novel Approach for Food Image Composition via Diffusion + Models + + +
+ Food image composition synthesizes a natural new image from existing dish images and background images. While diffusion models have made significant advancements in image generation, enabling the construction of end-to-end architectures that yield promising results, existing diffusion models face challenges in processing and fusing information from multiple images and lack access to high-quality publicly available datasets, which prevents their application to food image composition. In this paper, we introduce a large-scale, high-quality food image composite dataset, FC22k, which comprises 22,000 foreground, background, and ground-truth image triplets. Additionally, we propose a novel food image composition method, Foodfusion, which leverages the capabilities of pre-trained diffusion models and incorporates a Fusion Module for processing and integrating foreground and background information. This fused information aligns the foreground features with the background structure by merging the global structural information at the cross-attention layer of the denoising UNet. To further enhance the content and structure of the background, we also integrate a Content-Structure Control Module. Extensive experiments demonstrate the effectiveness and scalability of our proposed method.
+
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Detecting Brittle Decisions for Free: Leveraging Margin Consistency in + Deep Robust Classifiers + + +
+ Despite extensive research on adversarial training strategies to improve robustness, the decisions of even the most robust deep learning models can still be quite sensitive to imperceptible perturbations, creating serious risks when deploying them for high-stakes real-world applications. While detecting such cases may be critical, evaluating a model's vulnerability at a per-instance level using adversarial attacks is computationally too intensive and unsuitable for real-time deployment scenarios. Although the input-space margin is the exact score for detecting non-robust samples, it is intractable to compute for deep neural networks. This paper introduces the concept of margin consistency -- a property that links the input space margins and the logit margins in robust models -- for efficient detection of vulnerable samples. First, we establish that margin consistency is a necessary and sufficient condition to use a model's logit margin as a score for identifying non-robust samples. Next, through comprehensive empirical analysis of various robustly trained models on CIFAR10 and CIFAR100 datasets, we show that they exhibit high margin consistency, with a strong correlation between their input space margins and the logit margins. Then, we show that we can effectively and confidently use the logit margin to detect brittle decisions with such models. Finally, we address cases where the model is not sufficiently margin-consistent by learning a pseudo-margin from the feature representation. Our findings highlight the potential of leveraging deep representations to efficiently assess adversarial vulnerability in deployment scenarios.
+
+
+ comment: 10 pages, 6 figures, 2 tables. Version Update: NeurIPS Camera Ready
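+ For margin-consistent models the score above is essentially free at deployment time; a minimal sketch of the logit margin as a per-instance vulnerability score (the flagging threshold is an assumption):

    import torch

    @torch.no_grad()
    def logit_margin(model, x):
        logits = model(x)                      # (batch, num_classes)
        top2 = logits.topk(2, dim=-1).values   # two largest logits per sample
        return top2[:, 0] - top2[:, 1]         # small gap => likely non-robust

    # brittle = logit_margin(robust_model, images) < 1.0  # flag for review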
+
+
+
+
+ + ♻ ☆ A Framework for Real-Time Volcano-Seismic Event Recognition Based on + Multi-Station Seismograms and Semantic Segmentation Models + + +
+ In volcano monitoring, effective recognition of seismic events is essential for understanding volcanic activity and raising timely warning alerts. Traditional methods rely on manual analysis, which can be subjective and labor-intensive. Furthermore, current automatic approaches often tackle detection and classification separately, mostly rely on single-station information, and generally require tailored preprocessing and representations to perform predictions. These limitations often hinder their application to real-time monitoring and utilization across different volcano conditions. This study introduces a novel approach that utilizes Semantic Segmentation models to automate seismic event recognition by applying a straightforward transformation of multi-channel 1D signals into 2D representations, enabling their use as images. Our framework employs a data-driven, end-to-end design that integrates multi-station seismic data with minimal preprocessing, performing both detection and classification simultaneously for five seismic event classes. We evaluated four state-of-the-art segmentation models (UNet, UNet++, DeepLabV3+ and SwinUNet) on approximately 25,000 seismic events recorded at four different Chilean volcanoes: Nevados del Chill\'an Volcanic Complex, Laguna del Maule, Villarrica and Puyehue-Cord\'on Caulle. Among these models, the UNet architecture was identified as the most effective, achieving mean F1 and Intersection over Union (IoU) scores of up to 0.91 and 0.88, respectively, and demonstrating superior noise robustness and model flexibility to unseen volcano datasets.
+
+
+ comment: 10 pages, 9 figures. This is a pre-print, it is currently under + review for publication +
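+ The straightforward transformation mentioned in the abstract can be pictured as stacking normalized multi-station waveforms row-wise into a 2D array that a segmentation network consumes as an image; channel count and trace length here are illustrative, not the paper's exact preprocessing.

    import numpy as np

    def waveforms_to_image(waves):        # waves: (channels, samples)
        waves = np.asarray(waves, dtype=np.float32)
        peak = np.abs(waves).max(axis=1, keepdims=True) + 1e-9
        return waves / peak               # one row per channel, values in [-1, 1]

    img = waveforms_to_image(np.random.randn(12, 6000))  # e.g. 4 stations x 3 channels
    print(img.shape)                      # (12, 6000), fed to a UNet-style model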
+
+
+
+
+ + ♻ ☆ From Question to Exploration: Test-Time Adaptation in Semantic + Segmentation? + + +
+ Test-time adaptation (TTA) aims to adapt a model, initially trained on training data, to test data with potential distribution shifts. Most existing TTA methods focus on classification problems. The pronounced success of classification might lead numerous newcomers and engineers to assume that classic TTA techniques can be directly applied to the more challenging task of semantic segmentation. However, whether this belief holds is still an open question. In this paper, we investigate the applicability of existing classic TTA strategies in semantic segmentation. Our comprehensive results have led to three key observations. First, the classic normalization updating strategy only brings slight performance improvement, and in some cases, it might even adversely affect the results. Even with the application of advanced distribution estimation techniques like batch renormalization, the problem remains unresolved. Second, although the teacher-student scheme does enhance training stability for segmentation TTA in the presence of noisy pseudo-labels and temporal correlation, it cannot directly result in performance improvement compared to the original model without TTA under complex data distributions. Third, segmentation TTA suffers a severe long-tailed class-imbalance problem, which is substantially more complex than that in TTA for classification. This long-tailed challenge negatively affects segmentation TTA performance, even when the accuracy of pseudo-labels is high. Beyond these observations, we find that visual prompt tuning (VisPT) is promising in segmentation TTA and propose a novel method named TTAP. The outstanding performance of TTAP has also been verified. We hope the community can give more attention to this challenging, yet important, segmentation TTA task in the future. The source code is available at: https://github.com/ycarobot/TTAP
+
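+ For reference, the classic normalization updating strategy discussed in the first observation is typically implemented by letting batch-norm layers normalize with test-batch statistics; a minimal PyTorch sketch (exact recipes vary across TTA papers):

    import torch.nn as nn

    def use_test_batch_stats(model: nn.Module) -> nn.Module:
        # Switch every BN layer to current-batch statistics.
        for m in model.modules():
            if isinstance(m, nn.modules.batchnorm._BatchNorm):
                m.track_running_stats = False
                m.running_mean = None   # None => always normalize with the batch
                m.running_var = None
        return model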
+
+
+
+
+ + ♻ ☆ LRM-Zero: Training Large Reconstruction Models with Synthesized Data NeurIPS 2024 + + +
+ We present LRM-Zero, a Large Reconstruction Model (LRM) trained entirely on +synthesized 3D data, achieving high-quality sparse-view 3D reconstruction. The +core of LRM-Zero is our procedural 3D dataset, Zeroverse, which is +automatically synthesized from simple primitive shapes with random texturing +and augmentations (e.g., height fields, boolean differences, and wireframes). +Unlike previous 3D datasets (e.g., Objaverse) which are often captured or +crafted by humans to approximate real 3D data, Zeroverse completely ignores +realistic global semantics but is rich in complex geometric and texture details +that are locally similar to or even more intricate than real objects. We +demonstrate that our LRM-Zero, trained with our fully synthesized Zeroverse, +can achieve high visual quality in the reconstruction of real-world objects, +competitive with models trained on Objaverse. We also analyze several critical +design choices of Zeroverse that contribute to LRM-Zero's capability and +training stability. Our work demonstrates that 3D reconstruction, one of the +core tasks in 3D vision, can potentially be addressed without the semantics of +real-world objects. The Zeroverse's procedural synthesis code and interactive +visualization are available at: https://desaixie.github.io/lrm-zero/. + +
+
+ comment: 23 pages, 8 figures. Our code and interactive visualization are + available at: https://desaixie.github.io/lrm-zero/. v2: NeurIPS 2024 Camera + Ready version +
+
+
+
+
+ + ♻ ☆ DiffusionPDE: Generative PDE-Solving Under Partial Observation NeurIPS 2024 + + +
+ We introduce a general framework for solving partial differential equations (PDEs) using generative diffusion models. In particular, we focus on scenarios where we do not have the full knowledge of the scene necessary to apply classical solvers. Most existing forward or inverse PDE approaches perform poorly when the observations on the data or the underlying coefficients are incomplete, which is common in real-world measurements. In this work, we propose DiffusionPDE, which can simultaneously fill in the missing information and solve a PDE by modeling the joint distribution of the solution and coefficient spaces. We show that the learned generative priors lead to a versatile framework for accurately solving a wide range of PDEs under partial observation, significantly outperforming the state-of-the-art methods for both forward and inverse directions.
+
+
+ comment: NeurIPS 2024. Project page: + https://jhhuangchloe.github.io/Diffusion-PDE/ +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ♻ ☆ $\texttt{MixGR}$: Enhancing Retriever Generalization for Scientific + Domain through Complementary Granularity EMNLP 2024 + + +
+ Recent studies show the growing significance of document retrieval in LLM generation, i.e., retrieval-augmented generation (RAG), within the scientific domain, where it bridges the models' knowledge gap. However, dense retrievers often struggle with domain-specific retrieval and complex query-document relationships, particularly when query segments correspond to various parts of a document. To alleviate these prevalent challenges, this paper introduces $\texttt{MixGR}$, which improves dense retrievers' awareness of query-document matching across various levels of granularity in queries and documents using a zero-shot approach. $\texttt{MixGR}$ fuses various metrics based on these granularities into a unified score that reflects a comprehensive query-document similarity. Our experiments demonstrate that $\texttt{MixGR}$ outperforms previous document retrieval methods by 24.7%, 9.8%, and 6.9% on nDCG@5 with unsupervised, supervised, and LLM-based retrievers, respectively, averaged over queries containing multiple subqueries from five scientific retrieval datasets. Moreover, the efficacy on two downstream scientific question-answering tasks highlights the advantage of $\texttt{MixGR}$ in boosting the application of LLMs in the scientific domain. The code and experimental datasets are available.
+
+
+ comment: EMNLP 2024 Main Conference +
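+ The fusion step can be pictured as follows, with reciprocal-rank fusion standing in for $\texttt{MixGR}$'s actual zero-shot fusion rule (an assumption made for illustration; see the paper for the real formula):

    import numpy as np

    def fuse_granularities(score_lists, k=60):
        # score_lists: one (n_docs,) similarity array per granularity pairing,
        # e.g. query vs. document, subquery vs. document, subquery vs. proposition.
        fused = np.zeros_like(score_lists[0], dtype=float)
        for scores in score_lists:
            ranks = np.argsort(np.argsort(-scores)) + 1   # 1 = best
            fused += 1.0 / (k + ranks)
        return fused   # higher = better overall query-document match

    unified = fuse_granularities([np.random.rand(100) for _ in range(4)])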
+
+
+
+
+ + ♻ ☆ Customizing Language Models with Instance-wise LoRA for Sequential + Recommendation + + +
+ Sequential recommendation systems predict the next interaction item based on users' past interactions, aligning recommendations with individual preferences. Leveraging the strengths of Large Language Models (LLMs) in knowledge comprehension and reasoning, recent approaches seek to apply LLMs to sequential recommendation. A common paradigm is converting user behavior sequences into instruction data and fine-tuning the LLM with parameter-efficient fine-tuning (PEFT) methods like Low-Rank Adaptation (LoRA). However, the uniform application of LoRA across diverse user behaviors is insufficient to capture individual variability, resulting in negative transfer between disparate sequences. To address these challenges, we propose Instance-wise LoRA (iLoRA). We innovatively treat the sequential recommendation task as a form of multi-task learning, integrating LoRA with the Mixture of Experts (MoE) framework. This approach encourages different experts to capture various aspects of user behavior. Additionally, we introduce a sequence representation guided gate function that generates customized expert participation weights for each user sequence, which allows dynamic parameter adjustment for instance-wise recommendations. In sequential recommendation, iLoRA achieves an average relative improvement of 11.4\% over basic LoRA in the hit ratio metric, with less than a 1\% relative increase in trainable parameters. Extensive experiments on three benchmark datasets demonstrate the effectiveness of iLoRA, highlighting its superior performance compared to existing methods in mitigating negative transfer and improving recommendation accuracy. Our data and code are available at https://github.com/AkaliKong/iLoRA.
+
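+ A minimal sketch of the instance-wise mixing described above: a gate conditioned on a sequence representation produces per-instance weights that combine the low-rank updates of several LoRA experts. Dimensions, the linear gate, and dense mixing are illustrative assumptions.

    import torch
    import torch.nn as nn

    class GatedLoRAExperts(nn.Module):
        def __init__(self, dim=768, rank=8, n_experts=4):
            super().__init__()
            self.A = nn.Parameter(torch.randn(n_experts, dim, rank) * 0.01)
            self.B = nn.Parameter(torch.zeros(n_experts, rank, dim))  # zero init
            self.gate = nn.Linear(dim, n_experts)

        def forward(self, h, seq_repr):   # h: (B, T, dim), seq_repr: (B, dim)
            w = torch.softmax(self.gate(seq_repr), -1)        # (B, E) expert weights
            low = torch.einsum("btd,edr->bter", h, self.A)    # down-project
            up = torch.einsum("bter,erd->bted", low, self.B)  # up-project
            return h + torch.einsum("bted,be->btd", up, w)    # gated LoRA update

    h = torch.randn(4, 32, 768)
    out = GatedLoRAExperts()(h, h.mean(dim=1))   # sequence mean as a stand-in repr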
+
+
+
+
+ + ♻ ☆ LLM-ESR: Large Language Models Enhancement for Long-tailed Sequential + Recommendation + + +
+ Sequential recommender systems (SRS) aim to predict users' subsequent choices +based on their historical interactions and have found applications in diverse +fields such as e-commerce and social media. However, in real-world systems, +most users interact with only a handful of items, while the majority of items +are seldom consumed. These two issues, known as the long-tail user and +long-tail item challenges, often pose difficulties for existing SRS. These +challenges can adversely affect user experience and seller benefits, making +them crucial to address. Though a few works have addressed the challenges, they +still struggle with the seesaw or noisy issues due to the intrinsic scarcity of +interactions. The advancements in large language models (LLMs) present a +promising solution to these problems from a semantic perspective. As one of the +pioneers in this field, we propose the Large Language Models Enhancement +framework for Sequential Recommendation (LLM-ESR). This framework utilizes +semantic embeddings derived from LLMs to enhance SRS without adding extra +inference load from LLMs. To address the long-tail item challenge, we design a +dual-view modeling framework that combines semantics from LLMs and +collaborative signals from conventional SRS. For the long-tail user challenge, +we propose a retrieval augmented self-distillation method to enhance user +preference representation using more informative interactions from similar +users. To verify the effectiveness and versatility of our proposed enhancement +framework, we conduct extensive experiments on three real-world datasets using +three popular SRS models. The results show that our method surpasses existing +baselines consistently, and benefits long-tail users and items especially. The +implementation code is available at +https://github.com/Applied-Machine-Learning-Lab/LLM-ESR. + +
+
+ comment: accepted by NeurIPS'24 (Spotlight)
+
+
+
+
+ + ♻ ☆ MACRec: a Multi-Agent Collaboration Framework for Recommendation SIGIR2024 + + +
+ LLM-based agents have gained considerable attention for their decision-making skills and ability to handle complex tasks. Recognizing the current gap in leveraging agent capabilities for multi-agent collaboration in recommendation systems, we introduce MACRec, a novel framework designed to enhance recommendation systems through multi-agent collaboration. Unlike existing work on using agents for user/item simulation, we aim to deploy multi-agents to tackle recommendation tasks directly. In our framework, recommendation tasks are addressed through the collaborative efforts of various specialized agents, including Manager, User/Item Analyst, Reflector, Searcher, and Task Interpreter, with distinct workflows. Furthermore, we provide application examples of how developers can easily use MACRec on various recommendation tasks, including rating prediction, sequential recommendation, conversational recommendation, and explanation generation of recommendation results. The framework and demonstration video are publicly available at https://github.com/wzf2000/MACRec.
+
+
+ comment: Accepted by SIGIR2024 +
+
+
+
+
+ + ♻ ☆ Unveiling User Satisfaction and Creator Productivity Trade-Offs in + Recommendation Platforms + + +
+ On User-Generated Content (UGC) platforms, recommendation algorithms +significantly impact creators' motivation to produce content as they compete +for algorithmically allocated user traffic. This phenomenon subtly shapes the +volume and diversity of the content pool, which is crucial for the platform's +sustainability. In this work, we demonstrate, both theoretically and +empirically, that a purely relevance-driven policy with low exploration +strength boosts short-term user satisfaction but undermines the long-term +richness of the content pool. In contrast, a more aggressive exploration policy +may slightly compromise user satisfaction but promote higher content creation +volume. Our findings reveal a fundamental trade-off between immediate user +satisfaction and overall content production on UGC platforms. Building on this +finding, we propose an efficient optimization method to identify the optimal +exploration strength, balancing user and creator engagement. Our model can +serve as a pre-deployment audit tool for recommendation algorithms on UGC +platforms, helping to align their immediate objectives with sustainable, +long-term goals. + +
+
+
+
+
+
+
+
+ + Machine Learning 85 + +
+
+
+ + ♻ ☆ Adversarially Robust Decision Transformer NeurIPS 2024 + + +
+ Decision Transformer (DT), as one of the representative Reinforcement +Learning via Supervised Learning (RvS) methods, has achieved strong performance +in offline learning tasks by leveraging the powerful Transformer architecture +for sequential decision-making. However, in adversarial environments, these +methods can be non-robust, since the return is dependent on the strategies of +both the decision-maker and adversary. Training a probabilistic model +conditioned on observed return to predict action can fail to generalize, as the +trajectories that achieve a return in the dataset might have done so due to a +suboptimal behavior adversary. To address this, we propose a worst-case-aware +RvS algorithm, the Adversarially Robust Decision Transformer (ARDT), which +learns and conditions the policy on in-sample minimax returns-to-go. ARDT +aligns the target return with the worst-case return learned through minimax +expectile regression, thereby enhancing robustness against powerful test-time +adversaries. In experiments conducted on sequential games with full data +coverage, ARDT can generate a maximin (Nash Equilibrium) strategy, the solution +with the largest adversarial robustness. In large-scale sequential games and +continuous adversarial RL environments with partial data coverage, ARDT +demonstrates significantly superior robustness to powerful test-time +adversaries and attains higher worst-case returns compared to contemporary DT +methods. + +
+
+ comment: Accepted to NeurIPS 2024 +
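+ The expectile regression at the core of ARDT can be sketched with the standard asymmetric squared loss; a low expectile level pulls the estimate toward the worst-case (minimal) return, which is the adversary-side step of the minimax target (the level tau is illustrative):

    import torch

    def expectile_loss(pred, target, tau=0.1):
        u = target - pred
        weight = torch.abs(tau - (u < 0).float())   # |tau - 1{u < 0}|
        return (weight * u.pow(2)).mean()           # small tau => pessimistic fit

    # loss = expectile_loss(return_head(traj_repr), observed_returns_to_go)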
+
+
+
+
+ + ♻ ☆ Highly Accurate Real-space Electron Densities with Neural Networks + + +
+ Variational ab-initio methods in quantum chemistry stand out among other +methods in providing direct access to the wave function. This allows in +principle straightforward extraction of any other observable of interest, +besides the energy, but in practice this extraction is often technically +difficult and computationally impractical. Here, we consider the electron +density as a central observable in quantum chemistry and introduce a novel +method to obtain accurate densities from real-space many-electron wave +functions by representing the density with a neural network that captures known +asymptotic properties and is trained from the wave function by score matching +and noise-contrastive estimation. We use variational quantum Monte Carlo with +deep-learning ans\"atze (deep QMC) to obtain highly accurate wave functions +free of basis set errors, and from them, using our novel method, +correspondingly accurate electron densities, which we demonstrate by +calculating dipole moments, nuclear forces, contact densities, and other +density-based properties. + +
+
+ comment: 12 pages, 9 figures in the main text +
+
+
+
+
+ + ♻ ☆ Scalable Training of Trustworthy and Energy-Efficient Predictive Graph + Foundation Models for Atomistic Materials Modeling: A Case Study with + HydraGNN + + +
+ We present our work on developing and training scalable, trustworthy, and energy-efficient predictive graph foundation models (GFMs) using HydraGNN, a multi-headed graph convolutional neural network architecture. HydraGNN expands the boundaries of graph neural network (GNN) computations in both training scale and data diversity. It abstracts over message passing algorithms, allowing both reproduction of and comparison across algorithmic innovations that define nearest-neighbor convolution in GNNs. This work discusses a series of optimizations that have allowed scaling up GFM training to tens of thousands of GPUs on datasets consisting of hundreds of millions of graphs. Our GFMs use multi-task learning (MTL) to simultaneously learn graph-level and node-level properties of atomistic structures, such as energy and atomic forces. Using over 154 million atomistic structures for training, we illustrate the performance of our approach along with the lessons learned on two state-of-the-art United States Department of Energy (US-DOE) supercomputers, namely the Perlmutter petascale system at the National Energy Research Scientific Computing Center and the Frontier exascale system at Oak Ridge Leadership Computing Facility. The HydraGNN architecture enables the GFM to achieve near-linear strong scaling performance using more than 2,000 GPUs on Perlmutter and 16,000 GPUs on Frontier.
+
+
+ comment: 51 pages, 32 figures +
+
+
+
+
+ + ♻ ☆ Nyström Kernel Stein Discrepancy + + +
+ Kernel methods underpin many of the most successful approaches in data science and statistics, and they allow representing probability measures as elements of a reproducing kernel Hilbert space without loss of information. Recently, the kernel Stein discrepancy (KSD), which combines Stein's method with the flexibility of kernel techniques, gained considerable attention. Through the Stein operator, KSD allows the construction of powerful goodness-of-fit tests where it is sufficient to know the target distribution up to a multiplicative constant. However, the typical U- and V-statistic-based KSD estimators suffer from a quadratic runtime complexity, which hinders their application in large-scale settings. In this work, we propose a Nystr\"om-based KSD acceleration -- with runtime $\mathcal O\left(mn+m^3\right)$ for $n$ samples and $m\ll n$ Nystr\"om points --, show its $\sqrt{n}$-consistency with a classical sub-Gaussian assumption, and demonstrate its applicability for goodness-of-fit testing on a suite of benchmarks.
+
+
+ comment: Broader applicability of main result, consistency of quadratic time + estimator +
+
+
+
+
+ + ♻ ☆ Provable optimal transport with transformers: The essence of depth and + prompt engineering + + +
+ Can we establish provable performance guarantees for transformers? Establishing such theoretical guarantees is a milestone in developing trustworthy generative AI. In this paper, we take a step toward addressing this question by focusing on optimal transport, a fundamental problem at the intersection of combinatorial and continuous optimization. Leveraging the computational power of attention layers, we prove that a transformer with fixed parameters can effectively solve the optimal transport problem in Wasserstein-2 with entropic regularization for an arbitrary number of points. Consequently, the transformer can sort lists of arbitrary sizes up to an approximation factor. Our results rely on an engineered prompt that enables the transformer to implement gradient descent with adaptive stepsizes on the dual optimal transport. Combining the convergence analysis of gradient descent with Sinkhorn dynamics, we establish an explicit approximation bound for optimal transport with transformers, which improves as depth increases. Our findings provide novel insights into the essence of prompt engineering and depth for solving optimal transport. In particular, prompt engineering boosts the algorithmic expressivity of transformers, allowing them to implement an optimization method. With increasing depth, transformers can simulate several iterations of gradient descent.
+
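+ For reference, the entropy-regularized optimal transport problem that the transformer is shown to solve is classically handled by Sinkhorn iterations; a plain NumPy version of that baseline algorithm (not the paper's transformer construction):

    import numpy as np

    def sinkhorn(x, y, eps=0.05, iters=200):
        C = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)  # squared distances
        K = np.exp(-C / eps)                                # Gibbs kernel
        a = np.full(len(x), 1.0 / len(x))                   # uniform marginals
        b = np.full(len(y), 1.0 / len(y))
        u = np.ones_like(a)
        for _ in range(iters):                              # alternating scaling
            v = b / (K.T @ u)
            u = a / (K @ v)
        return u[:, None] * K * v[None, :]                  # transport plan

    plan = sinkhorn(np.random.rand(8, 2), np.random.rand(8, 2))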
+
+
+
+
+ + ♻ ☆ MAMMAL -- Molecular Aligned Multi-Modal Architecture and Language + + +
+ Drug discovery typically consists of multiple steps, including identifying a +target protein key to a disease's etiology, validating that interacting with +this target could prevent symptoms or cure the disease, discovering a small +molecule or biologic therapeutic to interact with it, and optimizing the +candidate molecule through a complex landscape of required properties. Drug +discovery related tasks often involve prediction and generation while +considering multiple entities that potentially interact, which poses a +challenge for typical AI models. For this purpose we present MAMMAL - Molecular +Aligned Multi-Modal Architecture and Language - a method that we applied to +create a versatile multi-task multi-align foundation model that learns from +large-scale biological datasets (2 billion samples) across diverse modalities, +including proteins, small molecules, and genes. We introduce a prompt syntax +that supports a wide range of classification, regression, and generation tasks. +It allows combining different modalities and entity types as inputs and/or +outputs. Our model handles combinations of tokens and scalars and enables the +generation of small molecules and proteins, property prediction, and +transcriptomic lab test predictions. We evaluated the model on 11 diverse +downstream tasks spanning different steps within a typical drug discovery +pipeline, where it reaches new SOTA in 9 tasks and is comparable to SOTA in 2 +tasks. This performance is achieved while using a unified architecture serving +all tasks, in contrast to the original SOTA performance achieved using tailored +architectures. + The model code and pretrained weights are publicly available at +https://github.com/BiomedSciAI/biomed-multi-alignment and +https://huggingface.co/ibm/biomed.omics.bl.sm.ma-ted-458m. + +
+
+
+
+
+ + ♻ ☆ A Study of Plasticity Loss in On-Policy Deep Reinforcement Learning + + +
+ Continual learning with deep neural networks presents challenges distinct +from both the fixed-dataset and convex continual learning regimes. One such +challenge is plasticity loss, wherein a neural network trained in an online +fashion displays a degraded ability to fit new tasks. This problem has been +extensively studied in both supervised learning and off-policy reinforcement +learning (RL), where a number of remedies have been proposed. Still, plasticity +loss has received less attention in the on-policy deep RL setting. Here we +perform an extensive set of experiments examining plasticity loss and a variety +of mitigation methods in on-policy deep RL. We demonstrate that plasticity loss +is pervasive under domain shift in this regime, and that a number of methods +developed to resolve it in other settings fail, sometimes even performing worse +than applying no intervention at all. In contrast, we find that a class of +``regenerative'' methods are able to consistently mitigate plasticity loss in a +variety of contexts, including in gridworld tasks and more challenging +environments like Montezuma's Revenge and ProcGen. + +
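+ One widely studied member of the regenerative family referenced above is shrink-and-perturb, which periodically scales weights toward zero and adds fresh noise, partially re-initializing the network; whether it matches the paper's exact method list is an assumption, and the coefficients are illustrative.

    import torch

    @torch.no_grad()
    def shrink_and_perturb(model, shrink=0.8, noise_std=0.01):
        # Applied every few thousand updates to restore plasticity.
        for p in model.parameters():
            p.mul_(shrink).add_(torch.randn_like(p) * noise_std)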
+
+
+
+
+ + ♻ ☆ Human-in-the-Loop Causal Discovery under Latent Confounding using + Ancestral GFlowNets + + +
+ Structure learning is the crux of causal inference. Notably, causal discovery +(CD) algorithms are brittle when data is scarce, possibly inferring imprecise +causal relations that contradict expert knowledge -- especially when +considering latent confounders. To aggravate the issue, most CD methods do not +provide uncertainty estimates, making it hard for users to interpret results +and improve the inference process. Surprisingly, while CD is a human-centered +affair, no works have focused on building methods that both 1) output +uncertainty estimates that can be verified by experts and 2) interact with +those experts to iteratively refine CD. To solve these issues, we start by +proposing to sample (causal) ancestral graphs proportionally to a belief +distribution based on a score function, such as the Bayesian information +criterion (BIC), using generative flow networks. Then, we leverage the +diversity in candidate graphs and introduce an optimal experimental design to +iteratively probe the expert about the relations among variables, effectively +reducing the uncertainty of our belief over ancestral graphs. Finally, we +update our samples to incorporate human feedback via importance sampling. +Importantly, our method does not require causal sufficiency (i.e., unobserved +confounders may exist). Experiments with synthetic observational data show that +our method can accurately sample from distributions over ancestral graphs and +that we can greatly improve inference quality with human aid. + +
+
+
+
+
+ + ♻ ☆ Efficient Adversarial Training in LLMs with Continuous Attacks + + +
+ Large language models (LLMs) are vulnerable to adversarial attacks that can bypass their safety guardrails. In many domains, adversarial training has proven to be one of the most promising methods to reliably improve robustness against such attacks. Yet, in the context of LLMs, current methods for adversarial training are hindered by the high computational costs required to perform discrete adversarial attacks at each training iteration. We address this problem by instead calculating adversarial attacks in the continuous embedding space of the LLM, which is orders of magnitude more efficient. We propose a fast adversarial training algorithm (C-AdvUL) composed of two losses: the first makes the model robust on continuous embedding attacks computed on an adversarial behaviour dataset; the second ensures the usefulness of the final model by fine-tuning on utility data. Moreover, we introduce C-AdvIPO, an adversarial variant of IPO that does not require utility data for adversarially robust alignment. Our empirical evaluation on five models from different families (Gemma, Phi3, Mistral, Zephyr, Llama2) and at different scales (2B, 3.8B, 7B) shows that both algorithms substantially enhance LLM robustness against discrete attacks (GCG, AutoDAN, PAIR), while maintaining utility. Our results demonstrate that robustness to continuous perturbations can extrapolate to discrete threat models. Thereby, we present a path toward scalable adversarial training algorithms for robustly aligning LLMs.
+
+
+ comment: 19 pages, 4 figures +
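+ The continuous attack can be sketched as signed gradient ascent on an additive perturbation of the token embeddings, assuming a HuggingFace-style model that accepts inputs_embeds; step count, radius, and the loss are illustrative.

    import torch

    def embedding_attack(model, embeds, labels, loss_fn, eps=0.1, steps=10):
        delta = torch.zeros_like(embeds, requires_grad=True)
        for _ in range(steps):
            loss = loss_fn(model(inputs_embeds=embeds + delta).logits, labels)
            loss.backward()
            with torch.no_grad():
                delta += (eps / steps) * delta.grad.sign()  # ascent step
                delta.clamp_(-eps, eps)                     # stay in the ball
                delta.grad.zero_()
            model.zero_grad(set_to_none=True)
        return (embeds + delta).detach()  # adversarial embeddings for training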
+
+
+
+
+ + ♻ ☆ Diffusion Spectral Representation for Reinforcement Learning NeurIPS 2024 + + +
+ Diffusion-based models have achieved notable empirical successes in +reinforcement learning (RL) due to their expressiveness in modeling complex +distributions. Despite existing methods being promising, the key challenge of +extending existing methods for broader real-world applications lies in the +computational cost at inference time, i.e., sampling from a diffusion model is +considerably slow as it often requires tens to hundreds of iterations to +generate even one sample. To circumvent this issue, we propose to leverage the +flexibility of diffusion models for RL from a representation learning +perspective. In particular, by exploiting the connection between diffusion +models and energy-based models, we develop Diffusion Spectral Representation +(Diff-SR), a coherent algorithm framework that enables extracting sufficient +representations for value functions in Markov decision processes (MDP) and +partially observable Markov decision processes (POMDP). We further demonstrate +how Diff-SR facilitates efficient policy optimization and practical algorithms +while explicitly bypassing the difficulty and inference cost of sampling from +the diffusion model. Finally, we provide comprehensive empirical studies to +verify the benefits of Diff-SR in delivering robust and advantageous +performance across various benchmarks with both fully and partially observable +settings. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ On the Limitations of Fractal Dimension as a Measure of Generalization + + +
+ Bounding and predicting the generalization gap of overparameterized neural +networks remains a central open problem in theoretical machine learning. There +is a recent and growing body of literature that proposes the framework of +fractals to model optimization trajectories of neural networks, motivating +generalization bounds and measures based on the fractal dimension of the +trajectory. Notably, the persistent homology dimension has been proposed to +correlate with the generalization gap. This paper performs an empirical +evaluation of these persistent homology-based generalization measures, with an +in-depth statistical analysis. Our study reveals confounding effects in the +observed correlation between generalization and topological measures due to the +variation of hyperparameters. We also observe that fractal dimension fails to +predict generalization of models trained from poor initializations. We lastly +reveal the intriguing manifestation of model-wise double descent in these +topological generalization measures. Our work forms a basis for a deeper +investigation of the causal relationships between fractal geometry, topological +data analysis, and neural network optimization. + +
+
+
+
+
+ + ♻ ☆ SelfCodeAlign: Self-Alignment for Code Generation NeurIPS 2024 + + +
+ Instruction tuning is a supervised fine-tuning approach that significantly +improves the ability of large language models (LLMs) to follow human +instructions. We propose SelfCodeAlign, the first fully transparent and +permissive pipeline for self-aligning code LLMs without extensive human +annotations or distillation. SelfCodeAlign employs the same base model for +inference throughout the data generation process. It first extracts diverse +coding concepts from high-quality seed snippets to generate new tasks. It then +samples multiple responses per task, pairs each with test cases, and validates +them in a sandbox environment. Finally, passing examples are selected for +instruction tuning. In our primary experiments, we use SelfCodeAlign with +CodeQwen1.5-7B to generate a dataset of 74k instruction-response pairs. +Finetuning on this dataset leads to a model that achieves a 67.1 pass@1 on +HumanEval+, surpassing CodeLlama-70B-Instruct despite being ten times smaller. +Across all benchmarks, this finetuned model consistently outperforms the +original version trained with OctoPack, the previous state-of-the-art method +for instruction tuning without human annotations or distillation. Additionally, +we show that SelfCodeAlign is effective across LLMs of various sizes, from 3B +to 33B, and that the base models can benefit more from alignment with their own +data distribution. We further validate each component's effectiveness in our +pipeline, showing that SelfCodeAlign outperforms both direct distillation from +GPT-4o and leading GPT-3.5-based distillation methods, such as OSS-Instruct and +Evol-Instruct. SelfCodeAlign has also led to the creation of +StarCoder2-Instruct, the first fully transparent, permissively licensed, and +self-aligned code LLM that achieves state-of-the-art coding performance. + +
+
+ comment: Accepted to NeurIPS 2024 +
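+ The sandbox-validation step can be pictured as executing each candidate solution together with its generated tests in a subprocess and keeping only passing pairs; a real sandbox needs far stronger isolation than this sketch.

    import os, subprocess, tempfile

    def passes_tests(solution_code: str, test_code: str, timeout=10) -> bool:
        with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
            f.write(solution_code + "\n\n" + test_code)
            path = f.name
        try:
            r = subprocess.run(["python", path], capture_output=True,
                               timeout=timeout)
            return r.returncode == 0   # tests raised nothing => keep the pair
        except subprocess.TimeoutExpired:
            return False
        finally:
            os.unlink(path)

    # kept = [(task, sol) for task, sol, tests in candidates
    #         if passes_tests(sol, tests)]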
+
+
+
+
+ + ♻ ☆ Flexible Fairness-Aware Learning via Inverse Conditional Permutation + + +
+ Equalized odds, as a popular notion of algorithmic fairness, aims to ensure +that sensitive variables, such as race and gender, do not unfairly influence +the algorithm's prediction when conditioning on the true outcome. Despite rapid +advancements, current research primarily focuses on equalized odds violations +caused by a single sensitive attribute, leaving the challenge of simultaneously +accounting for multiple attributes largely unaddressed. We bridge this gap by +introducing an in-processing fairness-aware learning approach, FairICP, which +integrates adversarial learning with a novel inverse conditional permutation +scheme. FairICP offers a theoretically justified, flexible, and efficient +scheme to promote equalized odds under fairness conditions described by complex +and multidimensional sensitive attributes. The efficacy and adaptability of our +method are demonstrated through both simulation studies and empirical analyses +of real-world datasets. + +
+
+
+
+
+ + ♻ ☆ Simplifying Latent Dynamics with Softly State-Invariant World Models + + +
+ To solve control problems via model-based reasoning or planning, an agent needs to know how its actions affect the state of the world. The actions an agent has at its disposal often change the state of the environment in systematic ways. However, existing techniques for world modelling do not guarantee that the effects of actions are represented in such systematic ways. We introduce the Parsimonious Latent Space Model (PLSM), a world model that regularizes the latent dynamics to make the effect of the agent's actions more predictable. Our approach minimizes the mutual information between latent states and the change that an action produces in the agent's latent state, in turn minimizing the dependence the state has on the dynamics. This makes the world model softly state-invariant. We combine PLSM with different model classes used for i) future latent state prediction, ii) planning, and iii) model-free reinforcement learning. We find that our regularization improves accuracy, generalization, and performance in downstream tasks, highlighting the importance of systematic treatment of actions in world models.
+
+
+
+
+
+ + ♻ ☆ Designing User-Centric Behavioral Interventions to Prevent Dysglycemia + with Novel Counterfactual Explanations + + +
+ Monitoring unexpected health events and taking actionable measures to avert +them beforehand is central to maintaining health and preventing disease. +Therefore, a tool capable of predicting adverse health events and offering +users actionable feedback about how to make changes in their diet, exercise, +and medication to prevent abnormal health events could have significant +societal impacts. Counterfactual explanations can provide insights into why a +model made a particular prediction by generating hypothetical instances that +are similar to the original input but lead to a different prediction outcome. +Therefore, counterfactuals can be viewed as a means to design AI-driven health +interventions to not only predict but also prevent adverse health outcomes such +as blood glucose spikes, diabetes, and heart disease. In this paper, we design +\textit{\textbf{ExAct}}, a novel model-agnostic framework for generating +counterfactual explanations for chronic disease prevention and management. +Leveraging insights from adversarial learning, ExAct characterizes the decision +boundary for high-dimensional data and performs a grid search to generate +actionable interventions. ExAct is unique in integrating prior knowledge about +user preferences of feasible explanations into the process of counterfactual +generation. ExAct is evaluated extensively using four real-world datasets and +external simulators. With $82.8\%$ average validity in the simulation-aided +validation, ExAct surpasses the state-of-the-art techniques for generating +counterfactual explanations by at least $10\%$. Besides, counterfactuals from +ExAct exhibit at least $6.6\%$ improved proximity compared to previous +research. + +
+
+
+
+
+ + ♻ ☆ Improving Node Representation by Boosting Target-Aware Contrastive Loss + + +
+ Graphs model complex relationships between entities, with nodes and edges capturing intricate connections. Node representation learning involves transforming nodes into low-dimensional embeddings. These embeddings are typically used as features for downstream tasks. Therefore, their quality has a significant impact on task performance. Existing approaches for node representation learning span (semi-)supervised, unsupervised, and self-supervised paradigms. In graph domains, (semi-)supervised learning often only optimizes models based on class labels, neglecting other abundant graph signals, which limits generalization. While self-supervised or unsupervised learning produces representations that better capture underlying graph signals, the usefulness of these captured signals for downstream target tasks can vary. To bridge this gap, we introduce Target-Aware Contrastive Learning (Target-aware CL), which aims to enhance target task performance by maximizing the mutual information between the target task and node representations with a self-supervised learning process. This is achieved through a sampling function, XGBoost Sampler (XGSampler), to sample proper positive examples for the proposed Target-Aware Contrastive Loss (XTCL). By minimizing XTCL, Target-aware CL increases the mutual information between the target task and node representations, such that model generalization is improved. Additionally, XGSampler enhances the interpretability of each signal by showing the weights for sampling the proper positive examples. We show experimentally that XTCL significantly improves performance on two target tasks, node classification and link prediction, compared to state-of-the-art models.
+
+
+
+
+
+ + ♻ ☆ Dimension-free deterministic equivalents for random feature regression NeurIPS 2024 + + +
+ In this work we investigate the generalization performance of random feature +ridge regression (RFRR). Our main contribution is a general deterministic +equivalent for the test error of RFRR. Specifically, under a certain +concentration property, we show that the test error is well approximated by a +closed-form expression that only depends on the feature map eigenvalues. +Notably, our approximation guarantee is non-asymptotic, multiplicative, and +independent of the feature map dimension -- allowing for infinite-dimensional +features. We expect this deterministic equivalent to hold broadly beyond our +theoretical analysis, and we empirically validate its predictions on various +real and synthetic datasets. As an application, we derive sharp excess error +rates under standard power-law assumptions of the spectrum and target decay. In +particular, we provide a tight result for the smallest number of features +achieving optimal minimax error rate. + +
+
+ comment: NeurIPS 2024 camera-ready version +
+
+
+
+
+ + ♻ ☆ Leveraging Recurrent Neural Networks for Predicting Motor Movements from + Primate Motor Cortex Neural Recordings + + +
+ This paper presents an efficient deep learning solution for decoding motor movements from neural recordings in non-human primates. An Autoencoder Gated Recurrent Unit (AEGRU) model was adopted as the model architecture for this task. The autoencoder is only used during the training stage to achieve better generalization. Together with the preprocessing techniques, our model achieved a 0.71 $R^2$ score, surpassing the baseline models in Neurobench and ranking first for $R^2$ in the IEEE BioCAS 2024 Grand Challenge on Neural Decoding. Model pruning is also applied, leading to a reduction of 41.4% of the multiply-accumulate (MAC) operations with little change in the $R^2$ score compared to the unpruned model.
+
+
+
+
+
+ + ♻ ☆ Nova: A Practical and Advanced Alignment + + +
+ We introduce Nova, a suite of practical alignment techniques employed in a series of empirically validated high-performing models. This represents the first comprehensive account of alignment methodologies, offering valuable insights for advancing AI research. We investigate the critical components that enhance model performance during the alignment process, including optimization methods, data strategies, capability enhancements, and evaluation processes. The process spans three key stages: Prompt Augmentation System (PAS), Supervised Fine-Tuning (SFT), and Preference Alignment. The problems encountered, the solutions applied, and the improvements made are thoroughly recorded. Through comparisons across well-established benchmarks, we highlight the technological advancements enabled by Nova Alignment. Importantly, Qwen2-Nova-72B and Llama3-PBM-Nova-70B are instruct versions of the Qwen2-72B and Llama-3-70B base models, optimized through Nova. The Nova models show significant core improvements, with user experience gains of 17% to 28%, and excel on specialized benchmarks. In open-source benchmark evaluations, both Qwen2-Nova-72B and Llama3-PBM-Nova-70B consistently outperform their respective official instruct versions across nearly all datasets. This report aims to clarify the key technologies behind the alignment process, fostering a deeper understanding within the community. The Llama3-PBM-Nova-70B model is available at https://huggingface.co/PKU-Baichuan-MLSystemLab/Llama3-PBM-Nova-70B.
+
+
+
+
+
+ + ♻ ☆ Erasing Self-Supervised Learning Backdoor by Cluster Activation Masking + + +
+ Self-Supervised Learning (SSL) is an effective paradigm for learning representations from unlabeled data, such as text, images, and videos. However, researchers have recently found that SSL is vulnerable to backdoor attacks. The attacker can embed hidden SSL backdoors via a few poisoned examples in the training dataset and maliciously manipulate the behavior of downstream models. To defend against SSL backdoor attacks, a feasible route is to detect and remove the poisonous samples in the training set. However, existing SSL backdoor defense methods fail to detect the poisonous samples precisely. In this paper, we propose to erase the SSL backdoor by cluster activation masking and introduce a novel PoisonCAM method. After obtaining the victim model trained on the poisoned dataset, our method can precisely detect poisonous samples based on the assumption that masking the backdoor trigger can effectively change the activation of a downstream clustering model. In experiments, our PoisonCAM achieves 96\% accuracy for backdoor trigger detection, compared to 3\% for the state-of-the-art method on poisoned ImageNet-100. Moreover, our proposed PoisonCAM significantly improves the performance of the trained SSL model under backdoor attacks compared to the state-of-the-art method. Our code, data, and trained models will be released once this paper is accepted.
+
+
+
+
+
+ + ♻ ☆ Neur2BiLO: Neural Bilevel Optimization + + +
+ Bilevel optimization deals with nested problems in which a leader takes the +first decision to minimize their objective function while accounting for a +follower's best-response reaction. Constrained bilevel problems with integer +variables are particularly notorious for their hardness. While exact solvers +have been proposed for mixed-integer linear bilevel optimization, they tend to +scale poorly with problem size and are hard to generalize to the non-linear +case. On the other hand, problem-specific algorithms (exact and heuristic) are +limited in scope. Under a data-driven setting in which similar instances of a +bilevel problem are solved routinely, our proposed framework, Neur2BiLO, embeds +a neural network approximation of the leader's or follower's value function, +trained via supervised regression, into an easy-to-solve mixed-integer program. +Neur2BiLO serves as a heuristic that produces high-quality solutions extremely +fast for four applications with linear and non-linear objectives and pure and +mixed-integer variables. + +
+
+
+
+
+ + ♻ ☆ QuanTA: Efficient High-Rank Fine-Tuning of LLMs with Quantum-Informed + Tensor Adaptation + + +
+ We propose Quantum-informed Tensor Adaptation (QuanTA), a novel, easy-to-implement fine-tuning method with no inference overhead for large-scale pre-trained language models. By leveraging quantum-inspired methods derived from quantum circuit structures, QuanTA enables efficient high-rank fine-tuning, surpassing the limitations of Low-Rank Adaptation (LoRA), whose low-rank approximation may fail for complicated downstream tasks. Our approach is theoretically supported by the universality theorem and the rank representation theorem to achieve efficient high-rank adaptations. Experiments demonstrate that QuanTA significantly enhances commonsense reasoning, arithmetic reasoning, and scalability compared to traditional methods. Furthermore, QuanTA shows superior performance with fewer trainable parameters compared to other approaches and can be designed to integrate with existing fine-tuning algorithms for further improvement, providing a scalable and efficient solution for fine-tuning large language models and advancing the state of the art in natural language processing.
+
+
+
+
+
+ + ♻ ☆ Reinforced In-Context Black-Box Optimization + + +
+ Black-Box Optimization (BBO) has found successful applications in many fields +of science and engineering. Recently, there has been a growing interest in +meta-learning particular components of BBO algorithms to speed up optimization +and get rid of tedious hand-crafted heuristics. As an extension, learning the +entire algorithm from data requires the least labor from experts and can +provide the most flexibility. In this paper, we propose RIBBO, a method to +reinforce-learn a BBO algorithm from offline data in an end-to-end fashion. +RIBBO employs expressive sequence models to learn the optimization histories +produced by multiple behavior algorithms and tasks, leveraging the in-context +learning ability of large models to extract task information and make decisions +accordingly. Central to our method is to augment the optimization histories +with \textit{regret-to-go} tokens, which are designed to represent the +performance of an algorithm based on cumulative regret over the future part of +the histories. The integration of regret-to-go tokens enables RIBBO to +automatically generate sequences of query points that satisfy the user-desired +regret, which is verified by its universally good empirical performance on +diverse problems, including BBO benchmark functions, hyper-parameter +optimization and robot control problems. + +
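+ The regret-to-go tokens can be sketched as suffix sums of simple regret over an optimization history (maximization convention; a known optimum f_best is assumed here purely for the sketch):

    import numpy as np

    def regret_to_go(values, f_best):
        regrets = f_best - np.asarray(values, dtype=float)  # per-step regret
        return np.cumsum(regrets[::-1])[::-1]               # suffix sums

    history = [0.2, 0.5, 0.7, 0.9]              # best-so-far function values
    print(regret_to_go(history, f_best=1.0))    # [1.7, 0.9, 0.4, 0.1]

Conditioning generation on a small desired regret-to-go then asks the sequence model to produce query points that close the remaining gap quickly.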
+
+
+
+
+ + ♻ ☆ Multimodal Fusion on Low-quality Data: A Comprehensive Survey + + +
+ Multimodal fusion focuses on integrating information from multiple modalities with the goal of more accurate prediction, which has achieved remarkable progress in a wide range of scenarios, including autonomous driving and medical diagnosis. However, the reliability of multimodal fusion remains largely unexplored, especially under low-quality data settings. This paper surveys the common challenges and recent advances of multimodal fusion in the wild and presents them in a comprehensive taxonomy. From a data-centric view, we identify four main challenges faced by multimodal fusion on low-quality data, namely (1) noisy multimodal data contaminated with heterogeneous noise, (2) incomplete multimodal data in which some modalities are missing, (3) imbalanced multimodal data in which the qualities or properties of different modalities differ significantly, and (4) quality-varying multimodal data in which the quality of each modality changes dynamically across samples. This new taxonomy will enable researchers to understand the state of the field and identify several potential directions. We also provide a discussion of the open problems in this field together with interesting future research directions.
+
+
+ comment: Feel free to comment on our manuscript: qingyangzhang@tju$.$edu$.$cn +
+
+
+
+
+ + ♻ ☆ Analysis of Bootstrap and Subsampling in High-dimensional Regularized + Regression + + +
+ We investigate popular resampling methods for estimating the uncertainty of +statistical models, such as subsampling, bootstrap and the jackknife, and their +performance in high-dimensional supervised regression tasks. We provide a tight +asymptotic description of the biases and variances estimated by these methods +in the context of generalized linear models, such as ridge and logistic +regression, taking the limit where the number of samples $n$ and dimension $d$ +of the covariates grow at a comparable fixed rate $\alpha\!=\! n/d$. Our +findings are three-fold: i) resampling methods are fraught with problems in +high dimensions and exhibit the double-descent-like behavior typical of these +situations; ii) only when $\alpha$ is large enough do they provide consistent +and reliable error estimations (we give convergence rates); iii) in the +over-parametrized regime $\alpha\!<\!1$ relevant to modern machine learning +practice, their predictions are not consistent, even with optimal +regularization. + +
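+ For concreteness, the kind of resampling estimator analyzed above, a pairs bootstrap of ridge coefficients, looks as follows; the paper's message is that such variance estimates become unreliable when alpha = n/d is small.

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.default_rng(0)
    n, d = 200, 50
    X = rng.standard_normal((n, d))
    y = X @ rng.standard_normal(d) + rng.standard_normal(n)

    coefs = []
    for _ in range(200):                     # bootstrap replicates
        idx = rng.integers(0, n, n)          # resample rows with replacement
        coefs.append(Ridge(alpha=1.0).fit(X[idx], y[idx]).coef_)
    var_hat = np.var(coefs, axis=0)          # per-coefficient variance estimate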
+
+
+
+
+ + ♻ ☆ Theoretical Foundations of Deep Selective State-Space Models NeurIPS + + +
+ Structured state-space models (SSMs) such as S4, stemming from the seminal work of Gu et al., are gaining popularity as effective approaches for modeling sequential data. Deep SSMs demonstrate outstanding performance across a diverse set of domains, at a reduced training and inference cost compared to attention-based transformers. Recent developments show that if the linear recurrence powering SSMs allows for multiplicative interactions between inputs and hidden states (e.g. GateLoop, Mamba, GLA), then the resulting architecture can surpass, in both accuracy and efficiency, attention-powered foundation models trained on text at billion-parameter scales. In this paper, we give theoretical grounding to this recent finding using tools from Rough Path Theory: we show that when random linear recurrences are equipped with simple input-controlled transitions (selectivity mechanism), then the hidden state is provably a low-dimensional projection of a powerful mathematical object called the signature of the input -- capturing non-linear interactions between tokens at distinct timescales. Our theory not only motivates the success of modern selective state-space models such as Mamba but also provides a solid framework to understand the expressive power of future SSM variants.
+
+
+ comment: NeurIPS Version w/ minor edits +
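+ A toy example of the input-controlled transitions (selectivity) that the theory covers: the decay applied at each step is a function of the current input, in contrast to a fixed linear-time-invariant recurrence. Shapes and the sigmoid gate are illustrative.

    import numpy as np

    def selective_scan(xs, W_gate, W_in):
        h = np.zeros(W_in.shape[0])
        for x in xs:                                   # x: (d_in,)
            a = 1.0 / (1.0 + np.exp(-(W_gate @ x)))    # input-dependent decay
            h = a * h + W_in @ x                       # gated linear recurrence
        return h

    d_in, d_h = 4, 8
    xs = np.random.randn(16, d_in)
    h = selective_scan(xs, np.random.randn(d_h, d_in), np.random.randn(d_h, d_in))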
+
+
+
+
+ + ♻ ☆ Accelerating Transfer Learning with Near-Data Computation on Cloud + Object Stores SoCC '24 + + +
+ Storage disaggregation underlies today's cloud and is naturally complemented by pushing down some computation to storage, thus mitigating the potential network bottleneck between the storage and compute tiers. We show how ML training benefits from storage pushdowns by focusing on transfer learning (TL), the widespread technique that democratizes ML by reusing existing knowledge on related tasks. We propose HAPI, a new TL processing system centered around two complementary techniques that address challenges introduced by disaggregation. First, applications must carefully balance execution across tiers for performance. HAPI judiciously splits the TL computation during the feature extraction phase, yielding pushdowns that not only improve network time but also improve total TL training time by overlapping the execution of consecutive training iterations across tiers. Second, operators want resource efficiency from the storage-side computational resources. HAPI employs storage-side batch size adaptation, allowing increased storage-side pushdown concurrency without affecting training accuracy. HAPI yields up to 2.5x training speed-up while, in 86.8% of cases, choosing the best-performing split point or one that is at most 5% off from the best.
+
+
+ comment: To appear in the proceedings of SoCC '24 +
+
+
+
+
+ + ♻ ☆ Kuro Siwo: 33 billion $m^2$ under the water. A global multi-temporal + satellite dataset for rapid flood mapping NeurIPS 2024 + + +
+ Global floods, exacerbated by climate change, pose severe threats to human +life, infrastructure, and the environment. Recent catastrophic events in +Pakistan and New Zealand underscore the urgent need for precise flood mapping +to guide restoration efforts, understand vulnerabilities, and prepare for +future occurrences. While Synthetic Aperture Radar (SAR) remote sensing offers +day-and-night, all-weather imaging capabilities, its application in deep +learning for flood segmentation is limited by the lack of large annotated +datasets. To address this, we introduce Kuro Siwo, a manually annotated +multi-temporal dataset, spanning 43 flood events globally. Our dataset maps +more than 338 billion $m^2$ of land, with 33 billion designated as either +flooded areas or permanent water bodies. Kuro Siwo includes a highly processed +product optimized for flood mapping based on SAR Ground Range Detected, and a +primal SAR Single Look Complex product with minimal preprocessing, designed to +promote research on the exploitation of both the phase and amplitude +information and to offer maximum flexibility for downstream task preprocessing. +To leverage advances in large scale self-supervised pretraining methods for +remote sensing data, we augment Kuro Siwo with a large unlabeled set of SAR +samples. Finally, we provide an extensive benchmark, namely BlackBench, +offering strong baselines for a diverse set of flood events from Europe, +America, Africa, Asia and Australia. + +
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems are crucial for cartography, disaster +surveillance, and resource administration. Nonetheless, they encounter +considerable obstacles in the processing and transmission of extensive data, +especially in specialized domains such as precision agriculture and real-time +disaster response. Earth observation satellites, outfitted with remote sensing +technology, gather data from onboard sensors and IoT-enabled terrestrial +objects, delivering important information remotely. Domain-adapted Large +Language Models (LLMs) provide a solution by enabling the integration of raw +and processed EO data. Through domain adaptation, LLMs improve the assimilation +and analysis of many data sources, tackling the intricacies of specialized +datasets in agriculture and disaster response. This data synthesis, directed by +LLMs, enhances the precision and pertinence of conveyed information. This study +provides a thorough examination of using semantic inference and deep learning +for sophisticated EO systems. It presents an innovative architecture for +semantic communication in EO satellite networks, designed to improve data +transmission efficiency using semantic processing methodologies. Recent +advancements in onboard processing technologies enable dependable, adaptable, +and energy-efficient data management in orbit. These improvements guarantee +reliable performance in adverse space circumstances using radiation-hardened +and reconfigurable technology. Collectively, these advancements enable +next-generation satellite missions with improved processing capabilities, +crucial for operational flexibility and real-time decision-making in 6G +satellite communication. + +
+
+ comment: 17 pages, 7 figures, Journal +
+
+
+
+
+ + ♻ ☆ Intruding with Words: Towards Understanding Graph Injection Attacks at + the Text Level NeurIPS 2024 + + +
+ Graph Neural Networks (GNNs) excel across various applications but remain
+vulnerable to adversarial attacks, particularly Graph Injection Attacks (GIAs),
+which inject malicious nodes into the original graph and pose realistic
+threats. Text-attributed graphs (TAGs), where nodes are associated with textual
+features, are crucial due to their prevalence in real-world applications and
+are commonly used to evaluate these vulnerabilities. However, existing research
+focuses only on embedding-level GIAs, which inject node embeddings rather than
+actual textual content, limiting their applicability and simplifying detection.
+In this paper, we pioneer the exploration of GIAs at the text level, presenting
+three novel attack designs that inject textual content into the graph. Through
+theoretical and empirical analysis, we demonstrate that text interpretability,
+a factor previously overlooked at the embedding level, plays a crucial role in
+attack strength. Among the designs we investigate, the Word-frequency-based
+Text-level GIA (WTGIA) is particularly notable for its balance between
+performance and interpretability. Despite the success of WTGIA, we discover
+that defenders can easily enhance their defenses with customized text embedding
+methods or large language model (LLM)-based predictors. These insights
+underscore the necessity for further research into the potential and practical
+significance of text-level GIAs.
+
</p>
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Tree Ensembles for Contextual Bandits + + +
+ We propose a new framework for contextual multi-armed bandits based on tree +ensembles. Our framework adapts two widely used bandit methods, Upper +Confidence Bound and Thompson Sampling, for both standard and combinatorial +settings. As part of this framework, we propose a novel method of estimating +the uncertainty in tree ensemble predictions. We further demonstrate the +effectiveness of our framework via several experimental studies, employing +XGBoost and random forests, two popular tree ensemble methods. Compared to +state-of-the-art methods based on decision trees and neural networks, our +methods exhibit superior performance in terms of both regret minimization and +computational runtime, when applied to benchmark datasets and the real-world +application of navigation over road networks. + +
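+<p>
+ One simple way to realize the uncertainty estimate sketched in the abstract is
+to use disagreement across the trees of the ensemble; the snippet below (ours,
+illustrative -- the paper's estimator may differ) scores arms by mean plus a
+scaled per-tree standard deviation, i.e. a UCB rule.
+</p>
+<pre><code class="language-python">
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+
+rng = np.random.default_rng(2)
+
+def ucb_choose(models, context, beta=1.0):
+    scores = []
+    for m in models:
+        per_tree = np.array([t.predict(context[None, :])[0] for t in m.estimators_])
+        scores.append(per_tree.mean() + beta * per_tree.std())   # optimism bonus
+    return int(np.argmax(scores))
+
+d, n_arms = 5, 3
+models = []
+for a in range(n_arms):   # toy warm start: one forest per arm
+    X = rng.normal(size=(50, d))
+    y = X[:, a % d] + 0.1 * rng.normal(size=50)
+    models.append(RandomForestRegressor(n_estimators=25, random_state=a).fit(X, y))
+print(ucb_choose(models, rng.normal(size=d)))
+</code></pre>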
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ♻ ☆ Bounds and Sensitivity Analysis of the Causal Effect Under + Outcome-Independent MNAR Confounding + + +
+ We report assumption-free bounds for any contrast between the probabilities +of the potential outcome under exposure and non-exposure when the confounders +are missing not at random. We assume that the missingness mechanism is +outcome-independent. We also report a sensitivity analysis method to complement +our bounds. + +
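+<p>
+ For orientation only (the paper's bounds exploit the outcome-independent MNAR
+structure and will generally differ), the classical assumption-free template
+resolves the unobserved part of each counterfactual to its extremes:
+\[ P(Y{=}1, A{=}a) \;\le\; P\{Y(a){=}1\} \;\le\; P(Y{=}1, A{=}a) + P(A{\neq}a), \]
+so that, e.g., the risk difference is bracketed by bounds of width one:
+\[ P\{Y(1){=}1\} - P\{Y(0){=}1\} \;\in\; \big[\, P(Y{=}1,A{=}1) - P(Y{=}1,A{=}0) - P(A{=}1),\;
+P(Y{=}1,A{=}1) - P(Y{=}1,A{=}0) + P(A{=}0) \,\big]. \]
+</p>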
+
+
+
+
+ + ♻ ☆ Unity by Diversity: Improved Representation Learning in Multimodal VAEs + + +
+ Variational Autoencoders for multimodal data hold promise for many tasks in
+data analysis, such as representation learning, conditional generation, and
+imputation. Current architectures either share the encoder output, decoder
+input, or both across modalities to learn a shared representation. Such
+architectures impose hard constraints on the model. In this work, we show that
+a better latent representation can be obtained by replacing these hard
+constraints with a soft constraint. We propose a new mixture-of-experts prior,
+softly guiding each modality's latent representation towards a shared aggregate
+posterior. This approach results in a superior latent representation and allows
+each encoding to better preserve information from its uncompressed original
+features. In extensive experiments on multiple benchmark datasets and two
+challenging real-world datasets, we show improved learned latent
+representations and imputation of missing data modalities compared to existing
+methods.
+
</p>
+
+ comment: Accepted at NeurIPS 2024
+
</p>
+
+
+
+
+ + ♻ ☆ Efficiency for Free: Ideal Data Are Transportable Representations + + +
+ Data, the seminal opportunity and challenge in modern machine learning,
+currently constrains the scalability of representation learning and impedes the
+pace of model evolution. In this work, we investigate the efficiency properties
+of data from both optimization and generalization perspectives. Our theoretical
+and empirical analysis reveals an unexpected finding: for a given task,
+utilizing a publicly available, task- and architecture-agnostic model (referred
+to as the `prior model' in this paper) can effectively produce efficient data.
+Building on this insight, we propose the Representation Learning Accelerator
+(\algopt), which promotes the formation and utilization of efficient data,
+thereby accelerating representation learning. Utilizing a ResNet-18 pre-trained
+on CIFAR-10 as a prior model to inform ResNet-50 training on ImageNet-1K cuts
+computational costs in half while matching the accuracy of a model trained with
+the original BYOL at full cost. Our code is available at:
+\url{https://github.com/LINs-lab/ReLA}.
+
</p>
+
+ comment: Code: https://github.com/LINs-lab/ReLA +
+
+
+
+
+ + ♻ ☆ Improved Generation of Adversarial Examples Against Safety-aligned LLMs + + +
+ Adversarial prompts generated using gradient-based methods exhibit
+outstanding performance in performing automatic jailbreak attacks against
+safety-aligned LLMs. Nevertheless, due to the discrete nature of texts, the
+input gradient of LLMs struggles to precisely reflect the magnitude of loss
+change that results from token replacements in the prompt, leading to limited
+attack success rates against safety-aligned LLMs, even in the white-box
+setting. In this paper, we explore a new perspective on this problem,
+suggesting that it can be alleviated by leveraging innovations inspired by
+transfer-based attacks that were originally proposed for attacking black-box
+image classification models. For the first time, we adapt the ideas behind
+effective transfer-based attacks, i.e., the Skip Gradient Method and
+Intermediate Level Attack, to gradient-based adversarial prompt generation and
+achieve significant performance gains without introducing obvious computational
+cost. Meanwhile, by discussing the mechanisms behind the gains, we draw new
+insights and develop proper combinations of these methods. Our empirical
+results show that 87% of the query-specific adversarial suffixes generated by
+the developed combination can induce Llama-2-7B-Chat to produce the output that
+exactly matches the target string on AdvBench. This match rate is 33% higher
+than that of a very strong baseline known as GCG, demonstrating advanced
+discrete optimization for adversarial prompt generation against LLMs. In
+addition, without introducing obvious cost, the combination achieves >30%
+absolute increase in attack success rates compared with GCG when generating
+both query-specific (38% -> 68%) and universal adversarial prompts (26.68% ->
+60.32%) for attacking the Llama-2-7B-Chat model on AdvBench. Code at:
+https://github.com/qizhangli/Gradient-based-Jailbreak-Attacks.
+
</p>
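+<p>
+ Of the two borrowed ideas, the Skip Gradient Method is easiest to picture: in
+its original image-attack form it rescales the gradient flowing through
+residual branches while leaving the forward pass untouched. A minimal PyTorch
+sketch of that trick (ours, not the authors' prompt-attack code):
+</p>
+<pre><code class="language-python">
+import torch
+import torch.nn as nn
+
+class ResidualBlock(nn.Module):
+    def __init__(self, dim, gamma=0.5):
+        super().__init__()
+        self.f = nn.Linear(dim, dim)
+        self.gamma = gamma   # SGM decay factor for the residual branch
+
+    def forward(self, x):
+        out = self.f(x)
+        # forward value is unchanged; backward sees gamma * grad through f
+        out = self.gamma * out + (1 - self.gamma) * out.detach()
+        return x + out   # skip connection keeps its full gradient
+
+x = torch.randn(2, 8, requires_grad=True)
+ResidualBlock(8)(x).sum().backward()
+print(x.grad.norm())
+</code></pre>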
+
+
+
+
+ + ♻ ☆ DASH: Warm-Starting Neural Network Training in Stationary Settings + without Loss of Plasticity NeurIPS 2024 + + +
+ Warm-starting neural network training by initializing networks with +previously learned weights is appealing, as practical neural networks are often +deployed under a continuous influx of new data. However, it often leads to loss +of plasticity, where the network loses its ability to learn new information, +resulting in worse generalization than training from scratch. This occurs even +under stationary data distributions, and its underlying mechanism is poorly +understood. We develop a framework emulating real-world neural network training +and identify noise memorization as the primary cause of plasticity loss when +warm-starting on stationary data. Motivated by this, we propose Direction-Aware +SHrinking (DASH), a method aiming to mitigate plasticity loss by selectively +forgetting memorized noise while preserving learned features. We validate our +approach on vision tasks, demonstrating improvements in test accuracy and +training efficiency. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ TEAM: Topological Evolution-aware Framework for Traffic + Forecasting--Extended Version VLDB 2025 + + +
+ Due to the global trend towards urbanization, people increasingly move to and
+live in cities that then continue to grow. Traffic forecasting plays an
+important role in the intelligent transportation systems of cities as well as
+in spatio-temporal data mining. State-of-the-art forecasting is achieved by
+deep-learning approaches due to their ability to contend with complex
+spatio-temporal dynamics. However, existing methods assume fixed-topology road
+networks and static traffic time series as input. These assumptions fail to
+align with urbanization, where time series are collected continuously and road
+networks evolve over time. In such settings, deep-learning models require
+frequent re-initialization and re-training, imposing high computational costs.
+To enable much more efficient training without jeopardizing model accuracy, we
+propose the Topological Evolution-aware Framework (TEAM) for traffic
+forecasting that incorporates convolution and attention. This combination of
+mechanisms enables better adaptation to newly collected time series, while
+being able to maintain learned knowledge from old time series. TEAM features a
+continual learning module based on the Wasserstein metric that acts as a buffer
+that can identify the most stable and the most changing network nodes. When
+consolidating a model, only data related to stable nodes is employed for
+re-training; further, only data of new nodes and their adjacent nodes, as well
+as data pertaining to changing nodes, are used to re-train the model. Empirical
+studies with two real-world traffic datasets offer evidence that TEAM achieves
+much lower re-training costs than existing methods, without jeopardizing
+forecasting accuracy.
+
</p>
+
+ comment: 16 pages. An extended version of "TEAM: Topological Evolution-aware + Framework for Traffic Forecasting" accepted at PVLDB 2025 +
+
+
+
+
+ + ♻ ☆ Dynamical similarity analysis uniquely captures how computations develop + in RNNs + + +
+ Methods for analyzing representations in neural systems are increasingly +popular tools in neuroscience and mechanistic interpretability. Measures +comparing neural activations across conditions, architectures, and species give +scalable ways to understand information transformation within different neural +networks. However, recent findings show that some metrics respond to spurious +signals, leading to misleading results. Establishing benchmark test cases is +thus essential for identifying the most reliable metric and potential +improvements. We propose that compositional learning in recurrent neural +networks (RNNs) can provide a test case for dynamical representation alignment +metrics. Implementing this case allows us to evaluate if metrics can identify +representations that develop throughout learning and determine if +representations identified by metrics reflect the network's actual +computations. Building both attractor and RNN based test cases, we show that +the recently proposed Dynamical Similarity Analysis (DSA) is more noise robust +and reliably identifies behaviorally relevant representations compared to prior +metrics (Procrustes, CKA). We also demonstrate how such test cases can extend +beyond metric evaluation to study new architectures. Specifically, testing DSA +in modern (Mamba) state space models suggests that these models, unlike RNNs, +may not require changes in recurrent dynamics due to their expressive hidden +states. Overall, we develop test cases that showcase how DSA's enhanced ability +to detect dynamical motifs makes it highly effective for identifying ongoing +computations in RNNs and revealing how networks learn tasks. + +
+
+
+
+
+ + ♻ ☆ A Systematic Survey on Large Language Models for Algorithm Design + + +
+ Algorithm Design (AD) is crucial for effective problem-solving across various +domains. The advent of Large Language Models (LLMs) has notably enhanced the +automation and innovation within this field, offering new perspectives and +promising solutions. Over the past three years, the integration of LLMs into AD +(LLM4AD) has seen substantial progress, with applications spanning +optimization, machine learning, mathematical reasoning, and scientific +discovery. Given the rapid advancements and expanding scope of this field, a +systematic review is both timely and necessary. This paper provides a +systematic review of LLM4AD. First, we offer an overview and summary of +existing studies. Then, we introduce a taxonomy and review the literature +across four dimensions: the roles of LLMs, search methods, prompt methods, and +application domains with a discussion of potential and achievements of LLMs in +AD. Finally, we identify current challenges and highlight several promising +directions for future research. + +
+
+
+
+
+ + ♻ ☆ Continuous-time q-learning for mean-field control problems + + +
+ This paper studies q-learning, recently introduced by Jia and Zhou (2023) as
+the continuous-time counterpart of Q-learning, for continuous-time
+McKean-Vlasov control problems in the setting of entropy-regularized
+reinforcement learning. In contrast to the single agent's control problem in
+Jia and Zhou (2023), the mean-field interaction of agents renders the
+definition of the q-function more subtle, for which we reveal that two distinct
+q-functions naturally arise: (i) the integrated q-function (denoted by $q$) as
+the first-order approximation of the integrated Q-function introduced in Gu,
+Guo, Wei and Xu (2023), which can be learnt by a weak martingale condition
+involving test policies; and (ii) the essential q-function (denoted by $q_e$)
+that is employed in the policy improvement iterations. We show that the two
+q-functions are related via an integral representation under all test policies.
+Based on the weak martingale condition and our proposed searching method of
+test policies, some model-free learning algorithms are devised. In two
+examples, one within and one beyond the LQ control framework, we obtain the
+exact parameterization of the optimal value function and q-functions and
+illustrate our algorithms with simulation experiments.
+
</p>
+
+ comment: Keywords: Continuous-time reinforcement learning, integrated + q-function, mean-field control, weak martingale characterization, test + policies +
+
+
+
+
+ + ♻ ☆ Shortcut-connected Expert Parallelism for Accelerating + Mixture-of-Experts + + +
+ Expert parallelism has been introduced as a strategy to distribute the +computational workload of sparsely-gated mixture-of-experts (MoE) models across +multiple computing devices, facilitating the execution of these increasingly +large-scale models. However, the All-to-All communication intrinsic to expert +parallelism constitutes a significant overhead, diminishing the MoE models' +efficiency. Current optimization approaches offer some relief, yet they are +constrained by the sequential interdependence of communication and computation +operations. To address this limitation, we present a novel shortcut-connected +MoE (ScMoE) architecture with an overlapping parallel strategy, which +effectively decouples communication from its conventional sequence, allowing +for a substantial overlap of 70% to 100% with computation. When compared with +the prevalent top-2 MoE architecture, ScMoE demonstrates training speed +improvements of 30% and 11%, and inference improvements of 40% and 15%, in our +distributed environments with PCIe and NVLink hardware, respectively, where +communication constitutes 60% and 15% of the total MoE time consumption. +Building on the ScMoE architecture, we further implement an expert offloading +strategy to facilitate memory-limited inference, optimizing latency through the +overlap of expert migration. Additionally, extensive experiments and +theoretical analyses indicate that ScMoE not only achieves comparable but in +some instances surpasses the model quality of existing approaches. + +
+
+
+
+
+ + ♻ ☆ Block Transformer: Global-to-Local Language Modeling for Fast Inference + + +
+ We introduce the Block Transformer, which brings hierarchical global-to-local
+modeling to autoregressive transformers to mitigate the inference bottlenecks
+associated with self-attention. Self-attention requires the key-value (KV)
+cache of all previous sequences to be fetched from memory at every decoding
+step to access context information, leading to two primary bottlenecks during
+batch inference. First, there is a significant delay in obtaining the first
+token, as the information of the entire prompt must first be processed to
+prefill the KV cache. Second, computation of subsequent tokens is bottlenecked
+by the high memory I/O demand of fetching the entire KV cache, which grows
+linearly with sequence length, incurring quadratic memory reads overall. We
+design the Block Transformer to strategically mitigate these costs, by
+incorporating coarsity and locality into an integrated global-to-local
+architecture. At the lower layers, we aggregate tokens into fixed-size blocks
+to apply attention across the entire sequence at coarse-grained detail, to
+capture the global context while minimizing KV cache overhead. At upper layers,
+we apply attention within each block to decode individual tokens, to model
+fine-grained details with a lightweight local KV cache. We pretrain vanilla and
+Block Transformers from scratch and demonstrate that Block Transformers reach
+10--20x inference throughput compared to vanilla transformers with equivalent
+perplexity and zero-shot task performance. Code is available at
+https://github.com/itsnamgyu/block-transformer.
+
</p>
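+<p>
+ Back-of-the-envelope arithmetic makes the KV-cache bottleneck concrete; the
+model shapes below are illustrative and the layer split is schematic, not the
+paper's exact architecture.
+</p>
+<pre><code class="language-python">
+def kv_cache_bytes(layers, heads, head_dim, seq_len, batch, bytes_per=2):
+    # keys + values in fp16
+    return 2 * layers * heads * head_dim * seq_len * batch * bytes_per
+
+vanilla = kv_cache_bytes(layers=32, heads=32, head_dim=128, seq_len=4096, batch=8)
+block_size = 4
+# global layers attend over seq_len / block_size coarse block embeddings;
+# local layers keep only an in-block cache of block_size tokens
+block = (kv_cache_bytes(16, 32, 128, 4096 // block_size, 8)
+         + kv_cache_bytes(16, 32, 128, block_size, 8))
+print(f"vanilla: {vanilla / 2**30:.1f} GiB, block: {block / 2**30:.2f} GiB")
+</code></pre>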
+
+ comment: 37 pages, 24 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ CrysToGraph: A Comprehensive Predictive Model for Crystal Materials + Properties and the Benchmark + + +
+ The ionic bonding across the lattice and ordered microscopic structures endow
+crystals with unique symmetry and determine their macroscopic properties.
+Unconventional crystals, in particular, exhibit non-traditional lattice
+structures or possess exotic physical properties, making them intriguing
+subjects for investigation. Therefore, to accurately predict the physical and
+chemical properties of crystals, it is crucial to consider long-range orders.
+While GNNs excel at capturing the local environment of atoms in crystals, they
+often face challenges in effectively capturing longer-ranged interactions due
+to their limited depth. In this paper, we propose CrysToGraph
+($\textbf{Crys}$tals with $\textbf{T}$ransformers $\textbf{o}$n
+$\textbf{Graph}$s), a novel transformer-based geometric graph network designed
+specifically for unconventional crystalline systems, and UnconvBench, a
+comprehensive benchmark to evaluate models' predictive performance on
+unconventional crystal materials such as defected crystals, low-dimensional
+crystals, and MOFs. CrysToGraph effectively captures short-range interactions
+with transformer-based graph convolution blocks as well as long-range
+interactions with graph-wise transformer blocks. CrysToGraph proves its
+effectiveness in modelling unconventional crystal materials in multiple tasks;
+moreover, it outperforms most existing methods, achieving new state-of-the-art
+results on the benchmarks of both unconventional and traditional crystals.
+
</p>
+
+
+
+
+ + ♻ ☆ Graph Convolutions Enrich the Self-Attention in Transformers! NeurIPS 2024 + + +
+ Transformers, renowned for their self-attention mechanism, have achieved
+state-of-the-art performance across various tasks in natural language
+processing, computer vision, time-series modeling, etc. However, one of the
+challenges with deep Transformer models is the oversmoothing problem, where
+representations across layers converge to indistinguishable values, leading to
+significant performance degradation. We interpret the original self-attention
+as a simple graph filter and redesign it from a graph signal processing (GSP)
+perspective. We propose graph-filter-based self-attention (GFSA), which learns
+a more general yet effective filter at a complexity only slightly larger than
+that of the original self-attention mechanism. We demonstrate that GFSA
+improves the performance of Transformers in various fields, including computer
+vision, natural language processing, graph-level tasks, speech recognition, and
+code classification.
+
</p>
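+<p>
+ The core move can be seen in a few lines: treat the row-stochastic attention
+matrix as a graph shift operator and pass values through a low-order polynomial
+filter instead of the plain matrix. The sketch below (ours) is schematic; the
+paper specifies the exact filter design and coefficients.
+</p>
+<pre><code class="language-python">
+import numpy as np
+
+rng = np.random.default_rng(3)
+T, d = 6, 4
+Q, K, V = (rng.normal(size=(T, d)) for _ in range(3))
+
+logits = Q @ K.T / np.sqrt(d)
+A = np.exp(logits - logits.max(axis=-1, keepdims=True))
+A /= A.sum(axis=-1, keepdims=True)   # ordinary self-attention matrix
+
+w0, w1, w2 = 0.2, 1.0, 0.3           # filter coefficients (learnable in GFSA)
+H = (w0 * np.eye(T) + w1 * A + w2 * (A @ A)) @ V   # graph-filtered attention
+print(H.shape)
+</code></pre>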
+
+ comment: Accepted to NeurIPS 2024. Jeongwhan Choi and Hyowon Wi are co-first + authors with equal contributions +
+
+
+
+
+ + ♻ ☆ Breaking Determinism: Fuzzy Modeling of Sequential Recommendation Using + Discrete State Space Diffusion Model NeurIPS'2024 + + +
+ Sequential recommendation (SR) aims to predict items that users may be
+interested in based on their historical behavior sequences. We revisit SR from
+a novel information-theoretic perspective and find that conventional sequential
+modeling methods fail to adequately capture the randomness and unpredictability
+of user behavior. Inspired by fuzzy information processing theory, this paper
+introduces the DDSR model, which uses fuzzy sets of interaction sequences to
+overcome the limitations and better capture the evolution of users' real
+interests. DDSR is formally grounded in diffusion transition processes over
+discrete state spaces, unlike common diffusion models such as DDPM that operate
+in continuous domains; this makes it better suited to discrete data, using
+structured transitions instead of arbitrary noise injection to avoid
+information loss. Additionally, to address the inefficiency of matrix
+transformations due to the vast discrete space, we use semantic labels derived
+from quantization or RQ-VAE to replace item IDs, enhancing efficiency and
+improving cold start issues. Testing on three public benchmark datasets shows
+that DDSR outperforms existing state-of-the-art methods in various settings,
+demonstrating its potential and effectiveness in handling SR tasks.
+
</p>
+
+ comment: NeurIPS'2024, 10 pages +
+
+
+
+
+ + ♻ ☆ $FastDoc$: Domain-Specific Fast Continual Pre-training Technique using + Document-Level Metadata and Taxonomy + + +
+ In this paper, we propose $FastDoc$ (Fast Continual Pre-training Technique +using Document Level Metadata and Taxonomy), a novel, compute-efficient +framework that utilizes Document metadata and Domain-Specific Taxonomy as +supervision signals to continually pre-train transformer encoder on a +domain-specific corpus. The main innovation is that during domain-specific +pretraining, an open-domain encoder is continually pre-trained using +sentence-level embeddings as inputs (to accommodate long documents), however, +fine-tuning is done with token-level embeddings as inputs to this encoder. We +perform such domain-specific pre-training on three different domains namely +customer support, scientific, and legal domains, and compare performance on 6 +different downstream tasks and 9 different datasets. The novel use of +document-level supervision along with sentence-level embedding input for +pre-training reduces pre-training compute by around $1,000$, $4,500$, and $500$ +times compared to MLM and/or NSP in Customer Support, Scientific, and Legal +Domains, respectively. The reduced training time does not lead to a +deterioration in performance. In fact we show that $FastDoc$ either outperforms +or performs on par with several competitive transformer-based baselines in +terms of character-level F1 scores and other automated metrics in the Customer +Support, Scientific, and Legal Domains. Moreover, reduced training aids in +mitigating the risk of catastrophic forgetting. Thus, unlike baselines, +$FastDoc$ shows a negligible drop in performance on open domain. + +
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR), 36 + pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Adversarial Representation Engineering: A General Model Editing + Framework for Large Language Models NeurIPS 2024 + + +
+ Since the rapid development of Large Language Models (LLMs) has achieved +remarkable success, understanding and rectifying their internal complex +mechanisms has become an urgent issue. Recent research has attempted to +interpret their behaviors through the lens of inner representation. However, +developing practical and efficient methods for applying these representations +for general and flexible model editing remains challenging. In this work, we +explore how to leverage insights from representation engineering to guide the +editing of LLMs by deploying a representation sensor as an editing oracle. We +first identify the importance of a robust and reliable sensor during editing, +then propose an Adversarial Representation Engineering (ARE) framework to +provide a unified and interpretable approach for conceptual model editing +without compromising baseline performance. Experiments on multiple tasks +demonstrate the effectiveness of ARE in various model editing scenarios. Our +code and data are available at +https://github.com/Zhang-Yihao/Adversarial-Representation-Engineering. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Offline Reinforcement Learning with OOD State Correction and OOD Action + Suppression NeurIPS 2024 + + +
+ In offline reinforcement learning (RL), addressing the out-of-distribution +(OOD) action issue has been a focus, but we argue that there exists an OOD +state issue that also impairs performance yet has been underexplored. Such an +issue describes the scenario when the agent encounters states out of the +offline dataset during the test phase, leading to uncontrolled behavior and +performance degradation. To this end, we propose SCAS, a simple yet effective +approach that unifies OOD state correction and OOD action suppression in +offline RL. Technically, SCAS achieves value-aware OOD state correction, +capable of correcting the agent from OOD states to high-value in-distribution +states. Theoretical and empirical results show that SCAS also exhibits the +effect of suppressing OOD actions. On standard offline RL benchmarks, SCAS +achieves excellent performance without additional hyperparameter tuning. +Moreover, benefiting from its OOD state correction feature, SCAS demonstrates +enhanced robustness against environmental perturbations. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Are large language models superhuman chemists? + + +
+ Large language models (LLMs) have gained widespread interest due to their +ability to process human language and perform tasks on which they have not been +explicitly trained. + However, we possess only a limited systematic understanding of the chemical +capabilities of LLMs, which would be required to improve models and mitigate +potential harm. Here, we introduce "ChemBench," an automated framework for +evaluating the chemical knowledge and reasoning abilities of state-of-the-art +LLMs against the expertise of chemists. + We curated more than 2,700 question-answer pairs, evaluated leading open- and +closed-source LLMs, and found that the best models outperformed the best human +chemists in our study on average. However, the models struggle with some basic +tasks and provide overconfident predictions. + These findings reveal LLMs' impressive chemical capabilities while +emphasizing the need for further research to improve their safety and +usefulness. They also suggest adapting chemistry education and show the value +of benchmarking frameworks for evaluating LLMs in specific domains. + +
+
+
+
+
+ + ♻ ☆ Long Term Memory: The Foundation of AI Self-Evolution + + +
+ Large language models (LLMs) like GPTs, trained on vast datasets, have +demonstrated impressive capabilities in language understanding, reasoning, and +planning, achieving human-level performance in various tasks. Most studies +focus on enhancing these models by training on ever-larger datasets to build +more powerful foundation models. While training stronger models is important, +enabling models to evolve during inference is equally crucial, a process we +refer to as AI self-evolution. Unlike large-scale training, self-evolution may +rely on limited data or interactions. Inspired by the columnar organization of +the human cerebral cortex, we hypothesize that AI models could develop +cognitive abilities and build internal representations through iterative +interactions with their environment. To achieve this, models need long-term +memory (LTM) to store and manage processed interaction data. LTM supports +self-evolution by representing diverse experiences across environments and +agents. In this report, we explore AI self-evolution and its potential to +enhance models during inference. We examine LTM's role in lifelong learning, +allowing models to evolve based on accumulated interactions. We outline the +structure of LTM and the systems needed for effective data retention and +representation. We also classify approaches for building personalized models +with LTM data and show how these models achieve self-evolution through +interaction. Using LTM, our multi-agent framework OMNE achieved first place on +the GAIA benchmark, demonstrating LTM's potential for AI self-evolution. +Finally, we present a roadmap for future research, emphasizing the importance +of LTM for advancing AI technology and its practical applications. + +
+
+ comment: 56 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Pretraining Codomain Attention Neural Operators for Solving Multiphysics + PDEs + + +
+ Existing neural operator architectures face challenges when solving +multiphysics problems with coupled partial differential equations (PDEs) due to +complex geometries, interactions between physical variables, and the limited +amounts of high-resolution training data. To address these issues, we propose +Codomain Attention Neural Operator (CoDA-NO), which tokenizes functions along +the codomain or channel space, enabling self-supervised learning or pretraining +of multiple PDE systems. Specifically, we extend positional encoding, +self-attention, and normalization layers to function spaces. CoDA-NO can learn +representations of different PDE systems with a single model. We evaluate +CoDA-NO's potential as a backbone for learning multiphysics PDEs over multiple +systems by considering few-shot learning settings. On complex downstream tasks +with limited data, such as fluid flow simulations, fluid-structure +interactions, and Rayleigh-B\'enard convection, we found CoDA-NO to outperform +existing methods by over 36%. + +
+
+
+
+
+ + ♻ ☆ InfoRM: Mitigating Reward Hacking in RLHF via Information-Theoretic + Reward Modeling NeurIPS 2024 + + +
+ Despite the success of reinforcement learning from human feedback (RLHF) in +aligning language models with human values, reward hacking, also termed reward +overoptimization, remains a critical challenge. This issue primarily arises +from reward misgeneralization, where reward models (RMs) compute reward using +spurious features that are irrelevant to human preferences. In this work, we +tackle this problem from an information-theoretic perspective and propose a +framework for reward modeling, namely InfoRM, by introducing a variational +information bottleneck objective to filter out irrelevant information. Notably, +we further identify a correlation between overoptimization and outliers in the +IB latent space of InfoRM, establishing it as a promising tool for detecting +reward overoptimization. Inspired by this finding, we propose the Cluster +Separation Index (CSI), which quantifies deviations in the IB latent space, as +an indicator of reward overoptimization to facilitate the development of online +mitigation strategies. Extensive experiments on a wide range of settings and RM +scales (70M, 440M, 1.4B, and 7B) demonstrate the effectiveness of InfoRM. +Further analyses reveal that InfoRM's overoptimization detection mechanism is +not only effective but also robust across a broad range of datasets, signifying +a notable advancement in the field of RLHF. The code will be released upon +acceptance. + +
+
+ comment: The paper has been accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Fast Samplers for Inverse Problems in Iterative Refinement Models NeurIPS'24 + + +
+ Constructing fast samplers for unconditional diffusion and flow-matching +models has received much attention recently; however, existing methods for +solving inverse problems, such as super-resolution, inpainting, or deblurring, +still require hundreds to thousands of iterative steps to obtain high-quality +results. We propose a plug-and-play framework for constructing efficient +samplers for inverse problems, requiring only pre-trained diffusion or +flow-matching models. We present Conditional Conjugate Integrators, which +leverage the specific form of the inverse problem to project the respective +conditional diffusion/flow dynamics into a more amenable space for sampling. +Our method complements popular posterior approximation methods for solving +inverse problems using diffusion/flow models. We evaluate the proposed method's +performance on various linear image restoration tasks across multiple datasets, +employing diffusion and flow-matching models. Notably, on challenging inverse +problems like 4x super-resolution on the ImageNet dataset, our method can +generate high-quality samples in as few as 5 conditional sampling steps and +outperforms competing baselines requiring 20-1000 steps. Our code will be +publicly available at https://github.com/mandt-lab/c-pigdm + +
+
+ comment: 43 pages, NeurIPS'24 Camera Ready +
+
+
+
+
+ + ♻ ☆ In-Context Transfer Learning: Demonstration Synthesis by Transferring + Similar Tasks + + +
+ In-context learning (ICL) is an effective approach to help large language +models (LLMs) adapt to various tasks by providing demonstrations of the target +task. Considering the high cost of labeling demonstrations, many methods +propose synthesizing demonstrations from scratch using LLMs. However, the +quality of the demonstrations synthesized from scratch is limited by the +capabilities and knowledge of LLMs. To address this, inspired by transfer +learning, we propose In-Context Transfer Learning (ICTL), which synthesizes +target task demonstrations by transferring labeled demonstrations from similar +source tasks. ICTL consists of two steps: source sampling and target transfer. +First, we define an optimization objective, which minimizes transfer error to +sample source demonstrations similar to the target task. Then, we employ LLMs +to transfer the sampled source demonstrations to the target task, matching the +definition and format of the target task. Experiments on Super-NI show that +ICTL outperforms synthesis from scratch by 2.0% on average, demonstrating the +effectiveness of our method. + +
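+<p>
+ The source-sampling step can be approximated very simply; the snippet below
+(ours) stands in cosine nearest neighbours for the paper's transfer-error
+objective, purely for illustration.
+</p>
+<pre><code class="language-python">
+import numpy as np
+
+rng = np.random.default_rng(9)
+
+def sample_sources(target_emb, source_embs, k=4):
+    # pick the k labeled source demonstrations closest to the target task
+    sims = source_embs @ target_emb / (
+        np.linalg.norm(source_embs, axis=1) * np.linalg.norm(target_emb))
+    return np.argsort(-sims)[:k]
+
+source_embs = rng.normal(size=(100, 32))   # embeddings of source demonstrations
+target_emb = rng.normal(size=32)           # embedding of the target task
+print(sample_sources(target_emb, source_embs))
+</code></pre>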
+
+
+
+
+ + ♻ ☆ FoundTS: Comprehensive and Unified Benchmarking of Foundation Models for + Time Series Forecasting + + +
+ Time Series Forecasting (TSF) is a key functionality in numerous fields,
+including finance, weather services, and energy management. While many TSF
+methods have emerged recently, they often require domain-specific data
+collection and model training and struggle with poor generalization on new
+domains. Foundation models aim to overcome this limitation. Pre-trained on
+large-scale language or time series data, they exhibit promising inferencing
+capabilities in new or unseen data. This has spurred a surge in new TSF
+foundation models. We propose a new benchmark, FoundTS, to enable thorough and
+fair evaluation and comparison of such models. FoundTS covers a variety of TSF
+foundation models, including those based on large language models and those
+pretrained on time series. Next, FoundTS supports different forecasting
+strategies, including zero-shot, few-shot, and full-shot, thereby facilitating
+more thorough evaluations. Finally, FoundTS offers a pipeline that standardizes
+evaluation processes such as dataset splitting, loading, normalization, and
+few-shot sampling, thereby facilitating fair evaluations. Building on this, we
+report on an extensive evaluation of TSF foundation models on a broad range of
+datasets from diverse domains and with different statistical characteristics.
+Specifically, we identify pros and cons and inherent limitations of existing
+foundation models, and we identify directions for future model design. We make
+our code and datasets available at
+https://anonymous.4open.science/r/FoundTS-C2B0.
+
</p>
+
+
+
+
+ + ♻ ☆ Make Continual Learning Stronger via C-Flat + + +
+ A model's ability to generalize while incrementally acquiring knowledge from
+sequentially arriving tasks is crucial for tackling the sensitivity-stability
+dilemma in Continual Learning (CL). Sharpness minimization over the weight loss
+landscape, which seeks flat minima lying in neighborhoods of uniformly low loss
+or smooth gradients, has proven to be a strong training regime for improving
+model generalization compared with loss-minimization-based optimizers such as
+SGD. Yet only a few works have explored this training regime for CL, showing
+that dedicated zeroth-order sharpness optimizers can improve CL performance. In
+this work, we propose a Continual Flatness (C-Flat) method featuring a flatter
+loss landscape tailored for CL. C-Flat can be invoked with only one line of
+code and is plug-and-play with any CL method. We present a general framework
+applying C-Flat to all CL categories, together with a thorough comparison
+against loss-minima optimizers and flat-minima-based CL approaches, showing
+that our method can boost CL performance in almost all cases. Code is available
+at https://github.com/WanNaa/C-Flat.
+
</p>
+
+
+
+
+ + ♻ ☆ QGFN: Controllable Greediness with Action Values NeurIPS 2024 + + +
+ Generative Flow Networks (GFlowNets; GFNs) are a family of energy-based +generative methods for combinatorial objects, capable of generating diverse and +high-utility samples. However, consistently biasing GFNs towards producing +high-utility samples is non-trivial. In this work, we leverage connections +between GFNs and reinforcement learning (RL) and propose to combine the GFN +policy with an action-value estimate, $Q$, to create greedier sampling policies +which can be controlled by a mixing parameter. We show that several variants of +the proposed method, QGFN, are able to improve on the number of high-reward +samples generated in a variety of tasks without sacrificing diversity. + +
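+<p>
+ At sampling time the controllable greediness reduces to a simple mixture; a
+toy sketch (ours) of one such variant, mixing the GFN policy with the greedy
+action-value policy under a mixing parameter:
+</p>
+<pre><code class="language-python">
+import numpy as np
+
+rng = np.random.default_rng(4)
+
+def qgfn_sample(p_gfn, q_values, lam=0.5):
+    greedy = np.zeros_like(p_gfn)
+    greedy[np.argmax(q_values)] = 1.0          # all mass on argmax_a Q(s, a)
+    mixed = (1 - lam) * p_gfn + lam * greedy   # lam=0: pure GFN, lam=1: greedy
+    return rng.choice(len(p_gfn), p=mixed)
+
+p_gfn = np.array([0.5, 0.3, 0.2])   # GFN forward policy over actions
+q = np.array([1.0, 2.5, 0.1])       # action-value estimates
+print(qgfn_sample(p_gfn, q, lam=0.7))
+</code></pre>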
+
+ comment: Accepted by 38th Conference on Neural Information Processing Systems + (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Learning Macroscopic Dynamics from Partial Microscopic Observations + + +
+ Macroscopic observables of a system are of keen interest in real applications +such as the design of novel materials. Current methods rely on microscopic +trajectory simulations, where the forces on all microscopic coordinates need to +be computed or measured. However, this can be computationally prohibitive for +realistic systems. In this paper, we propose a method to learn macroscopic +dynamics requiring only force computations on a subset of the microscopic +coordinates. Our method relies on a sparsity assumption: the force on each +microscopic coordinate relies only on a small number of other coordinates. The +main idea of our approach is to map the training procedure on the macroscopic +coordinates back to the microscopic coordinates, on which partial force +computations can be used as stochastic estimation to update model parameters. +We provide a theoretical justification of this under suitable conditions. We +demonstrate the accuracy, force computation efficiency, and robustness of our +method on learning macroscopic closure models from a variety of microscopic +systems, including those modeled by partial differential equations or molecular +dynamics simulations. + +
+
+
+
+
+ + ♻ ☆ Equitable Federated Learning with Activation Clustering + + +
+ Federated learning is a prominent distributed learning paradigm that
+incorporates collaboration among diverse clients, promotes data locality, and
+thus ensures privacy. These clients have their own technological, cultural, and
+other biases in the process of data generation. However, the present standard
+often ignores this bias/heterogeneity, perpetuating bias against certain groups
+rather than mitigating it. In response to this concern, we propose an equitable
+clustering-based framework where the clients are categorized/clustered based on
+how similar they are to each other. We propose a unique way to construct the
+similarity matrix that uses activation vectors. Furthermore, we propose a
+client weighting mechanism to ensure that each cluster receives equal
+importance, and we establish an $O(1/\sqrt{K})$ rate of convergence to an
+$\epsilon$-stationary solution. We assess the effectiveness of our proposed
+strategy against common baselines, demonstrating its efficacy in terms of
+reducing the bias existing amongst various client clusters and consequently
+ameliorating algorithmic bias against specific groups.
+
</p>
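+<p>
+ A condensed sketch of the clustering-plus-weighting step (ours; the cluster
+count, similarity choice, and clustering algorithm are illustrative stand-ins):
+</p>
+<pre><code class="language-python">
+import numpy as np
+from sklearn.cluster import SpectralClustering
+
+rng = np.random.default_rng(5)
+acts = rng.normal(size=(12, 64))        # one activation vector per client
+acts /= np.linalg.norm(acts, axis=1, keepdims=True)
+S = np.clip(acts @ acts.T, 0, None)     # similarity matrix from activations
+
+labels = SpectralClustering(n_clusters=3, affinity="precomputed",
+                            random_state=0).fit_predict(S)
+
+w = np.zeros(len(labels))               # equal importance per cluster,
+for c in np.unique(labels):             # split evenly among its members
+    members = labels == c
+    w[members] = (1 / len(np.unique(labels))) / members.sum()
+print(w.sum())   # weights sum to 1
+</code></pre>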
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Towards Understanding How Transformers Learn In-context Through a + Representation Learning Lens + + +
+ Pre-trained large language models based on Transformers have demonstrated
+remarkable in-context learning (ICL) abilities. With just a few demonstration
+examples, the models can implement new tasks without any parameter updates.
+However, the mechanism behind ICL remains an open question. In this paper, we
+explore the ICL process in Transformers through a lens of representation
+learning. Initially, leveraging kernel methods, we derive a dual model for one
+softmax attention layer. The ICL inference process of the attention layer
+aligns with the training procedure of its dual model, generating token
+representation predictions that are equivalent to the dual model's test
+outputs. We delve into the training process of this dual model from a
+representation learning standpoint and further derive a generalization error
+bound related to the quantity of demonstration tokens. Subsequently, we extend
+our theoretical conclusions to more complicated scenarios, including one
+Transformer layer and multiple attention layers. Furthermore, drawing
+inspiration from existing representation learning methods, especially
+contrastive learning, we propose potential modifications for the attention
+layer. Finally, experiments are designed to support our findings.
+
</p>
+
+ comment: 35 pages +
+
+
+
+
+ + ♻ ☆ IntraMix: Intra-Class Mixup Generation for Accurate Labels and Neighbors NeurIPS2024 + + +
+ Graph Neural Networks (GNNs) have shown great performance in various tasks,
+with the core idea of learning from data labels and aggregating messages within
+the neighborhood of nodes. However, the common challenges in graphs are
+twofold: insufficient accurate (high-quality) labels and limited neighbors for
+nodes, resulting in weak GNNs. Existing graph augmentation methods typically
+address only one of these challenges, often adding training costs or relying on
+oversimplified or knowledge-intensive strategies, limiting their
+generalization. To simultaneously address both challenges faced by graphs in a
+generalized way, we propose an elegant method called IntraMix. Considering the
+incompatibility of vanilla Mixup with the complex topology of graphs, IntraMix
+innovatively employs Mixup among inaccurately labeled data of the same class,
+generating high-quality labeled data at minimal cost. Additionally, it finds
+data with high confidence of being clustered into the same group as the
+generated data to serve as their neighbors, thereby enriching the neighborhoods
+of graphs. IntraMix efficiently tackles both issues faced by graphs and
+challenges the prior notion of the limited effectiveness of Mixup in node
+classification. IntraMix is a theoretically grounded plug-and-play method that
+can be readily applied to all GNNs. Extensive experiments demonstrate the
+effectiveness of IntraMix across various GNNs and datasets. Our code is
+available at: https://github.com/Zhengsh123/IntraMix.
+
</p>
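+<p>
+ The generation step is a one-function idea; a minimal sketch (ours, omitting
+the neighbor-selection step that connects generated nodes into the graph):
+</p>
+<pre><code class="language-python">
+import numpy as np
+
+rng = np.random.default_rng(6)
+
+def intra_class_mixup(X, y, n_new=100, alpha=2.0):
+    # mix only pairs sharing a (possibly noisy) label, unlike vanilla Mixup
+    Xs, ys = [], []
+    for _ in range(n_new):
+        c = rng.choice(np.unique(y))
+        i, j = rng.choice(np.flatnonzero(y == c), size=2, replace=True)
+        lam = rng.beta(alpha, alpha)
+        Xs.append(lam * X[i] + (1 - lam) * X[j])
+        ys.append(c)   # the mixed label stays within the class
+    return np.stack(Xs), np.array(ys)
+
+X = rng.normal(size=(50, 8))
+y = rng.integers(0, 3, size=50)
+Xn, yn = intra_class_mixup(X, y)
+print(Xn.shape, yn.shape)
+</code></pre>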
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ When Large Language Models Meet Vector Databases: A Survey + + +
+ This survey explores the synergistic potential of Large Language Models
+(LLMs) and Vector Databases (VecDBs), a burgeoning and rapidly evolving
+research area. With the proliferation of LLMs comes a host of challenges,
+including hallucinations, outdated knowledge, prohibitive commercial
+application costs, and memory issues. VecDBs emerge as a compelling solution to
+these issues by offering an efficient means to store, retrieve, and manage the
+high-dimensional vector representations intrinsic to LLM operations. Through
+this nuanced review, we delineate the foundational principles of LLMs and
+VecDBs and critically analyze their integration's impact on enhancing LLM
+functionalities. This discourse extends into a discussion on speculative future
+developments in this domain, aiming to catalyze further research into
+optimizing the confluence of LLMs and VecDBs for advanced data handling and
+knowledge extraction capabilities.
+
</p>
+
+
+
+
+ + ♻ ☆ Neural Network Matrix Product Operator: A Multi-Dimensionally Integrable + Machine Learning Potential + + +
+ A neural network-based machine learning potential energy surface (PES) +expressed in a matrix product operator (NN-MPO) is proposed. The MPO form +enables efficient evaluation of high-dimensional integrals that arise in +solving the time-dependent and time-independent Schr\"odinger equation and +effectively overcomes the so-called curse of dimensionality. This starkly +contrasts with other neural network-based machine learning PES methods, such as +multi-layer perceptrons (MLPs), where evaluating high-dimensional integrals is +not straightforward due to the fully connected topology in their backbone +architecture. Nevertheless, the NN-MPO retains the high representational +capacity of neural networks. NN-MPO can achieve spectroscopic accuracy with a +test mean absolute error (MAE) of 3.03 cm$^{-1}$ for a fully coupled +six-dimensional ab initio PES, using only 625 training points distributed +across a 0 to 17,000 cm$^{-1}$ energy range. Our Python implementation is +available at https://github.com/KenHino/Pompon. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Large Language Models as Efficient Reward Function Searchers for + Custom-Environment Multi-Objective Reinforcement Learning + + +
+ Achieving the effective design and improvement of reward functions in
+reinforcement learning (RL) tasks with complex custom environments and multiple
+requirements presents considerable challenges. In this paper, we propose ERFSL,
+an efficient reward function searcher using LLMs, which enables LLMs to be
+effective white-box searchers and highlights their advanced semantic
+understanding capabilities. Specifically, we generate reward components for
+each numerically explicit user requirement and employ a reward critic to
+identify the correct code form. Then, LLMs assign weights to the reward
+components to balance their values and iteratively adjust the weights without
+ambiguity or redundant adjustments, flexibly adopting directional mutation and
+crossover strategies, similar to genetic algorithms, based on the context
+provided by the training log analyzer. We applied the framework to an
+underwater data collection RL task without direct human feedback or reward
+examples (zero-shot learning). The reward critic successfully corrects the
+reward code with only one feedback instance per requirement, effectively
+preventing unrectifiable errors. The initialization of weights enables the
+acquisition of different reward functions within the Pareto solution set
+without the need for weight search. Even when a weight is off by a factor of
+500, on average only 5.2 iterations are needed to meet user requirements. ERFSL
+also works well with most prompts when using GPT-4o mini, as we decompose the
+weight-searching process to reduce the demands on numerical and long-context
+understanding capabilities.
+
</p>
+
+
+
+
+ + ♻ ☆ Model-agnostic clean-label backdoor mitigation in cybersecurity + environments + + +
+ The training phase of machine learning models is a delicate step, especially +in cybersecurity contexts. Recent research has surfaced a series of insidious +training-time attacks that inject backdoors in models designed for security +classification tasks without altering the training labels. With this work, we +propose new techniques that leverage insights in cybersecurity threat models to +effectively mitigate these clean-label poisoning attacks, while preserving the +model utility. By performing density-based clustering on a carefully chosen +feature subspace, and progressively isolating the suspicious clusters through a +novel iterative scoring procedure, our defensive mechanism can mitigate the +attacks without requiring many of the common assumptions in the existing +backdoor defense literature. To show the generality of our proposed mitigation, +we evaluate it on two clean-label model-agnostic attacks on two different +classic cybersecurity data modalities: network flows classification and malware +classification, using gradient boosting and neural network models. + +
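+<p>
+ A caricature of the defense (ours; the paper's feature-subspace choice and
+iterative scoring procedure are more involved): cluster in a feature subspace
+with a density-based method, then flag small, unusually tight clusters.
+</p>
+<pre><code class="language-python">
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+rng = np.random.default_rng(7)
+clean = rng.normal(size=(200, 4))                       # toy feature subspace
+poison = rng.normal(loc=3.0, scale=0.05, size=(20, 4))  # tight poisoned blob
+X = np.vstack([clean, poison])
+
+labels = DBSCAN(eps=0.7, min_samples=5).fit_predict(X)
+for c in set(labels) - {-1}:
+    pts = X[labels == c]
+    spread = pts.std(axis=0).mean()
+    flag = "<- suspicious" if len(pts) < 30 and spread < 0.1 else ""
+    print(f"cluster {c}: size={len(pts)}, spread={spread:.3f} {flag}")
+</code></pre>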
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Leveraging Large Language Models for Suicide Detection on Social Media + with Limited Labels + + +
+ The increasing frequency of suicidal thoughts highlights the importance of
+early detection and intervention. Social media platforms, where users often
+share personal experiences and seek help, could be utilized to identify
+individuals at risk. However, the large volume of daily posts makes manual
+review impractical. This paper explores the use of Large Language Models (LLMs)
+to automatically detect suicidal content in text-based social media posts. We
+propose a novel method for generating pseudo-labels for unlabeled data by
+prompting LLMs, along with traditional classification fine-tuning techniques to
+enhance label accuracy. To create a strong suicide detection model, we develop
+an ensemble approach involving prompting with Qwen2-72B-Instruct, and using
+fine-tuned models such as Llama3-8B, Llama3.1-8B, and Gemma2-9B. We evaluate
+our approach on the dataset of the Suicide Ideation Detection on Social Media
+Challenge, a track of the IEEE Big Data 2024 Big Data Cup. Additionally, we
+conduct a comprehensive analysis to assess the impact of different models and
+fine-tuning strategies on detection performance. Experimental results show that
+the ensemble model improves detection accuracy by 5 percentage points compared
+with the individual models. It achieves a weighted F1 score of 0.770 on the
+public test set and 0.731 on the private test set, providing a promising
+solution for identifying suicidal content in social media. Our analysis shows
+that the choice of LLM affects prompting performance, with larger models
+providing better accuracy. Our code and checkpoints are publicly available at
+https://github.com/khanhvynguyen/Suicide_Detection_LLMs.
+
</p>
+
+ comment: Accepted at IEEE International Conference on Big Data 2024 +
+
+
+
+
+ + ♻ ☆ OSLO: One-Shot Label-Only Membership Inference Attacks NeurIPS 2024 + + +
+ We introduce One-Shot Label-Only (OSLO) membership inference attacks (MIAs),
+which accurately infer a given sample's membership in a target model's training
+set with high precision using just \emph{a single query}, where the target
+model only returns the predicted hard label. This contrasts with
+state-of-the-art label-only attacks, which require $\sim6000$ queries yet
+achieve lower attack precision than OSLO. OSLO leverages transfer-based
+black-box adversarial attacks. The core idea is that a member sample exhibits
+more resistance to adversarial perturbations than a non-member. We compare OSLO
+against state-of-the-art label-only attacks and demonstrate that, despite
+requiring only one query, our method significantly outperforms previous attacks
+in terms of precision and true positive rate (TPR) under the same false
+positive rates (FPR). For example, compared to previous label-only MIAs, OSLO
+achieves a TPR that is at least 7$\times$ higher under a 1\% FPR and at least
+22$\times$ higher under a 0.1\% FPR on CIFAR100 for a ResNet18 model. We also
+evaluate multiple defense mechanisms against OSLO.
+
</p>
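+<p>
+ The core signal is resistance to a transferred adversarial direction; a toy
+sketch (ours) with a linear stand-in for the target model and an illustrative
+membership threshold:
+</p>
+<pre><code class="language-python">
+import numpy as np
+
+rng = np.random.default_rng(8)
+
+def flip_epsilon(predict, x, attack_dir, eps_grid):
+    # smallest magnitude along the attack direction that flips the hard label
+    base = predict(x)
+    for eps in eps_grid:
+        if predict(x + eps * attack_dir(x)) != base:
+            return float(eps)
+    return float(eps_grid[-1])
+
+w = rng.normal(size=10)
+predict = lambda x: int(x @ w > 0)   # target returns only a hard label
+unit = w / np.linalg.norm(w)
+attack_dir = lambda x: -unit if predict(x) == 1 else unit   # surrogate direction
+
+x = rng.normal(size=10)
+eps = flip_epsilon(predict, x, attack_dir, np.linspace(0.0, 5.0, 200))
+print(eps, "member" if eps > 1.0 else "non-member")   # members resist more
+</code></pre>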
+
+ comment: To appear at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Group Crosscoders for Mechanistic Analysis of Symmetry + + +
+ We introduce group crosscoders, an extension of crosscoders that +systematically discover and analyse symmetrical features in neural networks. +While neural networks often develop equivariant representations without +explicit architectural constraints, understanding these emergent symmetries has +traditionally relied on manual analysis. Group crosscoders automate this +process by performing dictionary learning across transformed versions of inputs +under a symmetry group. Applied to InceptionV1's mixed3b layer using the +dihedral group $\mathrm{D}_{32}$, our method reveals several key insights: +First, it naturally clusters features into interpretable families that +correspond to previously hypothesised feature types, providing more precise +separation than standard sparse autoencoders. Second, our transform block +analysis enables the automatic characterisation of feature symmetries, +revealing how different geometric features (such as curves versus lines) +exhibit distinct patterns of invariance and equivariance. These results +demonstrate that group crosscoders can provide systematic insights into how +neural networks represent symmetry, offering a promising new tool for +mechanistic interpretability. + +
+
+
+
+
+ + ♻ ☆ Parameter-Efficient Fine-Tuning in Large Models: A Survey of + Methodologies + + +
+ Large models, as predicted by scaling law forecasts, have made groundbreaking +progress in many fields, particularly in natural language generation tasks, +where they have approached or even surpassed human levels. However, the +unprecedented scale of their parameters brings significant computational and +storage costs. These large models require substantial computational resources +and GPU memory to operate. When adapting large models to specific downstream +tasks, their massive parameter scale poses a significant challenge in +fine-tuning on hardware platforms with limited computational power and GPU +memory. To address this issue, Parameter-Efficient Fine-Tuning (PEFT) offers a +practical solution by efficiently adjusting the parameters of large pre-trained +models to suit various downstream tasks. Specifically, PEFT adjusts the +parameters of pre-trained large models to adapt to specific tasks or domains, +minimizing the introduction of additional parameters and the computational +resources required. This review mainly introduces the preliminary knowledge of +PEFT, the core ideas and principles of various PEFT algorithms, the +applications of PEFT, and potential future research directions. By reading this +review, we believe that interested parties can quickly grasp the PEFT +methodology, thereby accelerating its development and innovation. +
+
+
+
+
+ + ♻ ☆ Adversarial Federated Consensus Learning for Surface Defect + Classification Under Data Heterogeneity in IIoT + + +
+ The challenge of data scarcity hinders the application of deep learning in +industrial surface defect classification (SDC), as it is difficult to collect +and centralize sufficient training data from various entities in the Industrial +Internet of Things (IIoT) due to privacy concerns. Federated learning (FL) +provides a solution by enabling collaborative global model training across +clients while maintaining privacy. However, performance may suffer due to data +heterogeneity, i.e., discrepancies in data distributions among clients. In this +paper, we propose a novel personalized FL (PFL) approach, named Adversarial +Federated Consensus Learning (AFedCL), to address data heterogeneity across +different clients in SDC. First, we develop a dynamic consensus construction +strategy to mitigate the performance degradation caused by data heterogeneity. +Through adversarial training, local models from different clients utilize the +global model as a bridge to achieve distribution alignment, alleviating the +problem of global knowledge forgetting. Complementing this strategy, we propose +a consensus-aware aggregation mechanism. It assigns aggregation weights to +different clients based on their efficacy in global knowledge learning, thereby +enhancing the global model's generalization capabilities. Finally, we design an +adaptive feature fusion module to further enhance global knowledge utilization +efficiency. Personalized fusion weights are gradually adjusted for each client +to optimally balance global and local features. Compared with state-of-the-art +FL methods such as FedALA, the proposed AFedCL method achieves an accuracy +increase of up to 5.67% on three SDC datasets. +
+
+
+
+
+ + ♻ ☆ The Re-Label Method For Data-Centric Machine Learning + + +
+ In industrial deep learning applications, manually labeled data inevitably +contains a certain amount of label noise. To address this problem and reach a +score above 90 on the dev dataset, we present a simple method: identify the +noisy data and have humans re-label it, with the model's predictions provided +as references during labeling. In this paper, we illustrate this idea for a +broad set of deep learning tasks, including classification, sequence tagging, +object detection, sequence generation, and click-through rate prediction. The +dev dataset evaluation results and human evaluation results verify our idea. +
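+ A minimal sketch of the candidate-selection step, assuming the trained model is available as a callable; human annotators then re-check only the flagged examples, with the model prediction shown as a reference:

```python
def find_relabel_candidates(examples, gold_labels, predict):
    """Flag examples where the model disagrees with the manual label."""
    candidates = []
    for x, gold in zip(examples, gold_labels):
        pred = predict(x)
        if pred != gold:
            candidates.append((x, gold, pred))  # surfaced to human annotators
    return candidates
```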
+
+
+
+
+ + ♻ ☆ LSEAttention is All You Need for Time Series Forecasting + + +
+ Transformer-based architectures have achieved remarkable success in natural +language processing and computer vision. However, their performance in +multivariate long-term forecasting often lags behind simpler linear baselines. +Previous studies have identified the traditional attention mechanism as a +significant factor contributing to this limitation. To unlock the full +potential of transformers for multivariate time series forecasting, I introduce +\textbf{LSEAttention}, an approach designed to address entropy collapse and +training instability commonly observed in transformer models. I validate the +effectiveness of LSEAttention across various real-world multivariate time +series datasets, demonstrating that it not only outperforms existing time +series transformer models but also exceeds the performance of some +state-of-the-art models on specific datasets. + +
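+ If the name refers to the standard log-sum-exp stabilization of softmax (an assumption on our part; the paper's exact formulation may differ), the underlying numerical trick looks like this:

```python
import numpy as np

def stable_softmax(logits):
    """Softmax via the log-sum-exp trick: shifting by the row maximum
    avoids overflow without changing the result."""
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=-1, keepdims=True)
```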
+
+ comment: 7 pages with referencing, 1 figure, 3 tables +
+
+
+
+
+ + ♻ ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization in MLLMs, recent advances primarily focus on improving +the LLM components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE, which leverages a well-designed router with a mixture +of projection experts at the connector, Uni-Med achieves an efficient solution +to the tug-of-war problem and can perform six different medical tasks, +including question answering, visual question answering, report generation, +referring expression comprehension, referring expression generation, and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector in MLLMs. Extensive ablation +experiments validate the effectiveness of introducing CMoE under any +configuration, with up to an average 8% performance gain. We further provide an +interpretation analysis of the tug-of-war problem from the perspective of +gradient optimization and parameter statistics. Compared to previous +state-of-the-art medical MLLMs, Uni-Med achieves competitive or superior +evaluation metrics on diverse tasks. Code and resources are available at +https://github.com/tsinghua-msiip/Uni-Med. +
+
+
+
+
+ + ♻ ☆ MoA: Mixture of Sparse Attention for Automatic Large Language Model + Compression + + +
+ Sparse attention can effectively mitigate the significant memory and +throughput demands of Large Language Models (LLMs) in long contexts. Existing +methods typically employ a uniform sparse attention mask, applying the same +sparse pattern across different attention heads and input lengths. However, +this uniform approach fails to capture the diverse attention patterns inherent +in LLMs, ignoring their distinct accuracy-latency trade-offs. To address this +challenge, we propose the Mixture of Attention (MoA), which automatically +tailors distinct sparse attention configurations to different heads and layers. +MoA constructs and navigates a search space of various attention patterns and +their scaling rules relative to input sequence lengths. It profiles the model, +evaluates potential configurations, and pinpoints the optimal sparse attention +compression plan. MoA adapts to varying input sizes, revealing that some +attention heads expand their focus to accommodate longer sequences, while other +heads consistently concentrate on fixed-length local contexts. Experiments show +that MoA increases the effective context length by $3.9\times$ with the same +average attention span, boosting retrieval accuracy by $1.5-7.1\times$ over the +uniform-attention baseline across Vicuna-{7B,13B}, and Llama3-{8B,70B} models. +Moreover, MoA narrows the capability gaps between sparse and dense models, +reducing the maximum relative performance drop from $9\%-36\%$ to within $5\%$ +across two long-context understanding benchmarks. MoA achieves a +$1.2-1.4\times$ GPU memory reduction, boosting decode throughput by +$6.6-8.2\times$ and $1.7-1.9\times$ compared to FlashAttention2 and vLLM, with +minimal impact on performance. Our code is available at +\url{https://github.com/thu-nics/MoA}. + +
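+ The per-head heterogeneity MoA searches over can be pictured as one mask per head. A sketch assuming simple causal sliding windows of different widths; MoA's actual search space and length-dependent scaling rules are richer:

```python
import torch

def per_head_local_masks(seq_len, window_sizes):
    """One causal local-attention mask per head, each with its own window."""
    i = torch.arange(seq_len).unsqueeze(1)
    j = torch.arange(seq_len).unsqueeze(0)
    masks = [(j <= i) & (i - j < w) for w in window_sizes]
    return torch.stack(masks)  # (num_heads, seq_len, seq_len), True = attend

masks = per_head_local_masks(1024, [128, 256, 512, 1024])
```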
+
+
+
+
+ + ♻ ☆ Detecting Brittle Decisions for Free: Leveraging Margin Consistency in + Deep Robust Classifiers + + +
+ Despite extensive research on adversarial training strategies to improve +robustness, the decisions of even the most robust deep learning models can +still be quite sensitive to imperceptible perturbations, creating serious risks +when deploying them for high-stakes real-world applications. While detecting +such cases may be critical, evaluating a model's vulnerability at a +per-instance level using adversarial attacks is computationally too intensive +and unsuitable for real-time deployment scenarios. While the input space margin +is the exact score for detecting non-robust samples, it is intractable to +compute for deep neural networks. This paper introduces the concept of margin +consistency -- a property that links the input space margins and the logit +margins in robust models -- for efficient detection of vulnerable samples. +First, we establish that margin consistency is a necessary and sufficient +condition to use a model's logit margin as a score for identifying non-robust +samples. Next, through comprehensive empirical analysis of various robustly +trained models on CIFAR10 and CIFAR100 datasets, we show that they exhibit high +margin consistency, with a strong correlation between their input space margins +and logit margins. Then, we show that we can effectively and confidently use +the logit margin to detect brittle decisions with such models. Finally, we +address cases where the model is not sufficiently margin-consistent by learning +a pseudo-margin from the feature representation. Our findings highlight the +potential of leveraging deep representations to efficiently assess adversarial +vulnerability in deployment scenarios. +
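+ Under margin consistency, the detector itself reduces to a few lines over the model's logits; a minimal sketch:

```python
import torch

def logit_margin(logits):
    """Top-1 minus top-2 logit. For margin-consistent robust models, small
    values flag samples whose decisions are likely brittle."""
    top2 = logits.topk(2, dim=-1).values
    return top2[..., 0] - top2[..., 1]
```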
+
+ comment: 10 pages, 6 figures, 2 tables. Version Update: NeurIPS Camera Ready +
+
+
+
+
+ + ♻ ☆ Optimizing Monotone Chance-Constrained Submodular Functions Using + Evolutionary Multi-Objective Algorithms + + +
+ Many real-world optimization problems can be stated in terms of submodular +functions. Furthermore, these real-world problems often involve uncertainties +which may lead to the violation of given constraints. Many evolutionary +multi-objective algorithms following the Pareto optimization approach have +recently been analyzed and applied to submodular problems with different types +of constraints. We present the first runtime analysis of evolutionary +multi-objective algorithms based on Pareto optimization for chance-constrained +submodular functions. Here the constraint involves stochastic components and +may only be violated with a small probability $\alpha$. We investigate the +classical GSEMO algorithm for two different bi-objective formulations using +tail bounds to determine the feasibility of solutions. We show that GSEMO +obtains the same worst-case performance guarantees for monotone submodular +functions as recently analyzed greedy algorithms for the case of uniform IID +weights and uniformly distributed weights with the same dispersion, when using +the appropriate bi-objective formulation. As part of our investigations, we +also point out situations where the use of tail bounds in the first +bi-objective formulation can prevent GSEMO from obtaining good solutions in the +case of uniformly distributed weights with the same dispersion, if the +objective function is submodular but non-monotone due to a single element +impacting monotonicity. Furthermore, we investigate the behavior of the +evolutionary multi-objective algorithms GSEMO, NSGA-II, and SPEA2 on different +submodular chance-constrained network problems. Our experimental results show +that the use of evolutionary multi-objective algorithms leads to significant +performance improvements compared to state-of-the-art greedy algorithms for +submodular optimization. +
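+ For concreteness, a bare-bones GSEMO loop for maximization with a Pareto archive; the tail-bound feasibility checks of the chance-constrained formulations are omitted, and solutions are assumed hashable (e.g., tuples of bits):

```python
import random

def gsemo(init, objectives, mutate, steps):
    """Minimal GSEMO sketch: mutate a random archive member, keep the child
    if it is not dominated, and drop members the child dominates."""
    def dominates(a, b):
        return all(x >= y for x, y in zip(a, b)) and a != b

    archive = {init: objectives(init)}
    for _ in range(steps):
        parent = random.choice(list(archive))
        child = mutate(parent)
        f_child = objectives(child)
        if not any(dominates(f, f_child) for f in archive.values()):
            archive = {s: f for s, f in archive.items()
                       if not dominates(f_child, f)}
            archive[child] = f_child
    return archive
```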
+
+ comment: To appear in the Evolutionary Computation Journal 2024 +
+
+
+
+
+ + ♻ ☆ On the SDEs and Scaling Rules for Adaptive Gradient Algorithms + + +
+ Approximating Stochastic Gradient Descent (SGD) as a Stochastic Differential +Equation (SDE) has allowed researchers to enjoy the benefits of studying a +continuous optimization trajectory while carefully preserving the stochasticity +of SGD. Analogous study of adaptive gradient methods, such as RMSprop and Adam, +has been challenging because there were no rigorously proven SDE approximations +for these methods. This paper derives the SDE approximations for RMSprop and +Adam, giving theoretical guarantees of their correctness as well as +experimental validation of their applicability to common large-scale vision and +language settings. A key practical result is the derivation of a +$\textit{square root scaling rule}$ to adjust the optimization hyperparameters +of RMSprop and Adam when changing batch size, and its empirical validation in +deep learning settings. +
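+ The practical takeaway fits in a few lines; the sketch below shows only the learning-rate part of the rule, which is the piece most users need:

```python
def sqrt_scaled_lr(base_lr, base_batch_size, new_batch_size):
    """Square root scaling rule for adaptive optimizers (RMSprop, Adam):
    when the batch size grows by a factor k, scale the learning rate
    by sqrt(k)."""
    k = new_batch_size / base_batch_size
    return base_lr * k ** 0.5

print(sqrt_scaled_lr(3e-4, 256, 1024))  # 4x batch -> 2x learning rate
```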
+
+ comment: revised for correcting errors in some figures +
+
+
+
+
+ + ♻ ☆ Robustness of graph embedding methods for community detection + + +
+ This study investigates the robustness of graph embedding methods for +community detection in the face of network perturbations, specifically edge +deletions. Graph embedding techniques, which represent nodes as low-dimensional +vectors, are widely used for various graph machine learning tasks due to their +ability to capture structural properties of networks effectively. However, the +impact of perturbations on the performance of these methods remains relatively +understudied. The research considers state-of-the-art graph embedding methods +from two families: matrix factorization (e.g., LE, LLE, HOPE, M-NMF) and random +walk-based (e.g., DeepWalk, LINE, node2vec). Through experiments conducted on +both synthetic and real-world networks, the study reveals varying degrees of +robustness within each family of graph embedding methods. The robustness is +found to be influenced by factors such as network size, initial community +partition strength, and the type of perturbation. Notably, node2vec and LLE +consistently demonstrate higher robustness for community detection across +different scenarios, including networks with degree and community size +heterogeneity. These findings highlight the importance of selecting an +appropriate graph embedding method based on the specific characteristics of the +network and the task at hand, particularly in scenarios where robustness to +perturbations is crucial. + +
+
+ comment: 17 pages, 26 figures, 3 tables. Comments are welcome +
+
+
+
+
+ + ♻ ☆ Discrete Dictionary-based Decomposition Layer for Structured + Representation Learning NeurIPS 2024 + + +
+ Neuro-symbolic neural networks have been extensively studied to integrate +symbolic operations with neural networks, thereby improving systematic +generalization. Specifically, Tensor Product Representation (TPR) framework +enables neural networks to perform differentiable symbolic operations by +encoding the symbolic structure of data within vector spaces. However, +TPR-based neural networks often struggle to decompose unseen data into +structured TPR representations, undermining their symbolic operations. To +address this decomposition problem, we propose a Discrete Dictionary-based +Decomposition (D3) layer designed to enhance the decomposition capabilities of +TPR-based models. D3 employs discrete, learnable key-value dictionaries trained +to capture symbolic features essential for decomposition operations. It +leverages the prior knowledge acquired during training to generate structured +TPR representations by mapping input data to pre-learned symbolic features +within these dictionaries. D3 is a straightforward drop-in layer that can be +seamlessly integrated into any TPR-based model without modifications. Our +experimental results demonstrate that D3 significantly improves the systematic +generalization of various TPR-based models while requiring fewer additional +parameters. Notably, D3 outperforms baseline models on the synthetic task that +demands the systematic decomposition of unseen combinatorial data. + +
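+ A hedged sketch of the dictionary-lookup idea, where the nearest learned key returns its learned value; the real D3 layer's training details, such as gradient estimation through the discrete choice, are omitted:

```python
import torch
import torch.nn as nn

class D3Sketch(nn.Module):
    """Discrete key-value dictionary lookup (illustrative only)."""
    def __init__(self, num_entries, dim):
        super().__init__()
        self.keys = nn.Parameter(torch.randn(num_entries, dim))
        self.values = nn.Parameter(torch.randn(num_entries, dim))

    def forward(self, x):  # x: (batch, dim)
        idx = torch.cdist(x, self.keys).argmin(dim=-1)
        return self.values[idx]  # pre-learned symbolic features
```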
+
+ comment: Published in NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ On the Role of Attention Masks and LayerNorm in Transformers NeurIPS 2024 + + +
+ Self-attention is the key mechanism of transformers, which are the essential +building blocks of modern foundation models. Recent studies have shown that +pure self-attention suffers from an increasing degree of rank collapse as depth +increases, limiting model expressivity and further utilization of model depth. +The existing literature on rank collapse, however, has mostly overlooked other +critical components in transformers that may alleviate the rank collapse issue. +In this paper, we provide a general analysis of rank collapse under +self-attention, taking into account the effects of attention masks and layer +normalization (LayerNorm). In particular, we find that although pure masked +attention still suffers from exponential collapse to a rank one subspace, +sparse or local masked attention can provably slow down the collapse rate. In +the case of self-attention with LayerNorm, we first show that for certain +classes of value matrices, collapse to a rank one subspace still happens +exponentially. However, through construction of nontrivial counterexamples, we +then establish that with proper choice of value matrices, a general class of +sequences may not converge to a rank one subspace, and the self-attention +dynamics with LayerNorm can simultaneously possess a rich set of equilibria +with any possible rank between one and full. Our result refutes the previous +hypothesis that LayerNorm plays no role in the rank collapse of self-attention +and suggests that self-attention with LayerNorm constitutes a much more +expressive, versatile nonlinear dynamical system than what was originally +thought. + +
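+ Rank collapse is straightforward to measure empirically. A small sketch using the fraction of spectral energy outside the leading singular direction, a common proxy (not necessarily the exact metric used in the paper):

```python
import torch

def rank_one_residual(X):
    """For a token matrix X of shape (seq_len, dim), return the energy
    outside the top singular direction; values near zero indicate collapse
    to a rank-one subspace."""
    s = torch.linalg.svdvals(X)
    return (s[1:].square().sum() / s.square().sum()).sqrt()
```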
+
+ comment: NeurIPS 2024. Fixed errors in v1 and added new remarks +
+
+
+
+
+ + ♻ ☆ A Framework for Real-Time Volcano-Seismic Event Recognition Based on + Multi-Station Seismograms and Semantic Segmentation Models + + +
+ In volcano monitoring, effective recognition of seismic events is essential +for understanding volcanic activity and raising timely warning alerts. +Traditional methods rely on manual analysis, which can be subjective and +labor-intensive. Furthermore, current automatic approaches often tackle +detection and classification separately, mostly relying on single-station +information, and generally require tailored preprocessing and representations +to perform predictions. These limitations often hinder their application to +real-time monitoring and their use across different volcano conditions. This +study introduces a novel approach that utilizes semantic segmentation models to +automate seismic event recognition by applying a straightforward transformation +of multi-channel 1D signals into 2D representations, enabling their use as +images. Our framework employs a data-driven, end-to-end design that integrates +multi-station seismic data with minimal preprocessing, performing both +detection and classification simultaneously for five seismic event classes. We +evaluated four state-of-the-art segmentation models (UNet, UNet++, DeepLabV3+ +and SwinUNet) on approximately 25,000 seismic events recorded at four different +Chilean volcanoes: Nevados del Chill\'an Volcanic Complex, Laguna del Maule, +Villarrica and Puyehue-Cord\'on Caulle. Among these models, the UNet +architecture was identified as the most effective, achieving mean F1 and +Intersection over Union (IoU) scores of up to 0.91 and 0.88, respectively, and +demonstrating superior noise robustness and flexibility on unseen volcano +datasets. +
+
+ comment: 10 pages, 9 figures. This is a pre-print; it is currently under + review for publication +
+
+
+
+
+ + ♻ ☆ Decision-Making Behavior Evaluation Framework for LLMs under Uncertain + Context + + +
+ When making decisions under uncertainty, individuals often deviate from +rational behavior, which can be evaluated across three dimensions: risk +preference, probability weighting, and loss aversion. Given the widespread use +of large language models (LLMs) in decision-making processes, it is crucial to +assess whether their behavior aligns with human norms and ethical expectations +or exhibits potential biases. Several empirical studies have investigated the +rationality and social behavior performance of LLMs, yet their internal +decision-making tendencies and capabilities remain inadequately understood. +This paper proposes a framework, grounded in behavioral economics, to evaluate +the decision-making behaviors of LLMs. Through a multiple-choice-list +experiment, we estimate the degree of risk preference, probability weighting, +and loss aversion in a context-free setting for three commercial LLMs: +ChatGPT-4.0-Turbo, Claude-3-Opus, and Gemini-1.0-pro. Our results reveal that +LLMs generally exhibit patterns similar to humans, such as risk aversion and +loss aversion, with a tendency to overweight small probabilities. However, +there are significant variations in the degree to which these behaviors are +expressed across different LLMs. We also explore their behavior when embedded +with socio-demographic features, uncovering significant disparities. For +instance, when modeled with attributes of sexual minority groups or physical +disabilities, Claude-3-Opus displays increased risk aversion, leading to more +conservative choices. These findings underscore the need for careful +consideration of the ethical implications and potential biases in deploying +LLMs in decision-making scenarios. Therefore, this study advocates for +developing standards and guidelines to ensure that LLMs operate within ethical +boundaries while enhancing their utility in complex decision-making +environments. + +
+
+ comment: Jingru Jia and Zehua Yuan have equal contribution +
+
+
+
+
+ + ♻ ☆ LRM-Zero: Training Large Reconstruction Models with Synthesized Data NeurIPS 2024 + + +
+ We present LRM-Zero, a Large Reconstruction Model (LRM) trained entirely on +synthesized 3D data, achieving high-quality sparse-view 3D reconstruction. The +core of LRM-Zero is our procedural 3D dataset, Zeroverse, which is +automatically synthesized from simple primitive shapes with random texturing +and augmentations (e.g., height fields, boolean differences, and wireframes). +Unlike previous 3D datasets (e.g., Objaverse) which are often captured or +crafted by humans to approximate real 3D data, Zeroverse completely ignores +realistic global semantics but is rich in complex geometric and texture details +that are locally similar to or even more intricate than real objects. We +demonstrate that our LRM-Zero, trained with our fully synthesized Zeroverse, +can achieve high visual quality in the reconstruction of real-world objects, +competitive with models trained on Objaverse. We also analyze several critical +design choices of Zeroverse that contribute to LRM-Zero's capability and +training stability. Our work demonstrates that 3D reconstruction, one of the +core tasks in 3D vision, can potentially be addressed without the semantics of +real-world objects. The Zeroverse's procedural synthesis code and interactive +visualization are available at: https://desaixie.github.io/lrm-zero/. + +
+
+ comment: 23 pages, 8 figures. Our code and interactive visualization are + available at: https://desaixie.github.io/lrm-zero/. v2: NeurIPS 2024 Camera + Ready version +
+
+
+
+
+ + ♻ ☆ DiffusionPDE: Generative PDE-Solving Under Partial Observation NeurIPS 2024 + + +
+ We introduce a general framework for solving partial differential equations +(PDEs) using generative diffusion models. In particular, we focus on scenarios +where we do not have the full knowledge of the scene necessary to apply +classical solvers. Most existing forward or inverse PDE approaches perform +poorly when the observations of the data or the underlying coefficients are +incomplete, which is a common situation for real-world measurements. In this +work, we propose DiffusionPDE, which can simultaneously fill in the missing +information and solve a PDE by modeling the joint distribution of the solution +and coefficient spaces. We show that the learned generative priors lead to a +versatile framework for accurately solving a wide range of PDEs under partial +observation, significantly outperforming the state-of-the-art methods for both +forward and inverse directions. +
+
+ comment: NeurIPS 2024. Project page: + https://jhhuangchloe.github.io/Diffusion-PDE/ +
+
+
+
+
+ + ♻ ☆ Trace is the Next AutoDiff: Generative Optimization with Rich Feedback, + Execution Traces, and LLMs + + +
+ We study a class of optimization problems motivated by automating the design +and update of AI systems like coding assistants, robots, and copilots. AutoDiff +frameworks, like PyTorch, enable efficient end-to-end optimization of +differentiable systems. However, general computational workflows can be +non-differentiable and involve rich feedback (e.g. console output or user's +responses), heterogeneous parameters (e.g. prompts, codes), and intricate +objectives (beyond maximizing a score). We investigate end-to-end generative +optimization -- using generative models such as LLMs within the optimizer for +automatic updating of general computational workflows. We discover that +workflow execution traces are akin to back-propagated gradients in AutoDiff and +can provide key information to interpret feedback for efficient optimization. +Formally, we frame a new mathematical setup, Optimization with Trace Oracle +(OPTO). In OPTO, an optimizer receives an execution trace along with feedback +on the computed output and updates parameters iteratively. We provide a Python +library, Trace, that efficiently converts a workflow optimization problem into +an OPTO instance using PyTorch-like syntax. Using Trace, we develop a general +LLM-based generative optimizer called OptoPrime. In empirical studies, we find +that OptoPrime is capable of first-order numerical optimization, prompt +optimization, hyper-parameter tuning, robot controller design, code debugging, +etc., and is often competitive with specialized optimizers for each domain. We +envision Trace as an open research platform for devising novel generative +optimizers and developing the next generation of interactive learning agents. +Website: https://microsoft.github.io/Trace/. + +
+
+
+
+
+
+
+
+ + Multimedia 1 + +
+
+
+ + ♻ ☆ Towards Robust Multimodal Sentiment Analysis with Incomplete Data NeurIPS 2024 + + +
+ The field of Multimodal Sentiment Analysis (MSA) has recently witnessed an +emerging direction seeking to tackle the issue of data incompleteness. +Recognizing that the language modality typically contains dense sentiment +information, we consider it as the dominant modality and present an innovative +Language-dominated Noise-resistant Learning Network (LNLN) to achieve robust +MSA. The proposed LNLN features a dominant modality correction (DMC) module and +dominant modality based multimodal learning (DMML) module, which enhances the +model's robustness across various noise scenarios by ensuring the quality of +dominant modality representations. Aside from the methodical design, we perform +comprehensive experiments under random data missing scenarios, utilizing +diverse and meaningful settings on several popular datasets (\textit{e.g.,} +MOSI, MOSEI, and SIMS), providing additional uniformity, transparency, and +fairness compared to existing evaluations in the literature. Empirically, LNLN +consistently outperforms existing baselines, demonstrating superior performance +across these challenging and extensive evaluation metrics. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 99 + +
+
+
+ + ☆ Teaching Embodied Reinforcement Learning Agents: Informativeness and + Diversity of Language Use EMNLP 2024 + + +
+ In real-world scenarios, it is desirable for embodied agents to have the +ability to leverage human language to gain explicit or implicit knowledge for +learning tasks. Despite recent progress, most previous approaches adopt simple +low-level instructions as language inputs, which may not reflect natural human +communication. It's not clear how to incorporate rich language use to +facilitate task learning. To address this question, this paper studies +different types of language inputs in facilitating reinforcement learning (RL) +embodied agents. More specifically, we examine how different levels of language +informativeness (i.e., feedback on past behaviors and future guidance) and +diversity (i.e., variation of language expressions) impact agent learning and +inference. Our empirical results based on four RL benchmarks demonstrate that +agents trained with diverse and informative language feedback can achieve +enhanced generalization and fast adaptation to new tasks. These findings +highlight the pivotal role of language use in teaching embodied agents new +tasks in an open world. Project website: +https://github.com/sled-group/Teachable_RL + +
+
+ comment: EMNLP 2024 Main. Project website: + https://github.com/sled-group/Teachable_RL +
+
+
+
+
+ + ☆ P-Masking: Power Law Masking Improves Multi-attribute Controlled + Generation + + +
+ We introduce LingGen, a novel approach for controlled text generation that +offers precise control over a wide array of linguistic attributes, even as the +number of attributes varies. LingGen employs a dynamic P-MASKING strategy, +which samples masking rates from a power law distribution during training. This +innovative approach enables the model to develop robust representations and +adapt its attribute control capabilities across a variable number of +attributes, from a single attribute to multiple complex configurations. The +P-MASKING technique enhances LingGen's ability to manage different levels of +attribute visibility, resulting in superior performance in multi-attribute +generation tasks. Our experiments demonstrate that LingGen surpasses current +state-of-the-art models in both attribute control accuracy and text fluency, +particularly excelling in scenarios with varying attribute demands. +Additionally, our ablation studies highlight the effectiveness of P-MASKING and +the influence of different base language models on performance. These findings +demonstrate LingGen's potential for applications requiring precise and +adaptable control over multiple linguistic attributes in text generation. + +
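+ The P-MASKING draw itself is tiny. A sketch assuming a truncated Pareto-style power law for the masking rate; the exponent and minimum rate are illustrative values, not the paper's:

```python
import numpy as np

def sample_mask_rate(alpha=2.0, r_min=0.05, rng=None):
    """Draw a masking rate r in [r_min, 1] from P(r) proportional to
    r^(-alpha), via inverse-CDF sampling."""
    rng = rng or np.random.default_rng()
    u = 1.0 - rng.random()  # in (0, 1], avoids division by zero
    return min(r_min * u ** (-1.0 / (alpha - 1.0)), 1.0)
```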
+
+
+
+
+ + ☆ Length-Induced Embedding Collapse in Transformer-based Models + + +
+ Text embeddings enable various applications, but their performance +deteriorates on longer texts. In this paper, we find that the performance +degradation is due to a phenomenon called Length Collapse, where longer text +embeddings collapse into a narrow space. This collapse results in a +distributional inconsistency between embeddings of different text lengths, +ultimately hurting the performance of downstream tasks. Theoretically, by +considering that the self-attention mechanism inherently functions as a +low-pass filter, we prove that long sequences increase the attenuation rate of +the low-pass filter effect of the self-attention mechanism. With layers going +deeper, excessive low-pass filtering causes the token signals to retain only +their Direct-Current (DC) component, which means the input token feature maps +will collapse into a narrow space, especially in long texts. Based on the above +analysis, we propose to mitigate the undesirable length collapse limitation by +introducing a temperature in softmax(), which achieves a higher low-pass filter +attenuation rate. The tuning-free method, called TempScale, can be plugged into +multiple transformer-based embedding models. Empirically, we demonstrate that +TempScale can improve existing embedding models, especially on long text +inputs, bringing up to 0.53% performance gains on 40 datasets from the Massive +Text Embedding Benchmark (MTEB) and 0.82% performance gains on 4 datasets from +LongEmbed, which specifically focuses on long-context retrieval. +
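+ The TempScale fix amounts to a one-line change to attention. A sketch under the assumption that the temperature divides the attention logits; the appropriate temperature value is the method's tuning knob:

```python
import torch
import torch.nn.functional as F

def tempscale_attention(q, k, v, temperature):
    """Scaled dot-product attention with an extra softmax temperature."""
    d = q.size(-1)
    logits = q @ k.transpose(-2, -1) / (d ** 0.5 * temperature)
    return F.softmax(logits, dim=-1) @ v
```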
+
+
+
+
+ + ☆ Multi-Attribute Linguistic Tuning for Controlled Paraphrase Generation + + +
+ We present a novel approach to paraphrase generation that enables precise +control and fine-tuning of 40 linguistic attributes for English. Our model is +an encoder-decoder architecture that takes as input a source sentence and +desired linguistic attributes, and produces paraphrases of the source that +satisfy the desired attributes. To guarantee high-quality outputs at inference +time, our method is equipped with a quality control mechanism that gradually +adjusts the embedding of linguistic attributes to find the nearest and most +attainable configuration of desired attributes for paraphrase generation. We +evaluate the effectiveness of our method by comparing it to recent controllable +generation models. Experimental results demonstrate that the proposed model +outperforms baselines in generating paraphrases that satisfy desired linguistic +attributes. + +
+
+
+
+
+ + ☆ Hidden Persuaders: LLMs' Political Leaning and Their Influence on Voters EMNLP 2024 + + +
+ How could LLMs influence our democracy? We investigate LLMs' political +leanings and the potential influence of LLMs on voters by conducting multiple +experiments in a U.S. presidential election context. Through a voting +simulation, we first demonstrate 18 open- and closed-weight LLMs' political +preference for a Democratic nominee over a Republican nominee. We show how this +leaning towards the Democratic nominee becomes more pronounced in +instruction-tuned models compared to their base versions by analyzing their +responses to candidate-policy related questions. We further explore the +potential impact of LLMs on voter choice by conducting an experiment with 935 +U.S. registered voters. During the experiments, participants interacted with +LLMs (Claude-3, Llama-3, and GPT-4) over five exchanges. The experiment results +show a shift in voter choices towards the Democratic nominee following LLM +interaction, widening the voting margin from 0.7% to 4.6%, even though LLMs +were not asked to persuade users to support the Democratic nominee during the +discourse. This effect is larger than many previous studies on the +persuasiveness of political campaigns, which have shown minimal effects in +presidential elections. Many users also expressed a desire for further +political interaction with LLMs. Which aspects of LLM interactions drove these +shifts in voter choice requires further study. Lastly, we explore how a safety +method can make LLMs more politically neutral, while leaving some open +questions. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ☆ DC-Spin: A Speaker-invariant Speech Tokenizer for Spoken Language Models + + +
+ Spoken language models (SLMs) have gained increasing attention with +advancements in text-based, decoder-only language models. SLMs process text and +speech, enabling simultaneous speech understanding and generation. This paper +presents Double-Codebook Speaker-invariant Clustering (DC-Spin), which aims to +improve speech tokenization by bridging audio signals and SLM tokens. DC-Spin +extracts speaker-invariant tokens rich in phonetic information and resilient to +input variations, enhancing zero-shot SLM tasks and speech resynthesis. We +propose a chunk-wise approach to enable streamable DC-Spin without retraining +and degradation. Comparisons of tokenization methods (self-supervised and +neural audio codecs), model scalability, and downstream task proxies show that +tokens easily modeled by an n-gram LM or aligned with phonemes offer strong +performance, providing insights for designing speech tokenizers for SLMs. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Constraint Back-translation Improves Complex Instruction Following of + Large Language Models + + +
+ Large language models (LLMs) struggle to follow instructions with complex +constraints in format, length, etc. Following the conventional +instruction-tuning practice, previous works conduct post-training on complex +instruction-response pairs generated by feeding complex instructions to +advanced LLMs. However, even advanced LLMs cannot follow complex instructions +well, thus limiting the quality of generated data. In this work, we find that +existing datasets inherently contain implicit complex constraints and propose a +novel data generation technique, constraint back-translation. Specifically, we +take the high-quality instruction-response pairs in existing datasets and only +adopt advanced LLMs to add complex constraints already met by the responses to +the instructions, which naturally reduces costs and data noise. In the +experiments, we adopt Llama3-70B-Instruct to back-translate constraints and +create a high-quality complex instruction-response dataset, named CRAB. We +present that post-training on CRAB improves multiple backbone LLMs' complex +instruction-following ability, evaluated on extensive instruction-following +benchmarks. We further find that constraint back-translation also serves as a +useful auxiliary training objective in post-training. Our code, data, and +models will be released to facilitate future research. + +
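+ The data-generation loop is simple by design. A hedged sketch with a hypothetical `llm` callable standing in for Llama3-70B-Instruct; the prompt wording and the way constraints are merged into the instruction are assumptions:

```python
def back_translate_constraints(pairs, llm):
    """Build complex instruction-response pairs from existing clean pairs;
    the LLM only names constraints the response already satisfies."""
    augmented = []
    for instruction, response in pairs:
        constraints = llm(
            "List the format, length, and style constraints that the "
            f"following response already satisfies:\n{response}"
        )
        augmented.append((f"{instruction}\n{constraints}", response))
    return augmented
```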
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ Novel Architecture for Distributed Travel Data Integration and Service + Provision Using Microservices + + +
+ This paper introduces a microservices architecture designed to improve the +flexibility and performance of an airline reservation system. The architecture +incorporates Redis caching, two messaging systems (Kafka and RabbitMQ), and two +types of storage (MongoDB and PostgreSQL). It also employs authorization +techniques, including secure communication through OAuth2 and JWT, which are +essential for managing high-demand travel services. According to the selected +indicators, the architecture achieves data consistency of 99.5% and a data +propagation latency below 75 ms, allowing rapid and reliable intercommunication +between microservices. A system throughput of 1050 events per second was +achieved, so that acceptable performance was maintained even at peak times. +Redis caching achieved a 92% cache hit ratio, lowering the load on the database +and increasing response speed. The system's scalability was further improved +through Docker and Kubernetes, which allow services to scale horizontally to +cope with changes in demand. Error rates were very low, at 0.2%, further +underscoring the system's efficiency in handling real-time data integration. +The proposed approach meets the specific needs of an airline reservation +system: it is secure, fast, and scalable, improving both the user experience +and operational efficiency. The low latency, high level of data integration, +and efficient resource usage demonstrate the architecture's ability to provide +continued support in ever-growing, high-demand situations. +
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ☆ Redefining <Creative> in Dictionary: Towards an Enhanced Semantic + Understanding of Creative Generation + + +
+ Creativity, both in humans and in diffusion models, remains an inherently +abstract concept; thus, simply adding "creative" to a prompt does not yield +reliable semantic recognition by the model. In this work, we concretize the +abstract notion of "creative" through the TP2O task, which aims to merge two +unrelated concepts, and introduce CreTok, redefining "creative" as the token +$\texttt{<CreTok>}$. This redefinition offers a more concrete and universally +adaptable representation for concept blending. The redefinition occurs +continuously, involving the repeated random sampling of text pairs with +different concepts and optimizing cosine similarity between target and constant +prompts. This approach enables $\texttt{<CreTok>}$ to learn a method for +creative concept fusion. Extensive experiments demonstrate that the creative +capability enabled by $\texttt{<CreTok>}$ substantially surpasses recent SOTA +diffusion models and achieves superior creative generation. CreTok exhibits +greater flexibility and reduced time overhead, as $\texttt{<CreTok>}$ can +function as a universal token for any concept, facilitating creative generation +without retraining. +
+
+
+
+
+ + ☆ GPT or BERT: why not both? + + +
+ We present a simple way to merge masked language modeling with causal +language modeling. This hybrid training objective results in a model that +combines the strengths of both modeling paradigms within a single transformer +stack: GPT-BERT can be transparently used like any standard causal or masked +language model. We test the pretraining process that enables this flexible +behavior on the BabyLM Challenge 2024. The results show that the hybrid +pretraining outperforms masked-only or causal-only models. We openly release +the models, training corpora and code. + +
+
+ comment: 22 pages; submission to the BabyLM Challenge 2024 +
+
+
+
+
+ + ☆ Thought Space Explorer: Navigating and Expanding Thought Space for Large + Language Model Reasoning + + +
+ Recent advances in large language models (LLMs) have demonstrated their +potential in handling complex reasoning tasks, which are usually achieved by +constructing a thought chain to guide the model to solve the problem with +multi-step thinking. However, existing methods often remain confined to +previously explored solution spaces and thus overlook the critical blind spot +within LLMs' cognitive range. To address these issues, we design the Thought +Space Explorer (TSE), a novel framework to expand and optimize thought +structures to guide LLMs to explore their blind spots of thinking. By +generating new reasoning steps and branches based on the original thought +structure with various designed strategies, TSE broadens the thought space and +alleviates the impact of blind spots for LLM reasoning. Experimental results on +multiple levels of reasoning tasks demonstrate the efficacy of TSE. We also +conduct extensive analysis to understand how structured and expansive thought +can contribute to unleashing the potential of LLM reasoning capabilities. + +
+
+
+
+
+ + ☆ Scaling Concept With Text-Guided Diffusion Models + + +
+ Text-guided diffusion models have revolutionized generative tasks by +producing high-fidelity content from text descriptions. They have also enabled +an editing paradigm where concepts can be replaced through text conditioning +(e.g., a dog to a tiger). In this work, we explore a novel approach: instead of +replacing a concept, can we enhance or suppress the concept itself? Through an +empirical study, we identify a trend where concepts can be decomposed in +text-guided diffusion models. Leveraging this insight, we introduce +ScalingConcept, a simple yet effective method to scale decomposed concepts up +or down in real input without introducing new elements. To systematically +evaluate our approach, we present the WeakConcept-10 dataset, where concepts +are imperfect and need to be enhanced. More importantly, ScalingConcept enables +a variety of novel zero-shot applications across image and audio domains, +including tasks such as canonical pose generation and generative sound +highlighting or removal. + +
+
+ comment: Project page: https://wikichao.github.io/ScalingConcept/ +
+
+
+
+
+ + ☆ Don't Touch My Diacritics + + +
+ The common practice of preprocessing text before feeding it into NLP models +introduces many decision points which have unintended consequences on model +performance. In this opinion piece, we focus on the handling of diacritics in +texts originating in many languages and scripts. We demonstrate, through +several case studies, the adverse effects of inconsistent encoding of +diacritized characters and of removing diacritics altogether. We call on the +community to adopt simple but necessary steps across all models and toolkits in +order to improve handling of diacritized text and, by extension, increase +equity in multilingual NLP. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Nearest Neighbor Normalization Improves Multimodal Retrieval + + +
+ Multimodal models leverage large-scale pre-training to achieve strong but +still imperfect performance on tasks such as image captioning, visual question +answering, and cross-modal retrieval. In this paper, we present a simple and +efficient method for correcting errors in trained contrastive image-text +retrieval models with no additional training, called Nearest Neighbor +Normalization (NNN). We show an improvement on retrieval metrics in both text +retrieval and image retrieval for all of the contrastive models that we tested +(CLIP, BLIP, ALBEF, SigLIP, BEiT) and for both of the datasets that we used +(MS-COCO and Flickr30k). NNN requires a reference database, but does not +require any training on this database, and can even increase the retrieval +accuracy of a model after finetuning. + +
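+ A hedged sketch of the normalization step: debias each gallery item's score by its mean top-k similarity to a set of reference queries. The value of k and the bias weight are hyperparameters assumed here:

```python
import numpy as np

def nnn_scores(sim_query_gallery, sim_refs_gallery, k=16, weight=0.5):
    """sim_query_gallery: (num_gallery,) similarities for the live query.
    sim_refs_gallery: (num_refs, num_gallery) similarities for references."""
    topk = np.sort(sim_refs_gallery, axis=0)[-k:]  # top-k per gallery item
    bias = topk.mean(axis=0)                       # per-item hubness estimate
    return sim_query_gallery - weight * bias
```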
+
+
+
+
+ + ☆ In-Context Fine-Tuning for Time-Series Foundation Models + + +
+ Motivated by the recent success of time-series foundation models for +zero-shot forecasting, we present a methodology for $\textit{in-context +fine-tuning}$ of a time-series foundation model. In particular, we design a +pretrained foundation model that can be prompted (at inference time) with +multiple time-series examples, in order to forecast a target time-series into +the future. Our foundation model is specifically trained to utilize examples +from multiple related time-series in its context window (in addition to the +history of the target time-series) to help it adapt to the specific +distribution of the target domain at inference time. We show that such a +foundation model that uses in-context examples at inference time can obtain +much better performance on popular forecasting benchmarks compared to +supervised deep learning methods, statistical models, as well as other +time-series foundation models. Interestingly, our in-context fine-tuning +approach even rivals the performance of a foundation model that is explicitly +fine-tuned on the target domain. + +
+
+
+
+
+ + ☆ Desert Camels and Oil Sheikhs: Arab-Centric Red Teaming of Frontier LLMs + + +
+ Large language models (LLMs) are widely used but raise ethical concerns due +to embedded social biases. This study examines LLM biases against Arabs versus +Westerners across eight domains, including women's rights, terrorism, and +anti-Semitism, and assesses model resistance to perpetuating these biases. To +this end, we create two datasets: one to evaluate LLM bias toward Arabs versus +Westerners and another to test model safety against prompts that exaggerate +negative traits ("jailbreaks"). We evaluate six LLMs -- GPT-4, GPT-4o, LlaMA +3.1 (8B & 405B), Mistral 7B, and Claude 3.5 Sonnet. We find negative biases +toward Arabs in 79% of cases, with LlaMA 3.1-405B being the most biased. Our +jailbreak tests reveal GPT-4o as the most vulnerable, despite being an +optimized version of GPT-4, followed by LlaMA 3.1-8B and Mistral 7B. All LLMs +except Claude exhibit attack success rates above 87% in three categories. We +find Claude 3.5 Sonnet to be the safest, although it still displays biases in +seven of eight categories. GPT-4o's greater susceptibility to biases and +jailbreaks, despite being an optimized version of GPT-4, suggests optimization +flaws. Our findings underscore the pressing need for more robust bias +mitigation strategies and strengthened security measures in LLMs. +
+
+
+
+
+ + ☆ Navigating the Unknown: A Chat-Based Collaborative Interface for + Personalized Exploratory Tasks + + +
+ The rise of large language models (LLMs) has revolutionized user interactions +with knowledge-based systems, enabling chatbots to synthesize vast amounts of +information and assist with complex, exploratory tasks. However, LLM-based +chatbots often struggle to provide personalized support, particularly when +users start with vague queries or lack sufficient contextual information. This +paper introduces the Collaborative Assistant for Personalized Exploration +(CARE), a system designed to enhance personalization in exploratory tasks by +combining a multi-agent LLM framework with a structured user interface. CARE's +interface consists of a Chat Panel, Solution Panel, and Needs Panel, enabling +iterative query refinement and dynamic solution generation. The multi-agent +framework collaborates to identify both explicit and implicit user needs, +delivering tailored, actionable solutions. In a within-subject user study with +22 participants, CARE was consistently preferred over a baseline LLM chatbot, +with users praising its ability to reduce cognitive load, inspire creativity, +and provide more tailored solutions. Our findings highlight CARE's potential to +transform LLM-based systems from passive information retrievers to proactive +partners in personalized problem-solving and exploration. + +
+
+
+
+
+ + ☆ Joint Training for Selective Prediction + + +
+ Classifier models are prevalent in natural language processing (NLP), often +with high accuracy. Yet in real world settings, human-in-the-loop systems can +foster trust in model outputs and even higher performance. Selective Prediction +(SP) methods determine when to adopt a classifier's output versus defer to a +human. Previous SP approaches have addressed how to improve softmax as a +measure of model confidence, or have developed separate confidence estimators. +One previous method involves learning a deferral model based on engineered +features. We introduce a novel joint-training approach that simultaneously +optimizes learned representations used by the classifier module and a learned +deferral policy. Our results on four classification tasks demonstrate that +joint training not only leads to better SP outcomes over two strong baselines, +but also improves the performance of both modules. + +
+
+
+
+
+ + ☆ SFM-Protein: Integrative Co-evolutionary Pre-training for Advanced + Protein Sequence Representation + + +
+ Proteins, essential to biological systems, perform functions intricately +linked to their three-dimensional structures. Understanding the relationship +between protein structures and their amino acid sequences remains a core +challenge in protein modeling. While traditional protein foundation models +benefit from pre-training on vast unlabeled datasets, they often struggle to +capture critical co-evolutionary information, which evolutionary-based methods +excel at. In this study, we introduce a novel pre-training strategy for protein +foundation models that emphasizes the interactions among amino acid residues to +enhance the extraction of both short-range and long-range co-evolutionary +features from sequence data. Trained on a large-scale protein sequence dataset, +our model demonstrates superior generalization ability, outperforming +established baselines of similar size, including the ESM model, across diverse +downstream tasks. Experimental results confirm the model's effectiveness in +integrating co-evolutionary information, marking a significant step forward in +protein sequence-based modeling. + +
+
+
+
+
+ + ☆ Detecting text level intellectual influence with knowledge graph + embeddings + + +
+ Introduction: Tracing the spread of ideas and the presence of influence is a +question of special importance across a wide range of disciplines, ranging from +intellectual history to cultural analytics, computational social science, and +the science of science. + Method: We collect a corpus of open source journal articles, generate +Knowledge Graph representations using the Gemini LLM, and attempt to predict +the existence of citations between sampled pairs of articles using previously +published methods and a novel Graph Neural Network based embedding model. + Results: We demonstrate that our knowledge graph embedding method is superior +at distinguishing pairs of articles with and without citation. Once trained, it +runs efficiently and can be fine-tuned on specific corpora to suit individual +researcher needs. + Conclusion(s): This experiment demonstrates that the relationships encoded in +a knowledge graph, especially the types of concepts brought together by +specific relations can encode information capable of revealing intellectual +influence. This suggests that further work in analyzing document level +knowledge graphs to understand latent structures could provide valuable +insights. + +
+
+
+
+
+ + ☆ Speech is More Than Words: Do Speech-to-Text Translation Systems + Leverage Prosody? + + +
+ The prosody of a spoken utterance, including features like stress, intonation +and rhythm, can significantly affect the underlying semantics, and as a +consequence can also affect its textual translation. Nevertheless, prosody is +rarely studied within the context of speech-to-text translation (S2TT) systems. +In particular, end-to-end (E2E) systems have been proposed as well-suited for +prosody-aware translation because they have direct access to the speech signal +when making translation decisions, but the understanding of whether this is +successful in practice is still limited. A main challenge is the difficulty of +evaluating prosody awareness in translation. To address this challenge, we +introduce an evaluation methodology and a focused benchmark (named ContraProST) +aimed at capturing a wide range of prosodic phenomena. Our methodology uses +large language models and controllable text-to-speech (TTS) to generate +contrastive examples. Through experiments in translating English speech into +German, Spanish, and Japanese, we find that (a) S2TT models possess some +internal representation of prosody, but the prosody signal is often not strong +enough to affect the translations, (b) E2E systems outperform cascades of +speech recognition and text translation systems, confirming their theoretical +advantage in this regard, and (c) certain cascaded systems also capture +prosodic information in the translation, but only to a lesser extent that +depends on the particulars of the transcript's surface form. + +
+
+ comment: WMT 2024 +
+
+
+
+
+ + ☆ Multilingual Pretraining Using a Large Corpus Machine-Translated from a + Single Source Language + + +
+ English, as a very high-resource language, enables the pretraining of +high-quality large language models (LLMs). The same cannot be said for most +other languages, as leading LLMs still underperform for non-English languages, +likely due to a gap in the quality and diversity of the available multilingual +pretraining corpora. In this work, we find that machine-translated text from a +single high-quality source language can contribute significantly to the +pretraining of multilingual LLMs. We translate FineWeb-Edu, a high-quality +English web dataset, into French, German, and Spanish, resulting in a final +300B-token dataset, which we call TransWeb-Edu, and pretrain a 1.3B-parameter +model, CuatroLLM, from scratch on this dataset. Across five non-English +reasoning tasks, we show that CuatroLLM matches or outperforms state-of-the-art +multilingual models trained using closed data, such as Llama3.2 and Gemma2, +despite using an order of magnitude less data (about 6% of the tokens used for +Llama3.2's training). We further demonstrate that with additional +domain-specific pretraining, amounting to less than 1% of TransWeb-Edu, +CuatroLLM surpasses the state of the art in multilingual reasoning. To promote +reproducibility, we release our corpus, models, and training pipeline under +open licenses at hf.co/britllm/CuatroLLM. +
+
+
+
+
+ + ☆ Representative Social Choice: From Learning Theory to AI Alignment NeurIPS 2024 + + +
+ Social choice theory is the study of preference aggregation across a
+population, used both in mechanism design for human agents and in the
+democratic alignment of language models. In this study, we propose the
+representative social choice framework for the modeling of democratic
+representation in collective decisions, where the numbers of issues and
+individuals are too large for mechanisms to consider all preferences directly.
+These scenarios are widespread in real-world decision-making processes, such as
+jury trials, indirect elections, legislation processes, corporate governance,
+and, more recently, language model alignment. In representative social choice,
+the population is represented by a finite sample of individual-issue pairs
+based on which social choice decisions are made. We show that many of the
+deepest questions in representative social choice can be naturally formulated
+as statistical learning problems, and prove the generalization properties of
+social choice mechanisms using the theory of machine learning. We further
+formulate axioms for representative social choice, and prove Arrow-like
+impossibility theorems with new combinatorial tools of analysis. Our framework
+introduces the representative approach to social choice, opening up research
+directions at the intersection of social choice, learning theory, and AI
+alignment.
+
+
+
+ comment: Full version (20 pages). Under review. An excerpt was previously + accepted to NeurIPS 2024 Pluralistic Alignment Workshop +
+
+
+
+
+ + ☆ Language Models can Self-Lengthen to Generate Long Texts + + +
+ Recent advancements in Large Language Models (LLMs) have significantly +enhanced their ability to process long contexts, yet a notable gap remains in +generating long, aligned outputs. This limitation stems from a training gap +where pre-training lacks effective instructions for long-text generation, and +post-training data primarily consists of short query-response pairs. Current +approaches, such as instruction backtranslation and behavior imitation, face +challenges including data quality, copyright issues, and constraints on +proprietary model usage. In this paper, we introduce an innovative iterative +training framework called Self-Lengthen that leverages only the intrinsic +knowledge and skills of LLMs without the need for auxiliary data or proprietary +models. The framework consists of two roles: the Generator and the Extender. +The Generator produces the initial response, which is then split and expanded +by the Extender. This process results in a new, longer response, which is used +to train both the Generator and the Extender iteratively. Through this process, +the models are progressively trained to handle increasingly longer responses. +Experiments on benchmarks and human evaluations show that Self-Lengthen +outperforms existing methods in long-text generation, when applied to top +open-source LLMs such as Qwen2 and LLaMA3. Our code is publicly available at +https://github.com/QwenLM/Self-Lengthen. + +
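+
+ A hedged sketch of the Generator/Extender loop described above; `generate`
+stands in for any LLM call, and the prompt wording and helper names are
+assumptions, not the paper's actual templates:
+
+    def self_lengthen_round(query, generate, n_pieces=2):
+        response = generate(f"Answer the query:\n{query}")          # Generator
+        pieces = split_into_chunks(response, n_pieces)
+        extended = [
+            generate("Rewrite this passage in much greater detail, "
+                     f"keeping it consistent with the rest:\n{p}")  # Extender
+            for p in pieces
+        ]
+        longer = "\n".join(extended)
+        # (query, longer) pairs would then be used to fine-tune both roles,
+        # and the loop repeats with progressively longer targets.
+        return longer
+
+    def split_into_chunks(text, n):
+        words = text.split()
+        k = max(1, len(words) // n)
+        return [" ".join(words[i:i + k]) for i in range(0, len(words), k)]
+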
+
+
+
+
+ + ☆ BitStack: Fine-Grained Size Control for Compressed Large Language Models + in Variable Memory Environments + + +
+ Large language models (LLMs) have revolutionized numerous applications, yet +their deployment remains challenged by memory constraints on local devices. +While scaling laws have enhanced LLM capabilities, the primary bottleneck has +shifted from \textit{capability} to \textit{availability}, emphasizing the need +for efficient memory management. Traditional compression methods, such as +quantization, often require predefined compression ratios and separate +compression processes for each setting, complicating deployment in variable +memory environments. In this paper, we introduce \textbf{BitStack}, a novel, +training-free weight compression approach that enables megabyte-level +trade-offs between memory usage and model performance. By leveraging weight +decomposition, BitStack can dynamically adjust the model size with minimal +transmission between running memory and storage devices. Our approach +iteratively decomposes weight matrices while considering the significance of +each parameter, resulting in an approximately 1-bit per parameter residual +block in each decomposition iteration. These blocks are sorted and stacked in +storage as basic transmission units, with different quantities loaded based on +current memory availability. Extensive experiments across a wide range of tasks +demonstrate that, despite offering fine-grained size control, BitStack +consistently matches or surpasses strong quantization baselines, particularly +at extreme compression ratios. To the best of our knowledge, this is the first +decomposition-based method that effectively bridges the gap to practical +compression techniques like quantization. Code is available at +https://github.com/xinghaow99/BitStack. + +
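+
+ A minimal numpy sketch in the spirit of the decomposition described above:
+each iteration stores a sign matrix (roughly 1 bit per parameter) plus
+per-row scales, and a prefix of the stored blocks can be loaded to fit the
+available memory. This omits BitStack's parameter-significance weighting and
+is an assumption-laden illustration, not the released algorithm:
+
+    import numpy as np
+
+    def decompose(W, n_blocks=8):
+        residual = W.copy()
+        blocks = []
+        for _ in range(n_blocks):
+            signs = np.sign(residual)
+            signs[signs == 0] = 1.0
+            scales = np.abs(residual).mean(axis=1, keepdims=True)  # per row
+            blocks.append((signs.astype(np.int8), scales))
+            residual -= signs * scales
+        return blocks  # stacked in storage; load a prefix to fit memory
+
+    def reconstruct(blocks, k):
+        """Rebuild an approximation from the first k residual blocks."""
+        return sum(s.astype(np.float32) * c for s, c in blocks[:k])
+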
+
+
+
+
+ + ☆ Responsible Retrieval Augmented Generation for Climate Decision Making + from Documents + + +
+ Climate decision making is constrained by the complexity and inaccessibility +of key information within lengthy, technical, and multi-lingual documents. +Generative AI technologies offer a promising route for improving the +accessibility of information contained within these documents, but suffer from +limitations. These include (1) a tendency to hallucinate or mis-represent +information, (2) difficulty in steering or guaranteeing properties of generated +output, and (3) reduced performance in specific technical domains. To address +these challenges, we introduce a novel evaluation framework with +domain-specific dimensions tailored for climate-related documents. We then +apply this framework to evaluate Retrieval-Augmented Generation (RAG) +approaches and assess retrieval- and generation-quality within a prototype tool +that answers questions about individual climate law and policy documents. In +addition, we publish a human-annotated dataset and scalable automated +evaluation tools, with the aim of facilitating broader adoption and robust +assessment of these systems in the climate domain. Our findings highlight the +key components of responsible deployment of RAG to enhance decision-making, +while also providing insights into user experience (UX) considerations for +safely deploying such systems to build trust with users in high-risk domains. + +
+
+
+
+
+ + ☆ Leveraging LLMs for MT in Crisis Scenarios: a blueprint for low-resource + languages + + +
+ In an evolving landscape of crisis communication, the need for robust and
+adaptable Machine Translation (MT) systems is more pressing than ever,
+particularly for low-resource languages. This study presents a comprehensive
+exploration of leveraging Large Language Models (LLMs) and Multilingual LLMs
+(MLLMs) to enhance MT capabilities in such scenarios. By focusing on the unique
+challenges posed by crisis situations where speed, accuracy, and the ability to
+handle a wide range of languages are paramount, this research outlines a novel
+approach that combines the cutting-edge capabilities of LLMs with fine-tuning
+techniques and community-driven corpus development strategies. At the core of
+this study is the development and empirical evaluation of MT systems tailored
+for two low-resource language pairs, illustrating the process from initial
+model selection and fine-tuning through to deployment. Bespoke systems are
+developed and modelled on crisis scenarios drawn from the recent Covid-19
+pandemic. The research highlights the importance of community involvement in
+creating highly specialised, crisis-specific datasets and compares custom GPTs
+with NLLB-adapted MLLMs. It identifies fine-tuned MLLMs as offering superior
+performance compared with their LLM counterparts. A scalable and replicable
+model for rapid MT system development in crisis scenarios is outlined. Our
+approach enhances the field of humanitarian technology by offering a blueprint
+for developing multilingual communication systems during emergencies.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2403.02370, + arXiv:2403.01580 +
+
+
+
+
+ + ☆ Failure Modes of LLMs for Causal Reasoning on Narratives + + +
+ In this work, we investigate the causal reasoning abilities of large language +models (LLMs) through the representative problem of inferring causal +relationships from narratives. We find that even state-of-the-art language +models rely on unreliable shortcuts, both in terms of the narrative +presentation and their parametric knowledge. For example, LLMs tend to +determine causal relationships based on the topological ordering of events +(i.e., earlier events cause later ones), resulting in lower performance +whenever events are not narrated in their exact causal order. Similarly, we +demonstrate that LLMs struggle with long-term causal reasoning and often fail +when the narratives are long and contain many events. Additionally, we show +LLMs appear to rely heavily on their parametric knowledge at the expense of +reasoning over the provided narrative. This degrades their abilities whenever +the narrative opposes parametric knowledge. We extensively validate these +failure modes through carefully controlled synthetic experiments, as well as +evaluations on real-world narratives. Finally, we observe that explicitly +generating a causal graph generally improves performance while naive +chain-of-thought is ineffective. Collectively, our results distill precise +failure modes of current state-of-the-art models and can pave the way for +future techniques to enhance causal reasoning in LLMs. + +
+
+
+
+
+ + ☆ 'No' Matters: Out-of-Distribution Detection in Multimodality Long + Dialogue + + +
+ Out-of-distribution (OOD) detection in multimodal contexts is essential for
+identifying deviations in combined inputs from different modalities,
+particularly in applications like open-domain dialogue systems or real-life
+dialogue interactions. This paper aims to improve the user experience in
+multi-round long dialogues by efficiently detecting OOD dialogues and images.
+We introduce a novel scoring framework named Dialogue Image Aligning and
+Enhancing Framework (DIAEF) that integrates visual language models with newly
+proposed scores that detect OOD in two key scenarios: (1) mismatches between
+the dialogue and image input pair and (2) input pairs with previously unseen
+labels. Our experimental results, derived from various benchmarks, demonstrate
+that integrating image and multi-round dialogue OOD detection is more
+effective with previously unseen labels than using either modality
+independently. In the presence of mismatched pairs, our proposed score
+effectively identifies these mismatches and demonstrates strong robustness in
+long dialogues. This approach enhances domain-aware, adaptive conversational
+agents and establishes baselines for future studies.
+
+
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Audio Is the Achilles' Heel: Red Teaming Audio Large Multimodal Models + + +
+ Large Multimodal Models (LMMs) have demonstrated the ability to interact with +humans under real-world conditions by combining Large Language Models (LLMs) +and modality encoders to align multimodal information (visual and auditory) +with text. However, such models raise new safety challenges of whether models +that are safety-aligned on text also exhibit consistent safeguards for +multimodal inputs. Despite recent safety-alignment research on vision LMMs, the +safety of audio LMMs remains under-explored. In this work, we comprehensively +red team the safety of five advanced audio LMMs under three settings: (i) +harmful questions in both audio and text formats, (ii) harmful questions in +text format accompanied by distracting non-speech audio, and (iii) +speech-specific jailbreaks. Our results under these settings demonstrate that +open-source audio LMMs suffer an average attack success rate of 69.14% on +harmful audio questions, and exhibit safety vulnerabilities when distracted +with non-speech audio noise. Our speech-specific jailbreaks on Gemini-1.5-Pro +achieve an attack success rate of 70.67% on the harmful query benchmark. We +provide insights on what could cause these reported safety-misalignments. +Warning: this paper contains offensive examples. + +
+
+
+
+
+ + ☆ Can Language Models Perform Robust Reasoning in Chain-of-thought + Prompting with Noisy Rationales? NeurIPS 2024 + + +
+ This paper investigates an under-explored challenge in large language models
+(LLMs): chain-of-thought prompting with noisy rationales, which include
+irrelevant or inaccurate reasoning thoughts within examples used for in-context
+learning. We construct the NoRa dataset, which is tailored to evaluate the
+robustness of reasoning in the presence of noisy rationales. Our findings on
+the NoRa dataset reveal a prevalent vulnerability to such noise among current
+LLMs, with existing robust methods like self-correction and self-consistency
+showing limited efficacy. Notably, compared to prompting with clean rationales,
+the base LLM drops by 1.4%-19.8% in accuracy with irrelevant thoughts and, more
+drastically, by 2.2%-40.4% with inaccurate thoughts.
+ Addressing this challenge necessitates external supervision that should be
+accessible in practice. Here, we propose the method of contrastive denoising
+with noisy chain-of-thought (CD-CoT). It enhances LLMs' denoising-reasoning
+capabilities by contrasting noisy rationales with only one clean rationale,
+which can be the minimal requirement for denoising-purpose prompting. This
+method follows a principle of exploration and exploitation: (1) rephrasing and
+selecting rationales in the input space to achieve explicit denoising and (2)
+exploring diverse reasoning paths and voting on answers in the output space.
+Empirically, CD-CoT demonstrates an average improvement of 17.8% in accuracy
+over the base model and shows significantly stronger denoising capabilities
+than baseline methods. The source code is publicly available at:
+https://github.com/tmlr-group/NoisyRationales.
+
+
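+
+ An illustrative sketch of the two-step exploration/exploitation recipe above,
+with one clean rationale used to denoise noisy in-context examples; the `llm`
+callable, prompt wording, and answer extraction are assumptions:
+
+    from collections import Counter
+
+    def cd_cot_style(question, noisy_examples, clean_example, llm, n_paths=5):
+        # (1) Input space: rephrase noisy rationales by contrasting them
+        # with the single clean rationale.
+        denoised = [
+            llm(f"Here is a correct worked example:\n{clean_example}\n"
+                "Rewrite the following example so its reasoning is equally "
+                f"clean and relevant:\n{ex}")
+            for ex in noisy_examples
+        ]
+        # (2) Output space: sample diverse reasoning paths and vote.
+        prompt = "\n\n".join(denoised) + f"\n\nQ: {question}\nA:"
+        answers = [extract_answer(llm(prompt)) for _ in range(n_paths)]
+        return Counter(answers).most_common(1)[0][0]
+
+    def extract_answer(text):
+        return text.strip().splitlines()[-1]  # toy answer extraction
+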
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ The Automated Verification of Textual Claims (AVeriTeC) Shared Task + + +
+ The Automated Verification of Textual Claims (AVeriTeC) shared task asks
+participants to retrieve evidence and predict veracity for real-world claims
+checked by fact-checkers. Evidence can be found either via a search engine or
+via a knowledge store provided by the organisers. Submissions are evaluated
+using the AVeriTeC score, which considers a claim to be accurately verified if
+and only if both the verdict is correct and the retrieved evidence is
+considered to meet a certain quality threshold. The shared task received 21
+submissions, 18 of which surpassed our baseline. The winning team was TUDA_MAI
+with an AVeriTeC score of 63%. In this paper, we describe the shared task,
+present the full results, and highlight key takeaways.
+
+
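+
+ The scoring rule as stated above reduces to a simple conjunction per claim.
+A toy sketch follows; the real shared task defines its own evidence-matching
+metric, so the overlap proxy and threshold here are placeholders, not the
+official implementation:
+
+    def evidence_quality(pred_ev, gold_ev):
+        # Toy token-overlap proxy for the task's actual evidence metric.
+        p = set(" ".join(pred_ev).split())
+        g = set(" ".join(gold_ev).split())
+        return len(p & g) / max(1, len(g))
+
+    def averitec_style_score(predictions, gold, threshold=0.25):
+        correct = sum(
+            1 for pred, ref in zip(predictions, gold)
+            if pred["verdict"] == ref["verdict"]
+            and evidence_quality(pred["evidence"], ref["evidence"]) >= threshold
+        )
+        return correct / len(gold)
+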
+
+
+
+
+ + ☆ Commonsense Knowledge Editing Based on Free-Text in LLMs + + +
+ Knowledge editing technology is crucial for maintaining the accuracy and
+timeliness of large language models (LLMs). However, the setting of this task
+overlooks a significant portion of commonsense knowledge based on free-text in
+the real world, characterized by broad knowledge scope, long content, and
+non-instantiation. The editing objects of previous methods (e.g., MEMIT) were
+a single token or entity, which is not suitable for commonsense knowledge in
+free-text form. To address these challenges, we conducted experiments from two
+perspectives: knowledge localization and knowledge editing. First, we
+introduce the Knowledge Localization for Free-Text (KLFT) method, revealing the
+challenges posed by the decentralized distribution of commonsense knowledge
+across MLP and Attention layers. Next, we propose a Dynamics-aware Editing
+Method (DEM), which utilizes a Dynamics-aware Module to locate the parameter
+positions corresponding to commonsense knowledge, and a Knowledge Editing
+Module to update the knowledge. The DEM method fully explores the potential of
+the MLP and Attention layers, and successfully edits commonsense knowledge
+based on free-text. The experimental results indicate that DEM achieves
+excellent editing performance.
+
+
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ Reasons and Solutions for the Decline in Model Performance after Editing + + +
+ Knowledge editing technology has received widespread attention for low-cost
+updates of incorrect or outdated knowledge in large-scale language models.
+However, recent research has found that edited models often exhibit varying
+degrees of performance degradation. The reasons behind this phenomenon and
+potential solutions have not yet been provided. To investigate the performance
+decline of edited models and optimize the editing method, this work explores
+the underlying causes from both data and model perspectives. Specifically,
+1) from a data perspective, to clarify the impact of data on the performance
+of edited models, this paper first constructs a Multi-Question Dataset (MQD)
+to evaluate the impact of different types of editing data on model
+performance. Experiments show that the performance of the edited model is
+mainly affected by the diversity of editing targets and by sequence length.
+2) From a model perspective, this paper explores the factors that affect the
+performance of edited models. The results indicate a strong correlation
+between the L1-norm of the edited model layer and editing accuracy, and
+clarify that this is an important factor behind the editing-performance
+bottleneck. Finally, to improve the performance of the edited model, this
+paper further proposes a Dump for Sequence (D4S) method, which overcomes the
+previous editing bottleneck by reducing the L1-norm of the editing layer,
+allowing users to perform multiple effective edits while minimizing model
+damage. Our code is available at https://github.com/nlpkeg/D4S.
+
+
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ GlotCC: An Open Broad-Coverage CommonCrawl Corpus and Pipeline for + Minority Languages NeurIPS 2024 + + +
+ The need for large text corpora has increased with the advent of pretrained
+language models and, in particular, the discovery of scaling laws for these
+models. Most available corpora have sufficient data only for languages with
+large dominant communities. However, there is no corpus available that (i)
+covers a wide range of minority languages; (ii) is generated by an open-source
+reproducible pipeline; and (iii) is rigorously cleaned of noise, making it
+trustworthy to use. We present GlotCC, a clean, document-level, 2TB
+general-domain corpus derived from CommonCrawl, covering more than 1000
+languages. We make GlotCC and the system used to generate it - including the
+pipeline, language identification model, and filters - available to the
+research community. Corpus v. 1.0
+https://huggingface.co/datasets/cis-lmu/GlotCC-v1, Pipeline v. 3.0
+https://github.com/cisnlp/GlotCC.
+
+
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ What is Wrong with Perplexity for Long-context Language Modeling? + + +
+ Handling long-context inputs is crucial for large language models (LLMs) in +tasks such as extended conversations, document summarization, and many-shot +in-context learning. While recent approaches have extended the context windows +of LLMs and employed perplexity (PPL) as a standard evaluation metric, PPL has +proven unreliable for assessing long-context capabilities. The underlying cause +of this limitation has remained unclear. In this work, we provide a +comprehensive explanation for this issue. We find that PPL overlooks key +tokens, which are essential for long-context understanding, by averaging across +all tokens and thereby obscuring the true performance of models in long-context +scenarios. To address this, we propose \textbf{LongPPL}, a novel metric that +focuses on key tokens by employing a long-short context contrastive method to +identify them. Our experiments demonstrate that LongPPL strongly correlates +with performance on various long-context benchmarks (e.g., Pearson correlation +of -0.96), significantly outperforming traditional PPL in predictive accuracy. +Additionally, we introduce \textbf{LongCE} (Long-context Cross-Entropy) loss, a +re-weighting strategy for fine-tuning that prioritizes key tokens, leading to +consistent improvements across diverse benchmarks. In summary, these +contributions offer deeper insights into the limitations of PPL and present +effective solutions for accurately evaluating and enhancing the long-context +capabilities of LLMs. Code is available at https://github.com/PKU-ML/LongPPL. + +
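+
+ The key-token idea above lends itself to a short sketch: score each target
+token under the full (long) context and under a truncated (short) context,
+treat tokens whose prediction improves markedly with long context as key
+tokens, and compute perplexity only over them. Per-token log-probabilities are
+assumed to be computed elsewhere by some LM; the margin value is illustrative,
+not the paper's calibrated choice:
+
+    import numpy as np
+
+    def long_ppl(long_logps, short_logps, margin=2.0):
+        """long_logps/short_logps: per-token log-probs for the same targets
+        under long vs. truncated context."""
+        gain = long_logps - short_logps
+        key = gain > margin              # tokens that truly need long context
+        if not key.any():
+            return float("nan")
+        return float(np.exp(-long_logps[key].mean()))
+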
+
+
+
+
+ + ☆ The Potential of LLMs in Medical Education: Generating Questions and + Answers for Qualification Exams + + +
+ Recent research on large language models (LLMs) has primarily focused on
+their adaptation and application in specialized domains. The application of
+LLMs in the medical field is mainly concentrated on tasks such as the
+automation of medical report generation, summarization, diagnostic reasoning,
+and question-and-answer interactions between doctors and patients. The
+challenge of becoming a good teacher is more formidable than that of becoming
+a good student, and this study pioneers the application of LLMs in the field
+of medical education. In this work, we investigate the extent to which LLMs
+can generate medical qualification exam questions and corresponding answers
+based on few-shot prompts. Utilizing a real-world Chinese dataset of elderly
+chronic diseases, we tasked eight widely used LLMs, including ERNIE 4,
+ChatGLM 4, Doubao, Hunyuan, Spark 4, Qwen, Llama 3, and Mistral, with
+generating open-ended questions and answers from a subset of sampled admission
+reports. Furthermore, we engaged medical experts to manually evaluate these
+open-ended questions and answers across multiple dimensions. The study found
+that LLMs, after using few-shot prompts, can effectively mimic real-world
+medical qualification exam questions, although there is room for improvement
+in the correctness, evidence-based statements, and professionalism of the
+generated answers. Moreover, LLMs also demonstrate a decent level of ability
+to correct and rectify reference answers. Given the immense potential of
+artificial intelligence in the medical field, the task of generating questions
+and answers for medical qualification exams aimed at medical students, interns
+and residents can be a significant focus of future research.
+
+
+
+
+
+
+ + ☆ DetectRL: Benchmarking LLM-Generated Text Detection in Real-World + Scenarios NeurIPS 2024 + + +
+ Detecting text generated by large language models (LLMs) is of great recent
+interest. With zero-shot methods like DetectGPT, detection capabilities have
+reached impressive levels. However, the reliability of existing detectors in
+real-world applications remains underexplored. In this study, we present a new
+benchmark, DetectRL, highlighting that even state-of-the-art (SOTA) detection
+techniques still underperform on this task. We collected human-written
+datasets from domains where LLMs are particularly prone to misuse. Using
+popular LLMs, we generated data that better aligns with real-world
+applications. Unlike previous studies, we employed heuristic rules to create
+adversarial LLM-generated text, simulating advanced prompt usages, human
+revisions like word substitutions, and writing errors. Our development of
+DetectRL reveals the strengths and limitations of current SOTA detectors. More
+importantly, we analyzed the potential impact of writing styles, model types,
+attack methods, text lengths, and real-world human writing factors on
+different types of detectors. We believe DetectRL could serve as an effective
+benchmark for assessing detectors in real-world scenarios, evolving with
+advanced attack methods, thus providing a more demanding evaluation to drive
+the development of more effective detectors. Data and code are publicly
+available at: https://github.com/NLP2CT/DetectRL.
+
+
+
+ comment: Accepted to NeurIPS 2024 Dataset & Benchmarking Track +
+
+
+
+
+ + ☆ What Happened in LLMs Layers when Trained for Fast vs. Slow Thinking: A + Gradient Perspective + + +
+ What makes a difference in the post-training of LLMs? We investigate the
+training patterns of different layers in large language models (LLMs), through
+the lens of gradient, when training with different responses and initial
+models. We are specifically interested in how fast vs. slow thinking affects
+the layer-wise gradients, given the recent popularity of training LLMs on
+reasoning paths such as chain-of-thoughts (CoT) and process rewards. In our
+study, fast thinking without CoT leads to larger gradients and larger
+differences in gradients across layers than slow thinking (detailed CoT),
+indicating the learning stability brought by the latter. Moreover, pre-trained
+LLMs are less affected by the instability of fast thinking than
+instruction-tuned LLMs. Additionally, we study whether the gradient patterns
+can reflect the correctness of responses when training different LLMs using
+slow vs. fast thinking paths. The results show that the gradients of slow
+thinking can distinguish correct and irrelevant reasoning paths. As a
+comparison, we conduct similar gradient analyses on non-reasoning knowledge
+learning tasks, on which, however, trivially increasing the response length
+does not lead to similar behaviors of slow thinking. Our study strengthens
+fundamental understandings of LLM training and offers novel insights into its
+efficiency and stability, paving the way towards building a generalizable
+System-2 agent. Our code, data, and gradient statistics can be found in:
+https://github.com/MingLiiii/Layer_Gradient.
+
+
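+
+ A minimal PyTorch sketch of the kind of layer-wise measurement involved:
+one backward pass, then per-parameter-group gradient norms that can be
+compared across CoT and non-CoT training responses. This is a generic
+recipe, not the paper's exact instrumentation:
+
+    import torch
+
+    def layerwise_grad_norms(model, loss):
+        model.zero_grad()
+        loss.backward()
+        norms = {}
+        for name, p in model.named_parameters():
+            if p.grad is not None:
+                norms[name] = p.grad.norm().item()
+        # Compare these distributions for fast- vs. slow-thinking batches.
+        return norms
+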
+
+
+
+
+ + ☆ GigaCheck: Detecting LLM-generated Content + + +
+ With the increasing quality and spread of LLM-based assistants, the amount of
+artificially generated content is growing rapidly. In many cases and tasks,
+such texts are already indistinguishable from those written by humans, and the
+quality of generation tends to only increase. At the same time, detection
+methods are developing more slowly, making it challenging to prevent misuse of
+these technologies.
+ In this work, we investigate the task of generated text detection by
+proposing GigaCheck. Our research explores two approaches: (i) distinguishing
+human-written texts from LLM-generated ones, and (ii) detecting LLM-generated
+intervals in Human-Machine collaborative texts. For the first task, our
+approach utilizes a general-purpose LLM, leveraging its extensive language
+abilities to fine-tune efficiently for the downstream task of LLM-generated
+text detection, achieving high performance even with limited data. For the
+second task, we propose a novel approach that combines computer vision and
+natural language processing techniques. Specifically, we use a fine-tuned
+general-purpose LLM in conjunction with a DETR-like detection model, adapted
+from computer vision, to localize artificially generated intervals within
+text.
+ We evaluate GigaCheck on five classification datasets with English texts and
+three datasets designed for Human-Machine collaborative text analysis. Our
+results demonstrate that GigaCheck outperforms previous methods, even in
+out-of-distribution settings, establishing a strong baseline across all
+datasets.
+
+
+
+ comment: 11 pages, 1 figure +
+
+
+
+
+ + ☆ Artificial intelligence to improve clinical coding practice in + Scandinavia: a crossover randomized controlled trial + + +
+ \textbf{Trial design} Crossover randomized controlled trial. \textbf{Methods}
+An AI tool, Easy-ICD, was developed to assist clinical coders and was tested
+for its ability to improve both coding accuracy and time in a user study in
+Norway and Sweden. Participants were randomly assigned to two groups and
+crossed over between coding complex (longer) texts versus simple (shorter)
+texts, while using our tool versus not using it. \textbf{Results} Based on the
+Mann-Whitney U test, the median coding time difference for complex clinical
+text sequences was 123 seconds (\emph{P}\textless.001, 95\% CI: 81 to 164),
+representing a 46\% reduction in median coding time when our tool is used.
+There was no significant time difference for simpler text sequences. For
+coding accuracy, the improvement we noted for both complex and simple texts
+was not significant. \textbf{Conclusions} This study demonstrates the
+potential of AI to transform common tasks in clinical workflows, with
+ostensible positive impacts on work efficiencies for complex clinical coding
+tasks. Further studies within hospital workflows are required before these
+presumed impacts can be more clearly understood.
+
+
+
+ comment: 13 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ OCEAN: Offline Chain-of-thought Evaluation and Alignment in Large + Language Models + + +
+ Offline evaluation of LLMs is crucial in understanding their capacities,
+though current methods remain underexplored in existing research. In this
+work, we focus on the offline evaluation of chain-of-thought capabilities and
+show how to optimize LLMs based on the proposed evaluation method. To enable
+offline feedback with rich knowledge and reasoning paths, we use knowledge
+graphs (e.g., Wikidata5m) to provide feedback on the generated chain of
+thoughts. Due to the heterogeneity between LLM reasoning and KG structures,
+direct interaction and feedback from KGs on LLM behavior are challenging, as
+they require accurate entity linking and grounding of LLM-generated chains of
+thought in the KG. To address the above challenge, we propose an offline
+chain-of-thought evaluation framework, OCEAN, which models chain-of-thought
+reasoning in LLMs as an MDP and evaluates the policy's alignment with KG
+preference modeling. To overcome the reasoning heterogeneity and grounding
+problems, we leverage on-policy KG exploration and RL to model a KG policy
+that generates token-level likelihood distributions for LLM-generated
+chain-of-thought reasoning paths, simulating KG reasoning preference. Then we
+incorporate the knowledge-graph feedback on the validity and alignment of the
+generated reasoning paths into inverse propensity scores and propose the
+KG-IPS estimator. Theoretically, we prove the unbiasedness of the proposed
+KG-IPS estimator and provide a lower bound on its variance. With the
+off-policy evaluated value function, we can directly enable off-policy
+optimization to further enhance chain-of-thought alignment. Our empirical
+study shows that OCEAN can be efficiently optimized for generating
+chain-of-thought reasoning paths with higher estimated values without
+affecting LLMs' general abilities in downstream tasks or their internal
+knowledge.
+
+
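+
+ For readers unfamiliar with inverse propensity scoring, the statistical
+backbone of estimators like KG-IPS, here is a generic sketch of off-policy
+value estimation from logged data; OCEAN's KG-feedback and variance details
+are omitted, and the clipping constant is an illustrative assumption:
+
+    import numpy as np
+
+    def ips_estimate(rewards, target_probs, behavior_probs, clip=10.0):
+        """rewards: r_i observed for actions drawn from the behavior policy;
+        probabilities are for the logged actions under each policy."""
+        w = np.clip(target_probs / behavior_probs, 0.0, clip)  # propensity ratios
+        return float(np.mean(w * rewards))
+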
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Instruction-Tuning Llama-3-8B Excels in City-Scale Mobility Prediction + + +
+ Human mobility prediction plays a critical role in applications such as
+disaster response, urban planning, and epidemic forecasting. Traditional
+methods often rely on hand-crafted, domain-specific models and typically focus
+on short-term predictions, which struggle to generalize across diverse urban
+environments. In this study, we introduce Llama-3-8B-Mob, a large language
+model fine-tuned with instruction tuning, for long-term citywide mobility
+prediction -- in a Q&A manner. We validate our approach using large-scale
+human mobility data from four metropolitan areas in Japan, focusing on
+predicting individual trajectories over the next 15 days. The results
+demonstrate that Llama-3-8B-Mob excels in modeling long-term human mobility --
+surpassing the state-of-the-art on multiple prediction metrics. It also
+displays strong zero-shot generalization capabilities -- effectively
+generalizing to other cities even when fine-tuned only on limited samples from
+a single city. Source code is available at
+https://github.com/TANGHULU6/Llama3-8B-Mob.
+
+
+
+
+
+
+ + ☆ Improbable Bigrams Expose Vulnerabilities of Incomplete Tokens in + Byte-Level Tokenizers + + +
+ Tokenization is a crucial step that bridges human-readable text with +model-readable discrete tokens. However, recent studies have revealed that +tokenizers can be exploited to elicit unwanted model behaviors. In this work, +we investigate incomplete tokens, i.e., undecodable tokens with stray bytes +resulting from byte-level byte-pair encoding (BPE) tokenization. We hypothesize +that such tokens are heavily reliant on their adjacent tokens and are fragile +when paired with unfamiliar tokens. To demonstrate this vulnerability, we +introduce improbable bigrams: out-of-distribution combinations of incomplete +tokens designed to exploit their dependency. Our experiments show that +improbable bigrams are significantly prone to hallucinatory behaviors. +Surprisingly, alternative tokenizations of the same phrases result in +drastically lower rates of hallucination (93% reduction in Llama3.1). We +caution against the potential vulnerabilities introduced by byte-level BPE +tokenizers, which may impede the development of trustworthy language models. + +
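+
+ The notion of an "incomplete" token above can be made concrete: in a
+byte-level BPE vocabulary, these are tokens whose raw bytes are not valid
+UTF-8 on their own. A small sketch using the tiktoken library (any byte-level
+BPE tokenizer would work similarly; this only enumerates such tokens, not the
+paper's bigram construction):
+
+    import tiktoken
+
+    def incomplete_tokens(encoding_name="cl100k_base"):
+        enc = tiktoken.get_encoding(encoding_name)
+        bad = []
+        for tok in range(enc.n_vocab):
+            try:
+                raw = enc.decode_single_token_bytes(tok)
+            except Exception:      # special/unmapped ids in the vocab
+                continue
+            try:
+                raw.decode("utf-8")
+            except UnicodeDecodeError:
+                bad.append(tok)    # undecodable alone: relies on neighbors
+        return bad
+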
+
+
+
+
+ + ♻ ☆ Word Definitions from Large Language Models + + +
+ Dictionary definitions are historically the arbiter of what words mean, but
+this primacy has come under threat from recent progress in NLP, including word
+embeddings and generative models like ChatGPT. We present an exploratory study
+of the degree of alignment between word definitions from classical
+dictionaries and these newer computational artifacts. Specifically, we compare
+definitions from three published dictionaries to those generated from variants
+of ChatGPT. We show that (i) definitions from different traditional
+dictionaries exhibit more surface form similarity than do model-generated
+definitions, (ii) ChatGPT definitions are highly accurate, comparable to
+traditional dictionaries, and (iii) ChatGPT-based embedding definitions retain
+their accuracy even on low-frequency words, much better than GloVe and
+FastText word embeddings.
+
+
+
+
+
+
+ + ♻ ☆ SemCoder: Training Code Language Models with Comprehensive Semantics + Reasoning NeurIPS 2024 + + +
+ Code Large Language Models (Code LLMs) have excelled at tasks like code +completion but often miss deeper semantics such as execution effects and +dynamic states. This paper aims to bridge the gap between Code LLMs' reliance +on static text data and the need for semantic understanding for complex tasks +like debugging and program repair. We introduce a novel strategy, monologue +reasoning, to train Code LLMs to reason comprehensive semantics, encompassing +high-level functional descriptions, local execution effects of individual +statements, and overall input/output behavior, thereby linking static code text +with dynamic execution states. We begin by collecting PyX, a clean Python +corpus of fully executable code samples with functional descriptions and test +cases. We propose training Code LLMs not only to write code but also to +understand code semantics by reasoning about key properties, constraints, and +execution behaviors using natural language, mimicking human verbal debugging, +i.e., rubber-duck debugging. This approach led to the development of SemCoder, +a Code LLM with only 6.7B parameters, which shows competitive performance with +GPT-3.5-turbo on code generation and execution reasoning tasks. SemCoder +achieves 79.3% on HumanEval (GPT-3.5-turbo: 76.8%), 63.6% on CRUXEval-I +(GPT-3.5-turbo: 50.3%), and 63.9% on CRUXEval-O (GPT-3.5-turbo: 59.0%). We also +study the effectiveness of SemCoder's monologue-style execution reasoning +compared to concrete scratchpad reasoning, showing that our approach integrates +semantics from multiple dimensions more smoothly. Finally, we demonstrate the +potential of applying learned semantics to improve Code LLMs' debugging and +self-refining capabilities. Our data, code, and models are available at: +https://github.com/ARiSE-Lab/SemCoder. + +
+
+ comment: NeurIPS 2024 Camera-ready +
+
+
+
+
+ + ♻ ☆ RACCooN: A Versatile Instructional Video Editing Framework with + Auto-Generated Narratives + + +
+ Recent video generative models primarily rely on carefully written text
+prompts for specific tasks, like inpainting or style editing. They require
+labor-intensive textual descriptions for input videos, hindering their
+flexibility to adapt personal/raw videos to user specifications. This paper
+proposes RACCooN, a versatile and user-friendly video-to-paragraph-to-video
+generative framework that supports multiple video editing capabilities such as
+removal, addition, and modification, through a unified pipeline. RACCooN
+consists of two principal stages: Video-to-Paragraph (V2P) and
+Paragraph-to-Video (P2V). In the V2P stage, we automatically describe video
+scenes in well-structured natural language, capturing both the holistic
+context and focused object details. Subsequently, in the P2V stage, users can
+optionally refine these descriptions to guide the video diffusion model,
+enabling various modifications to the input video, such as removing, changing
+subjects, and/or adding new objects. The proposed approach stands out from
+other methods through several significant contributions: (1) RACCooN suggests
+a multi-granular spatiotemporal pooling strategy to generate well-structured
+video descriptions, capturing both the broad context and object details
+without requiring complex human annotations, simplifying precise video content
+editing based on text for users. (2) Our video generative model incorporates
+auto-generated narratives or instructions to enhance the quality and accuracy
+of the generated content. (3) RACCooN can also plan the addition of imagined
+new objects in a given video, so users can simply prompt the model to receive
+a detailed editing plan for complex video edits. The proposed framework
+demonstrates impressively versatile capabilities in video-to-paragraph
+generation and video content editing, and can be incorporated into other SoTA
+video generative models for further enhancement.
+
+
+
+ comment: The first two authors contribute equally. Project Page: + https://raccoon-mllm-gen.github.io/ +
+
+
+
+
+ + ♻ ☆ Judging the Judges: A Systematic Investigation of Position Bias in + Pairwise Comparative Assessments by LLMs + + +
+ LLM-as-a-Judge presents a promising alternative to human evaluators across
+various tasks, but inherent biases, especially position bias - a tendency to
+favor solutions based on their position in the prompt - have compromised its
+effectiveness. Our study introduces a systematic framework to examine position
+bias in pairwise comparisons, focusing on repetition stability, position
+consistency, and preference fairness. This research significantly contributes
+to the field by introducing new concepts for understanding position bias and
+providing a multi-dimensional framework for evaluations. We conducted
+experiments with 12 LLM judges across MTBench and DevBench, covering 22 tasks
+and approximately 40 candidate solution-generating models, resulting in over
+100,000 evaluation instances. Our findings confirm that position bias in
+capable LLM judges is not due to random chance, with notable variations
+observed across judges and tasks. Moreover, position bias is weakly influenced
+by the length of prompt components but significantly impacted by the quality
+gap between solutions. These insights can help optimize judge model selections,
+improve benchmark design, and inform future research on debiasing strategies,
+ultimately enhancing the reliability of LLM judges.
+
+
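+
+ Position consistency, as described above, can be checked with a simple
+order-swap: ask the judge twice with the candidates exchanged and test whether
+the preferred solution (rather than the preferred position) stays the same.
+The `judge` callable returning "A" or "B" is an assumed stand-in:
+
+    def position_consistency(pairs, judge):
+        consistent = 0
+        for sol1, sol2 in pairs:
+            first = judge(candidate_a=sol1, candidate_b=sol2)
+            second = judge(candidate_a=sol2, candidate_b=sol1)
+            # Same winner iff the chosen letter flips when the order flips.
+            if (first, second) in [("A", "B"), ("B", "A")]:
+                consistent += 1
+        return consistent / len(pairs)
+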
+
+
+
+
+ + ♻ ☆ Reasoning and Tools for Human-Level Forecasting + + +
+ Language models (LMs) trained on web-scale datasets are largely successful +due to their ability to memorize large amounts of training data, even if only +present in a few examples. These capabilities are often desirable in evaluation +on tasks such as question answering but raise questions about whether these +models can exhibit genuine reasoning or succeed only at mimicking patterns from +the training data. This distinction is particularly salient in forecasting +tasks, where the answer is not present in the training data, and the model must +reason to make logical deductions. We present Reasoning and Tools for +Forecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can +dynamically retrieve updated information and run numerical simulation with +equipped tools. We evaluate our model with questions from competitive +forecasting platforms and demonstrate that our method is competitive with and +can outperform human predictions. This suggests that LMs, with the right tools, +can indeed think and adapt like humans, offering valuable insights for +real-world decision-making. + +
+
+
+
+
+ + ♻ ☆ Elliptical Attention NeurIPS 2024 + + +
+ Pairwise dot-product self-attention is key to the success of transformers +that achieve state-of-the-art performance across a variety of applications in +language and vision. This dot-product self-attention computes attention weights +among the input tokens using Euclidean distance, which makes the model prone to +representation collapse and vulnerable to contaminated samples. In this paper, +we propose using a Mahalanobis distance metric for computing the attention +weights to stretch the underlying feature space in directions of high +contextual relevance. In particular, we define a hyper-ellipsoidal neighborhood +around each query to increase the attention weights of the tokens lying in the +contextually important directions. We term this novel class of attention +Elliptical Attention. Our Elliptical Attention provides two benefits: 1) +reducing representation collapse and 2) enhancing the model's robustness as +Elliptical Attention pays more attention to contextually relevant information +rather than focusing on some small subset of informative features. We +empirically demonstrate the advantages of Elliptical Attention over the +baseline dot-product attention and state-of-the-art attention methods on +various practical tasks, including object classification, image segmentation, +and language modeling across different data modalities. + +
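+
+ A generic numpy sketch of distance-based attention under a (diagonal)
+Mahalanobis metric, to make the geometry concrete: a per-coordinate weight
+vector m stretches the space so tokens along high-relevance directions get
+more attention. The paper's actual estimator for the metric is not reproduced
+here; m is supplied by the caller:
+
+    import numpy as np
+
+    def mahalanobis_attention(Q, K, V, m):
+        """Q, K, V: (n, d); m: (d,) nonnegative coordinate weights."""
+        Qw, Kw = Q * np.sqrt(m), K * np.sqrt(m)       # stretch the space
+        d2 = ((Qw[:, None, :] - Kw[None, :, :]) ** 2).sum(-1)  # Mahalanobis^2
+        scores = -d2 / np.sqrt(Q.shape[-1])
+        w = np.exp(scores - scores.max(-1, keepdims=True))
+        w /= w.sum(-1, keepdims=True)                 # softmax over keys
+        return w @ V
+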
+
+ comment: 10 pages in the main text. Published at NeurIPS 2024. The code is + available at https://github.com/stefvk/Elliptical-Attention +
+
+
+
+
+ + ♻ ☆ End-to-end streaming model for low-latency speech anonymization + + +
+ Speaker anonymization aims to conceal cues to speaker identity while
+preserving linguistic content. Current machine learning-based approaches
+require substantial computational resources, hindering real-time streaming
+applications. To address these concerns, we propose a streaming model that
+achieves speaker anonymization with low latency. The system is trained in an
+end-to-end autoencoder fashion using a lightweight content encoder that
+extracts HuBERT-like information, a pretrained speaker encoder that extracts
+speaker identity, and a variance encoder that injects pitch and energy
+information. These three disentangled representations are fed to a decoder
+that re-synthesizes the speech signal. We present evaluation results from two
+implementations of our system, a full model that achieves a latency of 230ms,
+and a lite version (0.1x in size) that further reduces latency to 66ms while
+maintaining state-of-the-art performance in naturalness, intelligibility, and
+privacy preservation.
+
+
+
+
+
+
+ + ♻ ☆ CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale + + +
+ Measuring biodiversity is crucial for understanding ecosystem health. While +prior works have developed machine learning models for taxonomic classification +of photographic images and DNA separately, in this work, we introduce a +multimodal approach combining both, using CLIP-style contrastive learning to +align images, barcode DNA, and text-based representations of taxonomic labels +in a unified embedding space. This allows for accurate classification of both +known and unknown insect species without task-specific fine-tuning, leveraging +contrastive learning for the first time to fuse DNA and image data. Our method +surpasses previous single-modality approaches in accuracy by over 8% on +zero-shot learning tasks, showcasing its effectiveness in biodiversity studies. + +
+
+ comment: 25 pages with 11 figures +
+
+
+
+
+ + ♻ ☆ SparseLLM: Towards Global Pruning for Pre-trained Language Models NeurIPS 2024 + + +
+ The transformative impact of large language models (LLMs) like LLaMA and GPT +on natural language processing is countered by their prohibitive computational +demands. Pruning has emerged as a pivotal compression strategy, introducing +sparsity to enhance both memory and computational efficiency. Yet, traditional +global pruning is impractical for LLMs due to scalability issues, while local +pruning, despite its efficiency, leads to suboptimal solutions. Addressing +these challenges, we propose SparseLLM, a novel framework that redefines the +global pruning process into manageable, coordinated subproblems, allowing for +resource-efficient optimization with global optimality. SparseLLM's approach, +which conceptualizes LLMs as a chain of modular functions and leverages +auxiliary variables for problem decomposition, not only facilitates a pragmatic +application on LLMs but also demonstrates significant performance improvements, +particularly in high-sparsity regimes where it surpasses current +state-of-the-art methods. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Parallelizing Linear Transformers with the Delta Rule over Sequence + Length NeurIPS 2024 + + +
+ Transformers with linear attention (i.e., linear transformers) and +state-space models have recently been suggested as a viable linear-time +alternative to transformers with softmax attention. However, these models still +underperform transformers especially on tasks that require in-context +retrieval. While more expressive variants of linear transformers which replace +the additive update in linear transformers with the delta rule (DeltaNet) have +been found to be more effective at associative recall, existing algorithms for +training such models do not parallelize over sequence length and are thus +inefficient to train on modern hardware. This work describes a +hardware-efficient algorithm for training linear transformers with the delta +rule, which exploits a memory-efficient representation for computing products +of Householder matrices. This algorithm allows us to scale up DeltaNet to +standard language modeling settings. We train a 1.3B model for 100B tokens and +find that it outperforms recent linear-time baselines such as Mamba and GLA in +terms of perplexity and zero-shot performance on downstream tasks. We also +experiment with two hybrid models which combine DeltaNet layers with (1) +sliding-window attention layers every other layer or (2) two global attention +layers, and find that these hybrids outperform strong transformer baselines. + +
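+
+ For reference, the delta-rule recurrence mentioned above in its naive
+sequential form (the paper's contribution is a hardware-efficient algorithm
+that parallelizes this same update over sequence length; the loop below is
+just the reference semantics):
+
+    import torch
+
+    def deltanet_naive(q, k, v, beta):
+        """q, k, v: (T, d) with k rows unit-normalized; beta: (T,) in [0, 1]."""
+        T, d = q.shape
+        S = torch.zeros(d, d)
+        out = torch.empty_like(v)
+        for t in range(T):
+            # Delta rule: replace the value currently stored under key k_t.
+            S = S + beta[t] * torch.outer(v[t] - S @ k[t], k[t])
+            out[t] = S @ q[t]
+        return out
+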
+
+ comment: NeurIPS 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Evaluating LLMs on Entity Disambiguation in Tables + + +
+ Tables are crucial containers of information, but understanding their meaning
+may be challenging. Over the years, there has been a surge in interest in
+data-driven approaches based on deep learning that have increasingly been
+combined with heuristic-based ones. More recently, the advent of Large
+Language Models (LLMs) has led to a new category of approaches for table
+annotation. However, these approaches have not been consistently evaluated on
+a common ground, making evaluation and comparison difficult. This work
+proposes an extensive evaluation of four STI SOTA approaches: Alligator
+(formerly s-elbat), Dagobah, TURL, and TableLlama; the first two belong to the
+family of heuristic-based algorithms, while the others are encoder-only and
+decoder-only LLMs, respectively. We also include in the evaluation both GPT-4o
+and GPT-4o-mini, since they excel in various public benchmarks. The primary
+objective is to measure the ability of these approaches to solve the entity
+disambiguation task with respect to both the performance achieved on a
+common-ground evaluation setting and the computational and cost requirements
+involved, with the ultimate aim of charting new research paths in the field.
+
+
+
+ comment: 13 pages, 6 figures; fixed avg. accuracy-over-price plot for GPT + families, fixed typos in table referencing, added evaluation and inference + subsubsection +
+
+
+
+
+ + ♻ ☆ FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven + Question Answering Pipeline + + +
+ Financial decision-making hinges on the analysis of relevant information +embedded in the enormous volume of documents in the financial domain. To +address this challenge, we developed FinQAPT, an end-to-end pipeline that +streamlines the identification of relevant financial reports based on a query, +extracts pertinent context, and leverages Large Language Models (LLMs) to +perform downstream tasks. To evaluate the pipeline, we experimented with +various techniques to optimize the performance of each module using the FinQA +dataset. We introduced a novel clustering-based negative sampling technique to +enhance context extraction and a novel prompting method called Dynamic N-shot +Prompting to boost the numerical question-answering capabilities of LLMs. At +the module level, we achieved state-of-the-art accuracy on FinQA, attaining an +accuracy of 80.6%. However, at the pipeline level, we observed decreased +performance due to challenges in extracting relevant context from financial +reports. We conducted a detailed error analysis of each module and the +end-to-end pipeline, pinpointing specific challenges that must be addressed to +develop a robust solution for handling complex financial tasks. + +
+
+ comment: Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Fast Best-of-N Decoding via Speculative Rejection NeurIPS 2024 + + +
+ The safe and effective deployment of Large Language Models (LLMs) involves a
+critical step called alignment, which ensures that the model's responses are
+in accordance with human preferences. Prevalent alignment techniques, such as
+DPO, PPO, and their variants, align LLMs by changing the pre-trained model
+weights during a phase called post-training. While predominant, these
+post-training methods add substantial complexity before LLMs can be deployed.
+Inference-time alignment methods avoid the complex post-training step and
+instead bias the generation towards responses that are aligned with human
+preferences. The best-known inference-time alignment method, called Best-of-N,
+is as effective as the state-of-the-art post-training procedures.
+Unfortunately, Best-of-N requires vastly more resources at inference time than
+standard decoding strategies, which makes it computationally impractical. In
+this work, we introduce Speculative Rejection, a computationally viable
+inference-time alignment algorithm. It generates high-scoring responses
+according to a given reward model, like Best-of-N does, while being 16 to 32
+times more computationally efficient.
+
+
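+
+ Conceptually, speculative rejection amounts to Best-of-N with early pruning:
+decode all candidates a chunk at a time, score the partial responses with the
+reward model, and continue only the most promising ones. A hedged sketch, with
+`extend` and `reward` as assumed stand-ins for the decoder and reward model
+(not the authors' implementation):
+
+    def speculative_rejection(prompt, extend, reward, n=32, keep_frac=0.5,
+                              chunk_tokens=64, max_chunks=8):
+        candidates = [""] * n
+        for _ in range(max_chunks):
+            # Decode one more chunk for every surviving candidate.
+            candidates = [extend(prompt, c, chunk_tokens) for c in candidates]
+            scores = [reward(prompt, c) for c in candidates]
+            keep = max(1, int(len(candidates) * keep_frac))
+            ranked = sorted(zip(scores, candidates), key=lambda x: -x[0])
+            candidates = [c for _, c in ranked[:keep]]  # reject the rest early
+            if keep == 1:
+                break
+        return candidates[0]
+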
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Probing Language Models on Their Knowledge Source EMNLP2024 + + +
+ Large Language Models (LLMs) often encounter conflicts between their learned,
+internal (parametric knowledge, PK) and external knowledge provided during
+inference (contextual knowledge, CK). Understanding how LLMs prioritize one
+knowledge source over the other remains a challenge. In this paper, we propose
+a novel probing framework to explore the mechanisms governing the selection
+between PK and CK in LLMs. Using controlled prompts designed to contradict the
+model's PK, we demonstrate that specific model activations are indicative of
+the knowledge source employed. We evaluate this framework on various LLMs of
+different sizes and demonstrate that mid-layer activations, particularly those
+related to relations in the input, are crucial in predicting knowledge source
+selection, paving the way for more reliable models capable of handling
+knowledge conflicts effectively.
+
+
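+
+ A minimal sketch of this style of probe: a linear classifier over hidden
+states from a chosen (mid) layer predicts whether the model answered from PK
+or CK. Activation extraction is assumed to happen elsewhere, and the split and
+hyperparameters are illustrative, not the paper's setup:
+
+    import numpy as np
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.model_selection import train_test_split
+
+    def fit_knowledge_probe(activations, labels):
+        """activations: (n, d) mid-layer states; labels: 1 = used CK, 0 = PK."""
+        Xtr, Xte, ytr, yte = train_test_split(
+            activations, labels, test_size=0.2, random_state=0)
+        probe = LogisticRegression(max_iter=2000).fit(Xtr, ytr)
+        # Held-out accuracy approximates how decodable the source is.
+        return probe, probe.score(Xte, yte)
+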
+
+ comment: Accepted at BlackBoxNLP@EMNLP2024 +
+
+
+
+
+ + ♻ ☆ Multi-Object Hallucination in Vision-Language Models NeurIPS 2024 + + +
+ Large vision language models (LVLMs) often suffer from object hallucination,
+producing objects not present in the given images. While current benchmarks
+for object hallucination primarily concentrate on the presence of a single
+object class rather than individual entities, this work systematically
+investigates multi-object hallucination, examining how models misperceive
+(e.g., invent nonexistent objects or become distracted) when tasked with
+focusing on multiple objects simultaneously. We introduce Recognition-based
+Object Probing Evaluation (ROPE), an automated evaluation protocol that
+considers the distribution of object classes within a single image during
+testing and uses visual referring prompts to eliminate ambiguity. With
+comprehensive empirical studies and analysis of potential factors leading to
+multi-object hallucination, we found that (1) LVLMs suffer more hallucinations
+when focusing on multiple objects than on a single object; (2) the tested
+object class distribution affects hallucination behaviors, indicating that
+LVLMs may follow shortcuts and spurious correlations; and (3) hallucinatory
+behaviors are influenced by data-specific factors, salience and frequency, and
+model-intrinsic behaviors. We hope to enable LVLMs to recognize and reason
+about multiple objects that often occur in realistic visual scenes, provide
+insights, and quantify our progress towards mitigating the issues.
+
+
+
+ comment: Accepted to NeurIPS 2024 | Project page: + https://multi-object-hallucination.github.io/ +
+
+
+
+
+ + ♻ ☆ SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video + Generation + + +
+ Human beings are endowed with a complementary learning system, which bridges +the slow learning of general world dynamics with fast storage of episodic +memory from a new experience. Previous video generation models, however, +primarily focus on slow learning by pre-training on vast amounts of data, +overlooking the fast learning phase crucial for episodic memory storage. This +oversight leads to inconsistencies across temporally distant frames when +generating longer videos, as these frames fall beyond the model's context +window. To this end, we introduce SlowFast-VGen, a novel dual-speed learning +system for action-driven long video generation. Our approach incorporates a +masked conditional video diffusion model for the slow learning of world +dynamics, alongside an inference-time fast learning strategy based on a +temporal LoRA module. Specifically, the fast learning process updates its +temporal LoRA parameters based on local inputs and outputs, thereby efficiently +storing episodic memory in its parameters. We further propose a slow-fast +learning loop algorithm that seamlessly integrates the inner fast learning loop +into the outer slow learning loop, enabling the recall of prior multi-episode +experiences for context-aware skill learning. To facilitate the slow learning +of an approximate world model, we collect a large-scale dataset of 200k videos +with language action annotations, covering a wide range of scenarios. Extensive +experiments show that SlowFast-VGen outperforms baselines across various +metrics for action-driven video generation, achieving an FVD score of 514 +compared to 782, and maintaining consistency in longer videos, with an average +of 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm +significantly enhances performances on long-horizon planning tasks as well. +Project Website: https://slowfast-vgen.github.io + +
+
+
+
+
+ + ♻ ☆ Models Can and Should Embrace the Communicative Nature of + Human-Generated Math + + +
+ Math is constructed by people for people: just as natural language corpora +reflect not just propositions but the communicative goals of language users, +the math data that models are trained on reflects not just idealized +mathematical entities but rich communicative intentions. While there are +important advantages to treating math in a purely symbolic manner, we here +hypothesize that there are benefits to treating math as situated linguistic +communication and that language models are well suited for this goal, in ways +that are not fully appreciated. We illustrate these points with two case +studies. First, we ran an experiment in which we found that language models +interpret the equals sign in a humanlike way -- generating systematically +different word problems for the same underlying equation arranged in different +ways. Second, we found that language models prefer proofs to be ordered in +naturalistic ways, even though other orders would be logically equivalent. We +advocate for AI systems that learn from and represent the communicative +intentions latent in human-generated math. + +
+
+
+
+
+ + ♻ ☆ Do they mean 'us'? Interpreting Referring Expressions in Intergroup Bias EMNLP 2024 + + +
+ The variations between in-group and out-group speech (intergroup bias) are +subtle and could underlie many social phenomena like stereotype perpetuation +and implicit bias. In this paper, we model the intergroup bias as a tagging +task on English sports comments from forums dedicated to fandom for NFL teams. +We curate a unique dataset of over 6 million game-time comments from opposing +perspectives (the teams in the game), each comment grounded in a non-linguistic +description of the events that precipitated these comments (live win +probabilities for each team). Expert and crowd annotations justify modeling the +bias through tagging of implicit and explicit referring expressions and reveal +the rich, contextual understanding of language and the world required for this +task. For large-scale analysis of intergroup variation, we use LLMs for +automated tagging, and discover that some LLMs perform best when prompted with +linguistic descriptions of the win probability at the time of the comment, +rather than numerical probability. Further, large-scale tagging of comments +using LLMs uncovers linear variations in the form of referent across win +probabilities that distinguish in-group and out-group utterances. Code and data +are available at https://github.com/venkatasg/intergroup-nfl . + +
+
+ comment: Accepted to Findings@EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Implicit Optimization Bias of Next-Token Prediction in Linear Models + + +
+ We initiate an investigation into the optimization properties of next-token +prediction (NTP), the dominant training paradigm for modern language models. +Specifically, we study the structural properties of the solutions selected by +gradient-based optimizers among the many possible minimizers of the NTP +objective. By framing NTP as cross-entropy minimization across distinct +contexts, each tied with a sparse conditional probability distribution across a +finite vocabulary of tokens, we introduce "NTP-separability conditions" that +enable reaching the data-entropy lower bound. With this setup, and focusing on +linear models with fixed context embeddings, we characterize the optimization +bias of gradient descent (GD): Within the data subspace defined by the sparsity +patterns of distinct contexts, GD selects parameters that equate the logits' +differences of in-support tokens to their log-odds. In the orthogonal subspace, +the GD parameters diverge in norm and select the direction that maximizes a +margin specific to NTP. These findings extend previous research on implicit +bias in one-hot classification to the NTP setting, highlighting key differences +and prompting further research into the optimization and generalization +properties of NTP, irrespective of the specific architecture used to generate +the context embeddings. + +
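+ The in-support characterization can be checked numerically with a toy example (not from the paper): run gradient descent on the soft-label cross-entropy for a single context and watch the in-support logit gaps approach the log-odds while the out-of-support logit diverges.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+V, d = 4, 8
+h = rng.normal(size=d)              # fixed context embedding
+W = np.zeros((V, d))                # linear model: logits z = W @ h
+p = np.array([0.6, 0.3, 0.1, 0.0])  # token 3 is out of support
+
+for _ in range(20000):
+    z = W @ h
+    q = np.exp(z - z.max()); q /= q.sum()
+    W -= 0.1 * np.outer(q - p, h)   # GD on the cross-entropy E_p[-log q]
+
+z = W @ h
+print(z[0] - z[1], np.log(p[0] / p[1]))  # in-support gap ~ log-odds
+# z[3] keeps drifting downward, i.e. the parameters diverge in norm.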
+
+ comment: v2: fixed typos and writing in various parts; updated figures and + future-work section +
+
+
+
+
+ + ♻ ☆ Benchmarking LLMs via Uncertainty Quantification NeurIPS 2024 + + +
+ The proliferation of open-source Large Language Models (LLMs) from various +institutions has highlighted the urgent need for comprehensive evaluation +methods. However, current evaluation platforms, such as the widely recognized +HuggingFace open LLM leaderboard, neglect a crucial aspect -- uncertainty, +which is vital for thoroughly assessing LLMs. To bridge this gap, we introduce +a new benchmarking approach for LLMs that integrates uncertainty +quantification. Our examination involves nine LLMs (LLM series) spanning five +representative natural language processing tasks. Our findings reveal that: I) +LLMs with higher accuracy may exhibit lower certainty; II) Larger-scale LLMs +may display greater uncertainty compared to their smaller counterparts; and +III) Instruction-finetuning tends to increase the uncertainty of LLMs. These +results underscore the significance of incorporating uncertainty in the +evaluation of LLMs. + +
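+ The abstract does not spell out the quantification method, so the sketch below shows only a generic sample-based uncertainty proxy: the entropy of repeated sampled answers to one prompt.
+
+import math
+from collections import Counter
+
+def answer_entropy(sampled_answers):
+    # Higher entropy across repeated samples = a less certain model.
+    counts = Counter(sampled_answers)
+    n = len(sampled_answers)
+    return -sum((c / n) * math.log(c / n) for c in counts.values())
+
+print(answer_entropy(["A", "A", "B", "A"]))  # ~0.562 nats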
+
+ comment: 30 pages, accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Measuring Sound Symbolism in Audio-visual Models + + +
+ Audio-visual pre-trained models have gained substantial attention recently +and demonstrated superior performance on various audio-visual tasks. This study +investigates whether pre-trained audio-visual models demonstrate non-arbitrary +associations between sounds and visual representations (known as sound +symbolism), an effect also observed in humans. We developed +a specialized dataset with synthesized images and audio samples and assessed +these models using a non-parametric approach in a zero-shot setting. Our +findings reveal a significant correlation between the models' outputs and +established patterns of sound symbolism, particularly in models trained on +speech data. These results suggest that such models can capture sound-meaning +connections akin to human language processing, providing insights into both +cognitive architectures and machine learning strategies. + +
+
+ comment: Errors in the introduction that might affect the integrity of the + paper. Withdrawn at this point; will be replaced with an updated version in + the future +
+
+
+
+
+ + ♻ ☆ BERTs are Generative In-Context Learners NeurIPS 2024 + + +
+ While in-context learning is commonly associated with causal language models, +such as GPT, we demonstrate that this capability also 'emerges' in masked +language models. Through an embarrassingly simple inference technique, we +enable an existing masked model, DeBERTa, to perform generative tasks without +additional training or architectural changes. Our evaluation reveals that +masked and causal language models behave very differently, as they clearly +outperform each other on different categories of tasks. These complementary +strengths suggest that the field's focus on causal models for in-context +learning may be limiting: both architectures can develop these capabilities, +but with distinct advantages, pointing toward promising hybrid approaches that +combine the strengths of both objectives. + +
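+ One embarrassingly simple way to make a masked LM generate (a sketch under assumptions; the paper's exact inference recipe may differ) is to repeatedly append a mask token, predict it, and commit the argmax:
+
+import torch
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+name = "microsoft/deberta-v3-base"  # any masked-LM checkpoint
+tok = AutoTokenizer.from_pretrained(name)
+model = AutoModelForMaskedLM.from_pretrained(name).eval()
+
+text = "The capital of France is"
+for _ in range(5):
+    ids = tok(text + " " + tok.mask_token, return_tensors="pt")
+    with torch.no_grad():
+        logits = model(**ids).logits
+    pos = (ids.input_ids[0] == tok.mask_token_id).nonzero()[0].item()
+    text += tok.decode([int(logits[0, pos].argmax())])
+print(text)  # greedy, one token per step; spacing artifacts are possible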
+
+ comment: 26 pages, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Revealing Fine-Grained Values and Opinions in Large Language Models EMNLP 2024 + + +
+ Uncovering latent values and opinions embedded in large language models +(LLMs) can help identify biases and mitigate potential harm. Recently, this has +been approached by prompting LLMs with survey questions and quantifying the +stances in the outputs towards morally and politically charged statements. +However, the stances generated by LLMs can vary greatly depending on how they +are prompted, and there are many ways to argue for or against a given position. +In this work, we propose to address this by analysing a large and robust +dataset of 156k LLM responses to the 62 propositions of the Political Compass +Test (PCT) generated by 6 LLMs using 420 prompt variations. We perform +coarse-grained analysis of their generated stances and fine-grained analysis of +the plain text justifications for those stances. For fine-grained analysis, we +propose to identify tropes in the responses: semantically similar phrases that +are recurrent and consistent across different prompts, revealing natural +patterns in the text that a given LLM is prone to produce. We find that +demographic features added to prompts significantly affect outcomes on the PCT, +reflecting bias, and that results differ between closed-form and open-domain +elicitation. Additionally, tropes in the plain-text rationales show that +similar justifications are repeatedly generated across models and prompts even +with disparate stances. + +
+
+ comment: Findings of EMNLP 2024; 28 pages, 20 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Mathematical Formalized Problem Solving and Theorem Proving in Different + Fields in Lean 4 + + +
+ Using computer-verifiable formal languages like Lean 4 to prove +mathematical theorems has a significant impact on mathematical formalization. +Lean 4 offers prominent potential for advancing mathematical reasoning. +However, existing efforts are limited to formalization languages with +substantial online corpora and struggle to keep pace with rapidly +evolving languages. To bridge the gap between traditional and computerized +proofs, my approach to formalizing theorem proving involves generating formal +steps and complete proofs using Large Language Models (LLMs) based on Natural +Language (NL) proofs. The method is to introduce the basic structure and +tactics in general, determine how AI can assist the mathematical formalization +process to improve its performance, and give examples of solving problems in +Lean 4 compared to NL, mainly from the IMO, and a sample theorem proof in +abstract algebra. + +
+
+
+
+
+ + ♻ ☆ Tree of Attacks: Jailbreaking Black-Box LLMs Automatically NeurIPS 2024 + + +
+ While Large Language Models (LLMs) display versatile functionality, they +continue to generate harmful, biased, and toxic content, as demonstrated by the +prevalence of human-designed jailbreaks. In this work, we present Tree of +Attacks with Pruning (TAP), an automated method for generating jailbreaks that +only requires black-box access to the target LLM. TAP utilizes an attacker LLM +to iteratively refine candidate (attack) prompts until one of the refined +prompts jailbreaks the target. In addition, before sending prompts to the +target, TAP assesses them and prunes the ones unlikely to result in jailbreaks, +reducing the number of queries sent to the target LLM. In empirical +evaluations, we observe that TAP generates prompts that jailbreak +state-of-the-art LLMs (including GPT4-Turbo and GPT4o) for more than 80% of the +prompts. This significantly improves upon the previous state-of-the-art +black-box methods for generating jailbreaks while issuing fewer queries to the +target. Furthermore, TAP is also capable of jailbreaking LLMs +protected by state-of-the-art guardrails, e.g., LlamaGuard. + +
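+ The core loop is compact; a sketch with hypothetical stand-ins (`refine`, `on_topic`, `query_target`, `is_jailbroken`) for the attacker LLM, the evaluator, and the target from the paper:
+
+def tap(goal, width=4, depth=10):
+    frontier = [goal]
+    for _ in range(depth):
+        # Branch: the attacker LLM proposes refinements of each candidate.
+        children = [refine(p, goal) for p in frontier for _ in range(width)]
+        # Prune off-topic prompts before spending target queries on them.
+        children = [p for p in children if on_topic(p, goal)]
+        for prompt in children:
+            if is_jailbroken(query_target(prompt), goal):
+                return prompt
+        frontier = children[:width]  # bounded frontier (scoring omitted)
+    return None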
+
+ comment: Accepted for presentation at NeurIPS 2024. Code: + https://github.com/RICommunity/TAP +
+
+
+
+
+ + ♻ ☆ Need a Small Specialized Language Model? Plan Early! + + +
+ Large language models are versatile tools but are not suitable for small +inference budgets. Small models have more efficient inference, but their lower +capacity means that their performance can be good only if one limits their +scope to a specialized domain. This paper explores how to get good specialized +small language models using a large, generic, pretraining set and a limited +amount of specialized data. We consider two scenarios, depending on whether (i) +one can afford pretraining a model for each specialization task, or (ii) one +wants to cheaply adapt a single pretrained model for each task. In the first +scenario, we propose an effective solution based on importance sampling: we +resample the pretraining set to imitate the specialization data and train a +small model on it. In the second scenario, we propose a novel architecture, +projected networks (PN). PN is a large network whose parameters can be linearly +projected into a small network for specialization. For both scenarios, we +demonstrate the empirical effectiveness of our solutions across various +domains, training set sizes, and training budgets. + +
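+ For scenario (i), the importance-sampling idea can be prototyped with unigram models as a cheap stand-in for the density ratio (the paper's actual estimator may differ):
+
+import math
+import random
+from collections import Counter
+
+def importance_weights(pretrain_docs, special_docs):
+    # Weight each generic document by how much it resembles the
+    # specialization data, via a smoothed unigram likelihood ratio.
+    def unigram(docs):
+        c = Counter(w for doc in docs for w in doc.split())
+        n = sum(c.values())
+        return lambda w: (c[w] + 1) / (n + len(c) + 1)  # add-one smoothing
+    p_spec, p_gen = unigram(special_docs), unigram(pretrain_docs)
+    weights = []
+    for doc in pretrain_docs:
+        words = doc.split()
+        avg = sum(math.log(p_spec(w) / p_gen(w)) for w in words) / max(len(words), 1)
+        weights.append(math.exp(avg))
+    return weights
+
+docs = ["the cat sat", "gradient descent converges", "cats purr"]
+spec = ["stochastic gradient descent", "convergence of descent methods"]
+resampled = random.choices(docs, weights=importance_weights(docs, spec), k=3)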
+
+
+
+
+ + ♻ ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models are available in + https://github.com/MIO-Team/MIO +
+
+
+
+
+ + ♻ ☆ Preference Learning Algorithms Do Not Learn Preference Rankings NeurIPS 2024 + + +
+ Preference learning algorithms (e.g., RLHF and DPO) are frequently used to +steer LLMs to produce generations that are more preferred by humans, but our +understanding of their inner workings is still limited. In this work, we study +the conventional wisdom that preference learning trains models to assign higher +likelihoods to more preferred outputs than less preferred outputs, measured via +ranking accuracy. Surprisingly, we find that most state-of-the-art +preference-tuned models achieve a ranking accuracy of less than 60% on common +preference datasets. We furthermore derive the idealized ranking accuracy that +a preference-tuned LLM would achieve if it optimized the DPO or RLHF objective +perfectly. We demonstrate that existing models exhibit a significant alignment +gap -- i.e., a gap between the observed and idealized ranking accuracies. We +attribute this discrepancy to the DPO objective, which is empirically and +theoretically ill-suited to fix even mild ranking errors in the reference +model, and derive a simple and efficient formula for quantifying the difficulty +of learning a given preference datapoint. Finally, we demonstrate that ranking +accuracy strongly correlates with the empirically popular win rate metric when +the model is close to the reference model used in the objective, shedding +further light on the differences between on-policy (e.g., RLHF) and off-policy +(e.g., DPO) preference learning algorithms. + +
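+ The headline metric is simple to compute given per-response log-likelihoods; `logprob` below is a hypothetical function returning the summed token log-probabilities of a response under the model:
+
+def ranking_accuracy(pairs, logprob):
+    # pairs: list of (prompt, chosen, rejected) preference triples.
+    hits = sum(
+        logprob(prompt, chosen) > logprob(prompt, rejected)
+        for prompt, chosen, rejected in pairs
+    )
+    return hits / len(pairs)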
+
+ comment: NeurIPS 2024 camera-ready +
+
+
+
+
+ + ♻ ☆ OmniJARVIS: Unified Vision-Language-Action Tokenization Enables + Open-World Instruction Following Agents NeurIPS 2024 + + +
+ This paper presents OmniJARVIS, a novel Vision-Language-Action (VLA) model +for open-world instruction-following agents in Minecraft. Compared to prior +works that either emit textual goals to separate controllers or produce the +control command directly, OmniJARVIS seeks a different path to ensure both +strong reasoning and efficient decision-making capabilities via unified +tokenization of multimodal interaction data. First, we introduce a +self-supervised approach to learn a behavior encoder that produces discretized +tokens for behavior trajectories $\tau = \{o_0, a_0, \dots\}$ and an imitation +learning policy decoder conditioned on these tokens. These additional behavior +tokens will be augmented to the vocabulary of pretrained Multimodal Language +Models. With this encoder, we then pack long-term multimodal interactions +involving task instructions, memories, thoughts, observations, textual +responses, behavior trajectories, etc., into unified token sequences and model +them with autoregressive transformers. Thanks to the semantically meaningful +behavior tokens, the resulting VLA model, OmniJARVIS, can reason (by producing +chains of thought), plan, answer questions, and act (by producing behavior +tokens for the imitation learning policy decoder). OmniJARVIS demonstrates +excellent performance on a comprehensive collection of atomic, programmatic, +and open-ended tasks in open-world Minecraft. Our analysis further unveils the +crucial design principles in interaction data formation, unified tokenization, +and its scaling potential. The dataset, models, and code will be released at +https://craftjarvis.org/OmniJARVIS. + +
+
+ comment: accepted on NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Implicit Personalization in Language Models: A Systematic Study EMNLP 2024 + + +
+ Implicit Personalization (IP) is a phenomenon of language models inferring a +user's background from the implicit cues in the input prompts and tailoring the +response based on this inference. While previous work has touched upon various +instances of this problem, there lacks a unified framework to study this +behavior. This work systematically studies IP through a rigorous mathematical +formulation, a multi-perspective moral reasoning framework, and a set of case +studies. Our theoretical foundation for IP relies on a structural causal model +and introduces a novel method, indirect intervention, to estimate the causal +effect of a mediator variable that cannot be directly intervened upon. Beyond +the technical approach, we also introduce a set of moral reasoning principles +based on three schools of moral philosophy to study when IP may or may not be +ethically appropriate. Equipped with both mathematical and ethical insights, we +present three diverse case studies illustrating the varied nature of the IP +problem and offer recommendations for future research. Our code is at +https://github.com/jiarui-liu/IP, and our data is at +https://huggingface.co/datasets/Jerry999/ImplicitPersonalizationData. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Will LLMs Replace the Encoder-Only Models in Temporal Relation + Classification? + + +
+ The automatic detection of temporal relations among events has mainly been +investigated with encoder-only models such as RoBERTa. Large Language Models +(LLMs) have recently shown promising performance in temporal reasoning tasks +such as temporal question answering. Nevertheless, recent studies have tested +only closed-source LLMs on detecting temporal relations, limiting the +interpretability of those results. In this work, we +investigate LLMs' performance and decision process in the Temporal Relation +Classification task. First, we assess the performance of seven open- and +closed-source LLMs, experimenting with in-context learning and lightweight +fine-tuning approaches. Results show that LLMs with in-context learning +significantly underperform smaller encoder-only models based on RoBERTa. Then, +we delve into the possible reasons for this gap by applying explainability +methods. The outcome suggests a limitation of LLMs in this task due to their +autoregressive nature, which causes them to focus only on the last part of the +sequence. Additionally, we evaluate the word embeddings of these two models to +better understand their pre-training differences. The code and the fine-tuned +models are available on GitHub. + +
+
+
+
+
+ + ♻ ☆ Structure-aware Domain Knowledge Injection for Large Language Models + + +
+ This paper introduces a pioneering methodology, termed StructTuning, to +efficiently transform foundation Large Language Models (LLMs) into domain +specialists. It significantly reduces the training corpus requirement to a mere +0.3%, while achieving an impressive 50% of traditional knowledge injection +performance. Our method is inspired by the educational processes of human +students, particularly how structured domain knowledge from textbooks is +assimilated and subsequently applied to tackle real-world challenges through +specific exercises. Based on this, we propose a novel two-stage strategy for +knowledge injection and alignment: Structure-aware Continual Pre-Training +(SCPT) and Structure-aware Supervised Fine-Tuning (SSFT). In the SCPT phase, we +automatically extract the domain knowledge taxonomy and reorganize the training +corpora, enabling LLMs to effectively link textual segments to targeted +knowledge points within the taxonomy. In the SSFT phase, we explicitly prompt +models to elucidate the underlying knowledge structure in their outputs, +leveraging the structured domain insight to address practical problems. Our +method has undergone extensive evaluation across model architectures +and scales, using closed-book question-answering tasks on the LongBench and +MMedBench datasets. Remarkably, it achieves improvements comparable to the +state-of-the-art MMedLM2 on MMedBench while +significantly reducing the training costs to 5%. This breakthrough paves the +way for scaling up our StructTuning for stronger domain-specific LLMs with +comprehensive data utilization. Code is available at +https://github.com/alibaba/struxgpt. + +
+
+ comment: Preprint. Code is available at https://github.com/alibaba/struxgpt +
+
+
+
+
+ + ♻ ☆ AMBROSIA: A Benchmark for Parsing Ambiguous Questions into Database + Queries NeurIPS 2024 + + +
+ Practical semantic parsers are expected to understand user utterances and map +them to executable programs, even when these are ambiguous. We introduce a new +benchmark, AMBROSIA, which we hope will inform and inspire the development of +text-to-SQL parsers capable of recognizing and interpreting ambiguous requests. +Our dataset contains questions showcasing three different types of ambiguity +(scope ambiguity, attachment ambiguity, and vagueness), their interpretations, +and corresponding SQL queries. In each case, the ambiguity persists even when +the database context is provided. This is achieved through a novel approach +that involves controlled generation of databases from scratch. We benchmark +various LLMs on AMBROSIA, revealing that even the most advanced models struggle +to identify and interpret ambiguity in questions. + +
+
+ comment: NeurIPS 2024 D&B Track Spotlight +
+
+
+
+
+ + ♻ ☆ Gated Slot Attention for Efficient Linear-Time Sequence Modeling NeurIPS 2024 + + +
+ Linear attention Transformers and their gated variants, celebrated for +enabling parallel training and efficient recurrent inference, still fall short +in recall-intensive tasks compared to traditional Transformers and demand +significant resources for training from scratch. This paper introduces Gated +Slot Attention (GSA), which enhances Attention with Bounded-memory-Control +(ABC) by incorporating a gating mechanism inspired by Gated Linear Attention +(GLA). Essentially, GSA comprises a two-layer GLA linked via +$\operatorname{softmax}$, utilizing context-aware memory reading and adaptive +forgetting to improve memory capacity while maintaining compact recurrent state +size. This design greatly enhances both training and inference efficiency +through GLA's hardware-efficient training algorithm and reduced state size. +Additionally, retaining the $\operatorname{softmax}$ operation is particularly +beneficial in "finetuning pretrained Transformers to RNNs" (T2R) settings, +reducing the need for extensive training from scratch. Extensive experiments +confirm GSA's superior performance in scenarios requiring in-context recall and +in T2R settings. + +
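+ At its core the recurrence is a gated write into a bounded slot memory followed by a softmax read. A heavily simplified single step (not the paper's exact two-layer formulation):
+
+import torch
+
+def gsa_step(S, k, v, q, gate):
+    # S: (m, d) slot memory; k, q, gate: (m,) slot scores; v: (d,) value.
+    S = gate.unsqueeze(1) * S + torch.outer(k, v)  # adaptive forget + write
+    out = torch.softmax(q, dim=0) @ S              # context-aware read
+    return S, out
+
+m, d = 8, 16
+S = torch.zeros(m, d)
+S, out = gsa_step(S, torch.rand(m), torch.randn(d),
+                  torch.randn(m), torch.sigmoid(torch.randn(m)))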
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Unlearning as multi-task optimization: A normalized gradient difference + approach with an adaptive learning rate + + +
+ Machine unlearning has been used to remove unwanted knowledge acquired by +large language models (LLMs). In this paper, we examine machine unlearning from +an optimization perspective, framing it as a regularized multi-task +optimization problem, where one task optimizes a forgetting objective and +another optimizes the model performance. In particular, we introduce a +normalized gradient difference (NGDiff) algorithm, enabling us to have better +control over the trade-off between the objectives, while integrating a new, +automatic learning rate scheduler. We provide a theoretical analysis and +empirically demonstrate the superior performance of NGDiff among +state-of-the-art unlearning methods on the TOFU and MUSE datasets while +exhibiting stable training. + +
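+ A minimal sketch of a normalized gradient-difference step (sign conventions and the adaptive learning-rate scheduler are simplified relative to the paper): normalize each task gradient so neither objective dominates, then descend the retention loss while ascending the loss on the forget set.
+
+import torch
+
+def ngdiff_step(params, loss_retain, loss_forget, lr=1e-5):
+    g_r = torch.autograd.grad(loss_retain, params, retain_graph=True)
+    g_f = torch.autograd.grad(loss_forget, params)
+    nr = torch.sqrt(sum(g.pow(2).sum() for g in g_r)) + 1e-12
+    nf = torch.sqrt(sum(g.pow(2).sum() for g in g_f)) + 1e-12
+    with torch.no_grad():
+        for p, gr, gf in zip(params, g_r, g_f):
+            # Descend retention, ascend forgetting; both unit-normalized.
+            p -= lr * (gr / nr - gf / nf)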
+
+
+
+
+ + ♻ ☆ ProgressGym: Alignment with a Millennium of Moral Progress NeurIPS 2024 + + +
+ Frontier AI systems, including large language models (LLMs), hold increasing +influence over the epistemology of human users. Such influence can reinforce +prevailing societal values, potentially contributing to the lock-in of +misguided moral beliefs and, consequently, the perpetuation of problematic +moral practices on a broad scale. We introduce progress alignment as a +technical solution to mitigate this imminent risk. Progress alignment +algorithms learn to emulate the mechanics of human moral progress, thereby +addressing the susceptibility of existing alignment methods to contemporary +moral blindspots. To empower research in progress alignment, we introduce +ProgressGym, an experimental framework allowing the learning of moral progress +mechanics from history, in order to facilitate future progress in real-world +moral decisions. Leveraging 9 centuries of historical text and 18 historical +LLMs, ProgressGym enables codification of real-world progress alignment +challenges into concrete benchmarks. Specifically, we introduce three core +challenges: tracking evolving values (PG-Follow), preemptively anticipating +moral progress (PG-Predict), and regulating the feedback loop between human and +AI value shifts (PG-Coevolve). Alignment methods without a temporal dimension +are inapplicable to these tasks. In response, we present lifelong and +extrapolative algorithms as baseline methods of progress alignment, and build +an open leaderboard soliciting novel algorithms and challenges. The framework +and the leaderboard are available at +https://github.com/PKU-Alignment/ProgressGym and +https://huggingface.co/spaces/PKU-Alignment/ProgressGym-LeaderBoard +respectively. + +
+
+ comment: NeurIPS 2024 Track on Datasets and Benchmarks (Spotlight) +
+
+
+
+
+ + ♻ ☆ Enhancing LLM's Cognition via Structurization NeurIPS 2024 + + +
+ When reading long-form text, human cognition is complex and structurized. +While large language models (LLMs) process input contexts through a causal and +sequential perspective, this approach can potentially limit their ability to +handle intricate and complex inputs effectively. To enhance LLM's cognition +capability, this paper presents a novel concept of context structurization. +Specifically, we transform the plain, unordered contextual sentences into +well-ordered and hierarchically structurized elements. By doing so, LLMs can +better grasp intricate and extended contexts through precise attention and +information-seeking along the organized structures. Extensive evaluations are +conducted across various model architectures and sizes (including a series of +auto-regressive LLMs as well as BERT-like masking models) on a diverse set of +NLP tasks (e.g., context-based question-answering, exhaustive hallucination +evaluation, and passage-level dense retrieval). Empirical results show +consistent and significant performance gains afforded by a single-round +structurization. In particular, we boost the open-sourced LLaMA2-70B model to +achieve comparable performance against GPT-3.5-Turbo as the hallucination +evaluator. Besides, we show the feasibility of distilling advanced LLMs' +language processing abilities to a smaller yet effective StruXGPT-7B to execute +structurization, addressing the practicality of our approach. Code is available +at https://github.com/alibaba/struxgpt. + +
+
+ comment: This paper has been accepted by NeurIPS 2024. Code is available at + https://github.com/alibaba/struxgpt +
+
+
+
+
+ + ♻ ☆ On the Proper Treatment of Tokenization in Psycholinguistics EMNLP 2024 + + +
+ Language models are widely used in computational psycholinguistics to test +theories that relate the negative log probability (the surprisal) of a region +of interest (a substring of characters) under a language model to its cognitive +cost experienced by readers, as operationalized, for example, by gaze duration +on the region. However, the application of modern language models to +psycholinguistic studies is complicated by the practice of using tokenization +as an intermediate step in training a model. Doing so results in a language +model over token strings rather than one over character strings. Vexingly, +regions of interest are generally misaligned with these token strings. The +paper argues that token-level language models should be (approximately) +marginalized into character-level language models before they are used in +psycholinguistic studies to compute the surprisal of a region of interest; +then, the marginalized character-level language model can be used to compute +the surprisal of an arbitrary character substring, which we term a focal area, +that the experimenter may wish to use as a predictor. Our proposal of +marginalizing a token-level model into a character-level one solves this +misalignment issue independently of the tokenization scheme. Empirically, we +discover various focal areas whose surprisal is a better psychometric predictor +than the surprisal of the region of interest itself. + +
+
+ comment: Main conference long paper at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ From Text to Emoji: How PEFT-Driven Personality Manipulation Unleashes + the Emoji Potential in LLMs NeurIPS 2024 + + +
+ As the demand for human-like interactions with LLMs continues to grow, so +does the interest in manipulating their personality traits, which has emerged +as a key area of research. Methods like prompt-based In-Context Knowledge +Editing (IKE) and gradient-based Model Editor Networks (MEND) have been +explored but show irregularity and variability. IKE depends on the prompt, +leading to variability and sensitivity, while MEND yields inconsistent and +gibberish outputs. To address this, we employed Opinion QA Based +Parameter-Efficient Fine-Tuning (PEFT), specifically Quantized Low-Rank +Adaptation (QLoRA), to manipulate the Big Five personality traits: Openness, +Conscientiousness, Extraversion, Agreeableness, and Neuroticism. After PEFT, +models such as Mistral-7B-Instruct and Llama-2-7B-chat began generating emojis, +despite their absence in the PEFT data. For instance, Llama-2-7B-chat generated +emojis in 99.5\% of extraversion-related test instances, while +Mistral-7B-Instruct did so in 92.5\% of openness-related test instances. +Explainability analysis indicated that the LLMs used emojis intentionally to +express these traits. This paper makes several novel contributions: first, it +introduces an Opinion QA dataset for PEFT-driven personality manipulation; +second, it develops metric models to benchmark LLM personality traits; third, +it demonstrates PEFT's superiority over IKE for personality manipulation; and +finally, it analyses and validates emoji usage through explainability methods +such as mechanistic interpretability and in-context learning analysis. + +
+
+ comment: NeurIPS 2024 Workshop on Behavioral Machine Learning +
+
+
+
+
+ + ♻ ☆ GITA: Graph to Visual and Textual Integration for Vision-Language Graph + Reasoning NeurIPS 2024 + + +
+ Large Language Models (LLMs) are increasingly used for various tasks with +graph structures. Though LLMs can process graph information in a textual +format, they overlook the rich vision modality, which is an intuitive way for +humans to comprehend structural information and conduct general graph +reasoning. The potential benefits and capabilities of representing graph +structures as visual images (i.e., $\textit{visual graph}$) are still +unexplored. To fill the gap, we innovatively propose an end-to-end framework, +called $\textbf{G}$raph to v$\textbf{I}$sual and $\textbf{T}$extual +Integr$\textbf{A}$tion (GITA), which is the first to incorporate visual graphs +into general graph reasoning. Besides, we establish the $\textbf{G}$raph-based +$\textbf{V}$ision-$\textbf{L}$anguage $\textbf{Q}$uestion $\textbf{A}$nswering +(GVLQA) dataset from existing graph data, which is the first vision-language +dataset for general graph reasoning purposes. Extensive experiments on the +GVLQA dataset and five real-world datasets show that GITA outperforms +mainstream LLMs in terms of general graph reasoning capabilities. Moreover, we +highlight the effectiveness of the layout augmentation on visual graphs and +pretraining on the GVLQA dataset. + +
+
+ comment: NeurIPS 2024; Project Page: v-graph.github.io; Code: + https://github.com/WEIYanbin1999/GITA/ +
+
+
+
+
+ + ♻ ☆ Fight Back Against Jailbreaking via Prompt Adversarial Tuning + + +
+ While Large Language Models (LLMs) have achieved tremendous success in +various applications, they are also susceptible to jailbreaking attacks. +Several primary defense strategies have been proposed to protect LLMs from +producing harmful information, mostly focusing on model fine-tuning or +heuristic defense designs. However, how to achieve intrinsic robustness +through prompt optimization remains an open problem. In this paper, motivated +by adversarial training paradigms for achieving reliable robustness, we propose +an approach named Prompt Adversarial Tuning (PAT) that trains a prompt control +attached to the user prompt as a guard prefix. To achieve our defense goal +whilst maintaining natural performance, we optimize the control prompt with +both adversarial and benign prompts. Comprehensive experiments show that our +method is effective against both grey-box and black-box attacks, reducing the +success rate of advanced attacks to nearly 0%, while maintaining the model's +utility on the benign task and incurring only negligible computational +overhead, charting a new perspective for future explorations in LLM security. +Our code is available at https://github.com/PKU-ML/PAT. + +
+
+
+
+
+ + ♻ ☆ The FineWeb Datasets: Decanting the Web for the Finest Text Data at + Scale + + +
+ The performance of a large language model (LLM) depends heavily on the +quality and size of its pretraining dataset. However, the pretraining datasets +for state-of-the-art open LLMs like Llama 3 and Mixtral are not publicly +available and very little is known about how they were created. In this work, +we introduce FineWeb, a 15-trillion token dataset derived from 96 Common Crawl +snapshots that produces better-performing LLMs than other open pretraining +datasets. To advance the understanding of how best to curate high-quality +pretraining datasets, we carefully document and ablate all of the design +choices used in FineWeb, including in-depth investigations of deduplication and +filtering strategies. In addition, we introduce FineWeb-Edu, a 1.3-trillion +token collection of educational text filtered from FineWeb. LLMs pretrained on +FineWeb-Edu exhibit dramatically better performance on knowledge- and +reasoning-intensive benchmarks like MMLU and ARC. Along with our datasets, we +publicly release our data curation codebase and all of the models trained +during our ablation experiments. + +
+
+
+
+
+ + ♻ ☆ AutoTimes: Autoregressive Time Series Forecasters via Large Language + Models + + +
+ Foundation models of time series have not been fully developed due to the +limited availability of time series corpora and the underexploration of +scalable pre-training. Based on the similar sequential formulation of time +series and natural language, increasing research demonstrates the feasibility +of leveraging large language models (LLM) for time series. Nevertheless, the +inherent autoregressive property and decoder-only architecture of LLMs have not +been fully considered, resulting in insufficient utilization of LLM abilities. +To fully revitalize the general-purpose token transition and multi-step +generation capability of large language models, we propose AutoTimes to +repurpose LLMs as autoregressive time series forecasters, which projects time +series into the embedding space of language tokens and autoregressively +generates future predictions with arbitrary lengths. Compatible with any +decoder-only LLMs, the consequent forecaster exhibits the flexibility of the +lookback length and scalability with larger LLMs. Further, we formulate time +series as prompts, extending the context for prediction beyond the lookback +window, termed in-context forecasting. By introducing LLM-embedded textual +timestamps, AutoTimes can utilize chronological information to align +multivariate time series. Empirically, AutoTimes achieves state-of-the-art with +0.1% trainable parameters and over $5\times$ training/inference speedup +compared to advanced LLM-based forecasters. Code is available at this +repository: https://github.com/thuml/AutoTimes. + +
+
+
+
+
+ + ♻ ☆ Mars: Situated Inductive Reasoning in an Open-World Environment NeurIPS 2024 + + +
+ Large Language Models (LLMs) trained on massive corpora have shown remarkable +success in knowledge-intensive tasks. Yet, most of them rely on pre-stored +knowledge. Inducing new general knowledge from a specific environment and +performing reasoning with the acquired knowledge -- \textit{situated inductive +reasoning}, is crucial and challenging for machine intelligence. In this paper, +we design Mars, an interactive environment devised for situated inductive +reasoning. It introduces counter-commonsense game mechanisms by modifying +terrain, survival setting and task dependency while adhering to certain +principles. In Mars, agents need to actively interact with their surroundings, +derive useful rules and perform decision-making tasks in specific contexts. We +conduct experiments on various RL-based and LLM-based methods, finding that +they all struggle on this challenging situated inductive reasoning benchmark. +Furthermore, we explore \textit{Induction from Reflection}, where we instruct +agents to perform inductive reasoning from history trajectory. The superior +performance underscores the importance of inductive reasoning in Mars. Through +Mars, we aim to galvanize advancements in situated inductive reasoning and set +the stage for developing the next generation of AI systems that can reason in +an adaptive and context-sensitive way. + +
+
+ comment: Accepted by NeurIPS 2024 Track Datasets and Benchmarks. Project page: + https://marscrafter.github.io/ +
+
+
+
+
+ + ♻ ☆ Combining LLMs and Knowledge Graphs to Reduce Hallucinations in Question + Answering + + +
+ Advancements in natural language processing have revolutionized the way we +can interact with digital information systems, such as databases, making them +more accessible. However, challenges persist, especially when accuracy is +critical, as in the biomedical domain. A key issue is the hallucination +problem, where models generate information unsupported by the underlying data, +potentially leading to dangerous misinformation. This paper presents a novel +approach designed to bridge this gap by combining Large Language Models (LLMs) +and Knowledge Graphs (KGs) to improve the accuracy and reliability of +question-answering systems, using a biomedical KG as an example. Built on the +LangChain framework, our method incorporates a query checker that ensures the +syntactical and semantic validity of LLM-generated queries, which are then used +to extract information from a Knowledge Graph, substantially reducing errors +like hallucinations. We evaluated the overall performance using a new benchmark +dataset of 50 biomedical questions, testing several LLMs, including GPT-4 Turbo +and llama3:70b. Our results indicate that while GPT-4 Turbo outperforms other +models in generating accurate queries, open-source models like llama3:70b show +promise with appropriate prompt engineering. To make this approach accessible, +a user-friendly web-based interface has been developed, allowing users to input +natural language queries, view generated and corrected Cypher queries, and +verify the resulting paths for accuracy. Overall, this hybrid approach +effectively addresses common issues such as data gaps and hallucinations, +offering a reliable and intuitive solution for question-answering systems. The +source code for generating the results of this paper and for the user interface +can be found in our Git repository: https://git.zib.de/lpusch/cyphergenkg-gui + +
+
+
+
+
+ + ♻ ☆ AvaTaR: Optimizing LLM Agents for Tool Usage via Contrastive Reasoning NeurIPS 2024 + + +
+ Large language model (LLM) agents have demonstrated impressive capabilities +in utilizing external tools and knowledge to boost accuracy and reduce +hallucinations. However, developing prompting techniques that enable LLM agents +to effectively use these tools and knowledge remains a heuristic and +labor-intensive task. Here, we introduce AvaTaR, a novel and automated +framework that optimizes an LLM agent to effectively leverage provided tools, +improving performance on a given task. During optimization, we design a +comparator module to iteratively deliver insightful and comprehensive prompts +to the LLM agent by contrastively reasoning between positive and negative +examples sampled from training data. We demonstrate AvaTaR on four complex +multimodal retrieval datasets featuring textual, visual, and relational +information, and three general question-answering (QA) datasets. We find AvaTaR +consistently outperforms state-of-the-art approaches across all seven tasks, +exhibiting strong generalization ability when applied to novel cases and +achieving an average relative improvement of 14% on the Hit@1 metric for the +retrieval datasets and 13% for the QA datasets. Code and dataset are available +at https://github.com/zou-group/avatar. + +
+
+ comment: NeurIPS 2024 main conference +
+
+
+
+
+ + ♻ ☆ LINGOLY: A Benchmark of Olympiad-Level Linguistic Reasoning Puzzles in + Low-Resource and Extinct Languages NeurIPS 2024 + + +
+ In this paper, we present the LingOly benchmark, a novel benchmark for +advanced reasoning abilities in large language models. Using challenging +Linguistic Olympiad puzzles, we evaluate (i) capabilities for in-context +identification and generalisation of linguistic patterns in very low-resource +or extinct languages, and (ii) abilities to follow complex task instructions. +The LingOly benchmark covers more than 90 mostly low-resource languages, +minimising issues of data contamination, and contains 1,133 problems across 6 +formats and 5 levels of human difficulty. We assess performance with both +direct accuracy and comparison to a no-context baseline to penalise +memorisation. Scores from 11 state-of-the-art LLMs demonstrate the benchmark to +be challenging, and models perform poorly on the higher difficulty problems. On +harder problems, even the top model only achieved 38.7% accuracy, a 24.7% +improvement over the no-context baseline. Large closed models typically +outperform open models, and in general, the higher-resource the language, the +better the scores. These results indicate that, in the absence of memorisation, +true multi-step out-of-domain reasoning remains a challenge for current +language models. + +
+
+ comment: Oral presentation at NeurIPS 2024 Datasets and Benchmarks Track. 10 + pages, 5 figures, 22 pages supplemental materials +
+
+
+
+
+ + ♻ ☆ ERBench: An Entity-Relationship based Automatically Verifiable + Hallucination Benchmark for Large Language Models + + +
+ Large language models (LLMs) have achieved unprecedented performances in +various applications, yet evaluating them is still challenging. Existing +benchmarks are either manually constructed or are automatic, but lack the +ability to evaluate the thought process of LLMs with arbitrary complexity. We +contend that utilizing existing relational databases based on the +entity-relationship (ER) model is a promising approach for constructing +benchmarks as they contain structured knowledge that can be used to question +LLMs. Unlike knowledge graphs, which are also used to evaluate LLMs, relational +databases have integrity constraints that can be used to better construct +complex in-depth questions and verify answers: (1) functional dependencies can +be used to pinpoint critical keywords that an LLM must know to properly answer +a given question containing certain attribute values; and (2) foreign key +constraints can be used to join relations and construct multi-hop questions, +which can be arbitrarily long and used to debug intermediate answers. We thus +propose ERBench, which uses these integrity constraints to convert any database +into an LLM benchmark. ERBench supports continuous evaluation as databases +change, multimodal questions, and various prompt engineering techniques. In our +experiments, we construct LLM benchmarks using databases of multiple domains +and make an extensive comparison of contemporary LLMs. We show how ERBench can +properly evaluate any LLM by not only checking for answer correctness, but also +effectively verifying the rationales by looking for the right keywords. + +
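+ The foreign-key construction is easy to picture with a toy schema (invented here purely for illustration; ERBench itself converts existing databases):
+
+import sqlite3
+
+con = sqlite3.connect(":memory:")
+con.executescript("""
+CREATE TABLE director(id INTEGER PRIMARY KEY, name TEXT, country TEXT);
+CREATE TABLE movie(id INTEGER PRIMARY KEY, title TEXT,
+                   director_id INTEGER REFERENCES director(id));
+INSERT INTO director VALUES (1, 'Bong Joon-ho', 'South Korea');
+INSERT INTO movie VALUES (1, 'Parasite', 1);
+""")
+
+# A two-hop question built via the foreign key, with a verifiable answer.
+for title, country in con.execute(
+        "SELECT m.title, d.country FROM movie m "
+        "JOIN director d ON m.director_id = d.id"):
+    print(f"Which country is the director of '{title}' from? ->", country)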
+
+
+
+
+ + ♻ ☆ Attend First, Consolidate Later: On the Importance of Attention in + Different LLM Layers + + +
+ In decoder-based LLMs, the representation of a given layer serves two +purposes: as input to the next layer during the computation of the current +token; and as input to the attention mechanism of future tokens. In this work, +we show that the importance of the latter role might be overestimated. To show +this, we start by manipulating the representations of previous tokens; e.g. by +replacing the hidden states at some layer k with random vectors. Our +experiments with four LLMs and four tasks show that this operation often +leads to a small to negligible drop in performance. Importantly, this happens if +the manipulation occurs in the top part of the model, i.e., when k is in the +final 30-50% of the layers. In contrast, doing the same manipulation in earlier +layers might lead to chance-level performance. We continue by switching the +hidden state of certain tokens with hidden states of other tokens from another +prompt; e.g., replacing the word "Italy" with "France" in "What is the capital +of Italy?". We find that when applying this switch in the top 1/3 of the model, +the model ignores it (answering "Rome"). However, if we apply it earlier, the +model conforms to the switch ("Paris"). Our results hint at a two-stage process +in transformer-based LLMs: the first part gathers input from previous tokens, +while the second mainly processes that information internally. + +
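+ The hidden-state interventions can be reproduced with a forward hook. A sketch assuming a HuggingFace-style decoder whose blocks live in `model.model.layers` and return a tuple with hidden states first (the paper restricts the manipulation to previous tokens' positions; this sketch replaces all of them):
+
+import torch
+
+def randomize_layer_output(model, layer_idx):
+    # Replace layer k's output hidden states with random vectors.
+    def hook(module, inputs, output):
+        h = output[0]
+        return (torch.randn_like(h),) + tuple(output[1:])
+    return model.model.layers[layer_idx].register_forward_hook(hook)
+
+# handle = randomize_layer_output(model, k); ...evaluate...; handle.remove()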
+
+
+
+
+ + ♻ ☆ Fact Recall, Heuristics or Pure Guesswork? Precise Interpretations of + Language Models for Fact Completion + + +
+ Previous interpretations of language models (LMs) miss important distinctions +in how these models process factual information. For example, given the query +"Astrid Lindgren was born in" with the corresponding completion "Sweden", no +difference is made between whether the prediction was based on having the exact +knowledge of the birthplace of the Swedish author or assuming that a person +with a Swedish-sounding name was born in Sweden. In this paper, we investigate +four different prediction scenarios for which the LM can be expected to show +distinct behaviors. These scenarios correspond to different levels of model +reliability and types of information being processed - some being less +desirable for factual predictions. To facilitate precise interpretations of LMs +for fact completion, we propose a model-specific recipe called PrISM for +constructing datasets with examples of each scenario based on a set of +diagnostic criteria. We apply a popular interpretability method, causal tracing +(CT), to the four prediction scenarios and find that while CT produces +different results for each scenario, aggregations over a set of mixed examples +may only represent the results from the scenario with the strongest measured +signal. In summary, we contribute tools for a more granular study of fact +completion in language models and analyses that provide a more nuanced +understanding of how LMs process fact-related queries. + +
+
+
+
+
+ + ♻ ☆ Pistis-RAG: Enhancing Retrieval-Augmented Generation with Human Feedback + + +
+ RAG systems face limitations when semantic relevance alone does not guarantee +improved generation quality. This issue is particularly evident given the +sensitivity of large language models (LLMs) to the ordering of few-shot +prompts. To address this challenge, +aligning LLM outputs with human preferences using structured feedback, such as +options to copy, regenerate, or dislike, offers a promising method for +improvement. This feedback is applied to the entire list of inputs rather than +giving specific ratings for individual documents, making it a Listwide Labels +Learning-to-Rank task. + To address this task, we propose Pistis-RAG, a new RAG framework designed +with a content-centric approach to better align LLMs with human preferences. +Pistis-RAG effectively utilizes human feedback, enhancing content ranking and +generation quality. To validate our framework, we use public datasets to +simulate human feedback, allowing us to evaluate and refine our method +effectively. Experimental results indicate that Pistis-RAG improves alignment +with human preferences relative to the baseline RAG system, showing a 6.06% +increase in MMLU (English) and a 7.08% increase in C-EVAL (Chinese) accuracy +metrics. These results highlight Pistis-RAG's effectiveness in overcoming the +limitations associated with traditional RAG approaches. + +
+
+
+
+
+ + ♻ ☆ Benchmarking Complex Instruction-Following with Multiple Constraints + Composition NeurIPS 2024 + + +
+ Instruction following is one of the fundamental capabilities of large +language models (LLMs). As the ability of LLMs is constantly improving, they +have been increasingly applied to deal with complex human instructions in +real-world scenarios. Therefore, how to evaluate the ability of complex +instruction-following of LLMs has become a critical research problem. Existing +benchmarks mainly focus on modeling different types of constraints in human +instructions while neglecting the composition of different constraints, which +is an indispensable constituent in complex instructions. To this end, we +propose ComplexBench, a benchmark for comprehensively evaluating the ability of +LLMs to follow complex instructions composed of multiple constraints. We +propose a hierarchical taxonomy for complex instructions, including 4 +constraint types, 19 constraint dimensions, and 4 composition types, and +manually collect a high-quality dataset accordingly. To make the evaluation +reliable, we augment LLM-based evaluators with rules to effectively verify +whether generated texts can satisfy each constraint and composition. +Furthermore, we obtain the final evaluation score based on the dependency +structure determined by different composition types. ComplexBench identifies +significant deficiencies in existing LLMs when dealing with complex +instructions with multiple constraints composition. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ Large Language Model Unlearning via Embedding-Corrupted Prompts NeurIPS 2024 + + +
+ Large language models (LLMs) have advanced to encompass extensive knowledge +across diverse domains. Yet controlling what a large language model should not +know is important for ensuring alignment and thus safe use. However, accurately +and efficiently unlearning knowledge from an LLM remains challenging due to the +potential collateral damage caused by the fuzzy boundary between retention and +forgetting, and the large computational requirements for optimization across +state-of-the-art models with hundreds of billions of parameters. In this work, +we present \textbf{Embedding-COrrupted (ECO) Prompts}, a lightweight unlearning +framework for large language models to address both the challenges of knowledge +entanglement and unlearning efficiency. Instead of relying on the LLM itself to +unlearn, we enforce an unlearned state during inference by employing a prompt +classifier to identify and safeguard prompts to forget. We learn corruptions +added to prompt embeddings via zeroth order optimization toward the unlearning +objective offline and corrupt prompts flagged by the classifier during +inference. We find that these embedding-corrupted prompts not only lead to +desirable outputs that satisfy the unlearning objective but also closely +approximate the output from a model that has never been trained on the data +intended for forgetting. Through extensive experiments on unlearning, we +demonstrate the superiority of our method in achieving promising unlearning at +\textit{nearly zero side effects} in general domains and domains closely +related to the unlearned ones. Additionally, we highlight the scalability of +our method to 100 LLMs, ranging from 0.5B to 236B parameters, incurring no +additional cost as the number of parameters increases. We have made our code +publicly available at \url{https://github.com/chrisliu298/llm-unlearn-eco}. + +
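+ At inference time the mechanism reduces to a guarded embedding perturbation; in the sketch below, `classifier`, `corruption`, `embed`, and `lm_generate` are hypothetical stand-ins for the paper's components:
+
+def eco_generate(prompt, classifier, corruption, embed, lm_generate):
+    e = embed(prompt)          # (seq_len, dim) prompt embeddings
+    if classifier(prompt):     # prompt touches knowledge marked for forgetting
+        e = e + corruption     # perturbation learned offline (zeroth-order)
+    return lm_generate(inputs_embeds=e)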
+
+ comment: NeurIPS 2024 Poster +
+
+
+
+
+ + ♻ ☆ Can Language Models Replace Programmers? REPOCOD Says 'Not Yet' + + +
+ Large language models (LLMs) have achieved high accuracy, i.e., more than 90% +pass@1, in solving Python coding problems in HumanEval and MBPP. Thus, a +natural question is: do LLMs achieve code completion +performance comparable to that of human developers? Unfortunately, one cannot +answer this question using existing manually crafted or simple (e.g., +single-line) code generation benchmarks, since such tasks fail to represent +real-world software development tasks. In addition, existing benchmarks often +use poor code correctness metrics, providing misleading conclusions. + To address these challenges, we create REPOCOD, a code generation benchmark +with 980 problems collected from 11 popular real-world projects, with more than +58% of them requiring file-level or repository-level context information. In +addition, REPOCOD has the longest average canonical solution length (331.6 +tokens) and the highest average cyclomatic complexity (9.00) compared to +existing benchmarks. Each task in REPOCOD includes 313.5 developer-written test +cases on average for better correctness evaluation. In our evaluations of ten +LLMs, none of the models achieves more than 30% pass@1 on REPOCOD, indicating +the necessity of building stronger LLMs that can help developers in real-world +software development. REPOCOD is available at +https://github.com/ltasset/REPOCOD + +
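+ For reference, pass@k figures like those quoted above are conventionally computed with the unbiased estimator of Chen et al. (2021):
+
+import math
+
+def pass_at_k(n, c, k):
+    # n samples per problem, c of them correct; probability that at least
+    # one of k drawn samples passes all tests.
+    if n - c < k:
+        return 1.0
+    return 1.0 - math.prod((n - c - i) / (n - i) for i in range(k))
+
+print(pass_at_k(n=10, c=3, k=1))  # 0.3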
+
+
+
+
+ + ♻ ☆ Large Language Models for Generative Information Extraction: A Survey + + +
+ Information extraction (IE) aims to extract structural knowledge from plain +natural language texts. Recently, generative Large Language Models (LLMs) have +demonstrated remarkable capabilities in text understanding and generation. As a +result, numerous works have been proposed to integrate LLMs for IE tasks based +on a generative paradigm. To conduct a comprehensive systematic review and +exploration of LLM efforts for IE tasks, in this study, we survey the most +recent advancements in this field. We first present an extensive overview by +categorizing these works in terms of various IE subtasks and techniques, and +then we empirically analyze the most advanced methods and discover the emerging +trend of IE tasks with LLMs. Based on this thorough review, we identify +several technical insights and promising research directions that deserve +further exploration in future studies. We maintain a public repository and +consistently update related works and resources on GitHub +(\href{https://github.com/quqxui/Awesome-LLM4IE-Papers}{LLM4IE repository}) + +
+
+ comment: The article has been accepted by Frontiers of Computer Science (FCS), + with the DOI: {10.1007/s11704-024-40555-y}. You can cite the FCS version +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 104 + +
+
+
+ + ☆ URAvatar: Universal Relightable Gaussian Codec Avatars SIGGRAPH + + +
+ We present a new approach to creating photorealistic and relightable head +avatars from a phone scan with unknown illumination. The reconstructed avatars +can be animated and relit in real time with the global illumination of diverse +environments. Unlike existing approaches that estimate parametric reflectance +parameters via inverse rendering, our approach directly models learnable +radiance transfer that incorporates global light transport in an efficient +manner for real-time rendering. However, learning such a complex light +transport that can generalize across identities is non-trivial. A phone scan in +a single environment lacks sufficient information to infer how the head would +appear in general environments. To address this, we build a universal +relightable avatar model represented by 3D Gaussians. We train on hundreds of +high-quality multi-view human scans with controllable point lights. +High-resolution geometric guidance further enhances the reconstruction accuracy +and generalization. Once trained, we finetune the pretrained model on a phone +scan using inverse rendering to obtain a personalized relightable avatar. Our +experiments establish the efficacy of our design, outperforming existing +approaches while retaining real-time rendering capability. + +
+
+ comment: SIGGRAPH Asia 2024. Website: + https://junxuan-li.github.io/urgca-website/ +
+
+
+
+
+ + ☆ EgoMimic: Scaling Imitation Learning via Egocentric Video + + +
+ The scale and diversity of demonstration data required for imitation learning +is a significant challenge. We present EgoMimic, a full-stack framework which +scales manipulation via human embodiment data, specifically egocentric human +videos paired with 3D hand tracking. EgoMimic achieves this through: (1) a +system to capture human embodiment data using the ergonomic Project Aria +glasses, (2) a low-cost bimanual manipulator that minimizes the kinematic gap +to human data, (3) cross-domain data alignment techniques, and (4) an imitation +learning architecture that co-trains on human and robot data. Compared to prior +works that only extract high-level intent from human videos, our approach +treats human and robot data equally as embodied demonstration data and learns a +unified policy from both data sources. EgoMimic achieves significant +improvement on a diverse set of long-horizon, single-arm and bimanual +manipulation tasks over state-of-the-art imitation learning methods and enables +generalization to entirely new scenes. Finally, we show a favorable scaling +trend for EgoMimic, where adding 1 hour of additional hand data is +significantly more valuable than 1 hour of additional robot data. Videos and +additional information can be found at https://egomimic.github.io/ + +
+
+
+
+
+ + ☆ Enhancing Motion in Text-to-Video Generation with Decomposed Encoding + and Conditioning NeurIPS 2024 + + +
+ Despite advancements in Text-to-Video (T2V) generation, producing videos with +realistic motion remains challenging. Current models often yield static or +minimally dynamic outputs, failing to capture complex motions described by +text. This issue stems from internal biases in text encoding, which +overlook motion, and from inadequate conditioning mechanisms in T2V generation +models. To address this, we propose a novel framework called DEcomposed MOtion +(DEMO), which enhances motion synthesis in T2V generation by decomposing both +text encoding and conditioning into content and motion components. Our method +includes a content encoder for static elements and a motion encoder for +temporal dynamics, alongside separate content and motion conditioning +mechanisms. Crucially, we introduce text-motion and video-motion supervision to +improve the model's understanding and generation of motion. Evaluations on +benchmarks such as MSR-VTT, UCF-101, WebVid-10M, EvalCrafter, and VBench +demonstrate DEMO's superior ability to produce videos with enhanced motion +dynamics while maintaining high visual quality. Our approach significantly +advances T2V generation by integrating comprehensive motion understanding +directly from textual descriptions. Project page: +https://PR-Ryan.github.io/DEMO-project/ + +
+
+ comment: Accepted at NeurIPS 2024, code available at + https://github.com/PR-Ryan/DEMO +
+
+
+
+
+ + ☆ Teaching Embodied Reinforcement Learning Agents: Informativeness and + Diversity of Language Use EMNLP 2024 + + +
+ In real-world scenarios, it is desirable for embodied agents to have the +ability to leverage human language to gain explicit or implicit knowledge for +learning tasks. Despite recent progress, most previous approaches adopt simple +low-level instructions as language inputs, which may not reflect natural human +communication. It's not clear how to incorporate rich language use to +facilitate task learning. To address this question, this paper studies +different types of language inputs in facilitating reinforcement learning (RL) +embodied agents. More specifically, we examine how different levels of language +informativeness (i.e., feedback on past behaviors and future guidance) and +diversity (i.e., variation of language expressions) impact agent learning and +inference. Our empirical results based on four RL benchmarks demonstrate that +agents trained with diverse and informative language feedback can achieve +enhanced generalization and fast adaptation to new tasks. These findings +highlight the pivotal role of language use in teaching embodied agents new +tasks in an open world. Project website: +https://github.com/sled-group/Teachable_RL + +
+
+ comment: EMNLP 2024 Main. Project website: + https://github.com/sled-group/Teachable_RL +
+
+
+
+
+ + ☆ ARQ: A Mixed-Precision Quantization Framework for Accurate and + Certifiably Robust DNNs + + +
+ Mixed precision quantization has become an important technique for enabling +the execution of deep neural networks (DNNs) on limited resource computing +platforms. Traditional quantization methods have primarily concentrated on +maintaining neural network accuracy, either ignoring the impact of quantization +on the robustness of the network, or using only empirical techniques for +improving robustness. In contrast, techniques for robustness certification, +which can provide strong guarantees about the robustness of DNNs, have not been +used during quantization due to their high computation cost. + This paper introduces ARQ, an innovative mixed-precision quantization method +that not only preserves the clean accuracy of the smoothed classifiers but also +maintains their certified robustness. ARQ uses reinforcement learning to find +accurate and robust DNN quantization, while efficiently leveraging randomized +smoothing, a popular class of statistical DNN verification algorithms, to guide +the search process. + We compare ARQ with multiple state-of-the-art quantization techniques on +several DNN architectures commonly used in quantization studies: ResNet-20 on +CIFAR-10, ResNet-50 on ImageNet, and MobileNetV2 on ImageNet. We demonstrate +that ARQ consistently performs better than these baselines across all the +benchmarks and the input perturbation levels. In many cases, the performance of +ARQ quantized networks can reach that of the original DNN with floating-point +weights, but with only 1.5% of the instructions. + +
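ARQ's search is guided by randomized smoothing, which certifies a classifier by majority-voting over Gaussian-perturbed copies of an input. A minimal sketch of smoothed prediction with the standard certified L2 radius (in the style of Cohen et al.) follows; `base_classifier` is a placeholder, and a real certifier would add statistical confidence bounds on the estimated top-class probability rather than using the raw empirical value.

```python
import numpy as np
from scipy.stats import norm

def smoothed_predict(base_classifier, x, sigma=0.25, n=1000, num_classes=10):
    """Monte Carlo estimate of the smoothed classifier
    g(x) = argmax_c P(f(x + N(0, sigma^2 I)) = c), with the simplified
    certified L2 radius R = sigma * Phi^{-1}(p_top) when p_top > 1/2."""
    counts = np.zeros(num_classes, dtype=int)
    for _ in range(n):
        counts[base_classifier(x + sigma * np.random.randn(*x.shape))] += 1
    top = int(counts.argmax())
    p_top = counts[top] / n
    radius = sigma * norm.ppf(p_top) if p_top > 0.5 else 0.0
    return top, radius
```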
+
+
+
+
+ + ☆ Learning Video Representations without Natural Videos + + +
+ In this paper, we show that useful video representations can be learned from +synthetic videos and natural images, without incorporating natural videos in +the training. We propose a progression of video datasets synthesized by simple +generative processes, that model a growing set of natural video properties +(e.g. motion, acceleration, and shape transformations). The downstream +performance of video models pre-trained on these generated datasets gradually +increases with the dataset progression. A VideoMAE model pre-trained on our +synthetic videos closes 97.2% of the performance gap on UCF101 action +classification between training from scratch and self-supervised pre-training +from natural videos, and outperforms the pre-trained model on HMDB51. +Introducing crops of static images to the pre-training stage results in similar +performance to UCF101 pre-training and outperforms the UCF101 pre-trained model +on 11 out of 14 out-of-distribution datasets of UCF101-P. Analyzing the +low-level properties of the datasets, we identify correlations between frame +diversity, frame similarity to natural data, and downstream performance. Our +approach provides a more controllable and transparent alternative to video data +curation processes for pre-training. + +
+
+ comment: Project page: https://unicorn53547.github.io/video_syn_rep/ +
+
+
+
+
+ + ☆ No Pose, No Problem: Surprisingly Simple 3D Gaussian Splats from Sparse + Unposed Images + + +
+ We introduce NoPoSplat, a feed-forward model capable of reconstructing 3D +scenes parameterized by 3D Gaussians from \textit{unposed} sparse multi-view +images. Our model, trained exclusively with photometric loss, achieves +real-time 3D Gaussian reconstruction during inference. To eliminate the need +for accurate pose input during reconstruction, we anchor one input view's local +camera coordinates as the canonical space and train the network to predict +Gaussian primitives for all views within this space. This approach obviates the +need to transform Gaussian primitives from local coordinates into a global +coordinate system, thus avoiding errors associated with per-frame Gaussians and +pose estimation. To resolve scale ambiguity, we design and compare various +intrinsic embedding methods, ultimately opting to convert camera intrinsics +into a token embedding and concatenate it with image tokens as input to the +model, enabling accurate scene scale prediction. We utilize the reconstructed +3D Gaussians for novel view synthesis and pose estimation tasks and propose a +two-stage coarse-to-fine pipeline for accurate pose estimation. Experimental +results demonstrate that our pose-free approach can achieve superior novel view +synthesis quality compared to pose-required methods, particularly in scenarios +with limited input image overlap. For pose estimation, our method, trained +without ground truth depth or explicit matching loss, significantly outperforms +the state-of-the-art methods with substantial improvements. This work makes +significant advances in pose-free generalizable 3D reconstruction and +demonstrates its applicability to real-world scenarios. Code and trained models +are available at https://noposplat.github.io/. + +
+
+ comment: Project page: https://noposplat.github.io/ +
+
+
+
+
+ + ☆ DiffPano: Scalable and Consistent Text to Panorama Generation with + Spherical Epipolar-Aware Diffusion NeurIPS2024 + + +
+ Diffusion-based methods have achieved remarkable results in 2D image and +3D object generation; however, the generation of 3D scenes and even +$360^{\circ}$ images remains constrained, due to the limited number of scene +datasets, the complexity of 3D scenes themselves, and the difficulty of +generating consistent multi-view images. To address these issues, we first +establish a large-scale panoramic video-text dataset containing millions of +consecutive panoramic keyframes with corresponding panoramic depths, camera +poses, and text descriptions. Then, we propose a novel text-driven panoramic +generation framework, termed DiffPano, to achieve scalable, consistent, and +diverse panoramic scene generation. Specifically, benefiting from the powerful +generative capabilities of stable diffusion, we fine-tune a single-view +text-to-panorama diffusion model with LoRA on the established panoramic +video-text dataset. We further design a spherical epipolar-aware multi-view +diffusion model to ensure the multi-view consistency of the generated panoramic +images. Extensive experiments demonstrate that DiffPano can generate scalable, +consistent, and diverse panoramic images with given unseen text descriptions +and camera poses. + +
+
+ comment: NeurIPS2024, Project: https://github.com/zju3dv/DiffPano; Code: + https://github.com/zju3dv/DiffPano +
+
+
+
+
+ + ☆ Chasing Better Deep Image Priors between Over- and + Under-parameterization + + +
+ Deep Neural Networks (DNNs) are well-known to act as over-parameterized deep +image priors (DIP) that regularize various image inverse problems. Meanwhile, +researchers also proposed extremely compact, under-parameterized image priors +(e.g., deep decoder) that are strikingly competent for image restoration too, +despite a loss of accuracy. These two extremes prompt us to ask whether there +exists a better solution in the middle: between over- and under-parameterized +image priors, can one identify "intermediate" parameterized image priors that +achieve better trade-offs between performance, efficiency, and even preserving +strong transferability? Drawing inspiration from the lottery ticket hypothesis +(LTH), we conjecture and study a novel "lottery image prior" (LIP) by +exploiting DNNs' inherent sparsity, stated as: given an over-parameterized +DNN-based image prior, it will contain a sparse subnetwork that can be trained +in isolation, to match the original DNN's performance when being applied as a +prior to various image inverse problems. Our results validate the superiority +of LIPs: we can successfully locate the LIP subnetworks from over-parameterized +DIPs at substantial sparsity ranges. Those LIP subnetworks significantly +outperform deep decoders under comparably compact model sizes (by often fully +preserving the effectiveness of their over-parameterized counterparts), and +they also possess high transferability across different images as well as +restoration task types. Besides, we also extend LIP to compressive sensing +image reconstruction, where a pre-trained GAN generator is used as the prior +(in contrast to untrained DIP or deep decoder), and confirm its validity in +this setting too. To the best of our knowledge, this is the first time that LTH is +demonstrated to be relevant in the context of inverse problems or image priors. + +
+
+ comment: Codes are available at + https://github.com/VITA-Group/Chasing-Better-DIPs +
+
+
+
+
+ + ☆ DexMimicGen: Automated Data Generation for Bimanual Dexterous + Manipulation via Imitation Learning + + +
+ Imitation learning from human demonstrations is an effective means to teach +robots manipulation skills. But data acquisition is a major bottleneck in +applying this paradigm more broadly, due to the cost and human effort +involved. There has been significant interest in imitation learning for +bimanual dexterous robots, like humanoids. Unfortunately, data collection is +even more challenging here due to the difficulty of simultaneously controlling +multiple arms and multi-fingered hands. Automated data generation in simulation +is a compelling, scalable alternative to fuel this need for data. To this end, +we introduce DexMimicGen, a large-scale automated data generation system that +synthesizes trajectories from a handful of human demonstrations for humanoid +robots with dexterous hands. We present a collection of simulation environments +in the setting of bimanual dexterous manipulation, spanning a range of +manipulation behaviors and different requirements for coordination between the +two arms. We generate 21K demos across these tasks from just 60 source human +demos and study the effect of several data generation and policy learning +decisions on agent performance. Finally, we present a real-to-sim-to-real +pipeline and deploy it on a real-world humanoid can sorting task. Videos and +more are at https://dexmimicgen.github.io/ + +
+
+ comment: Project website: https://dexmimicgen.github.io/ +
+
+
+
+
+ + ☆ Extended Object Tracking and Classification based on Linear Splines + + +
+ This paper introduces a framework based on linear splines for 2-dimensional +extended object tracking and classification. Unlike state-of-the-art models, +linear splines make it possible to represent extended objects whose contour is an +arbitrarily complex curve. An exact likelihood is derived for the case in which +noisy measurements can be scattered from any point on the contour of the +extended object, while an approximate Monte Carlo likelihood is provided for +the case wherein scattering points can be anywhere, i.e. inside or on the +contour, on the object surface. Exploiting such likelihood to measure how well +the observed data fit a given shape, a suitable estimator is developed. The +proposed estimator models the extended object in terms of a kinematic state, +providing object position and orientation, along with a shape vector, +characterizing object contour and surface. The kinematic state is estimated via +a nonlinear Kalman filter, while the shape vector is estimated via a Bayesian +classifier so that classification is implicitly solved during shape estimation. +Numerical experiments are provided to assess the effectiveness of the proposed +estimator against state-of-the-art extended object estimators. + +
+
+
+
+
+ + ☆ Federated Black-Box Adaptation for Semantic Segmentation + + +
+ Federated Learning (FL) is a form of distributed learning that allows +multiple institutions or clients to collaboratively learn a global model to +solve a task. This allows the model to utilize the information from every +institute while preserving data privacy. However, recent studies show that the +promise of protecting the privacy of data is not upheld by existing methods and +that it is possible to recreate the training data from the different +institutions. This is done by utilizing gradients transferred between the +clients and the global server during training or by knowing the model +architecture at the client end. In this paper, we propose a federated learning +framework for semantic segmentation without knowing the model architecture or +transferring gradients between the client and the server, thus enabling better +privacy preservation. We propose BlackFed - a black-box adaptation of neural +networks that utilizes zero order optimization (ZOO) to update the client model +weights and first order optimization (FOO) to update the server weights. We +evaluate our approach on several computer vision and medical imaging datasets +to demonstrate its effectiveness. To the best of our knowledge, this work is +one of the first to employ federated learning for segmentation without +exchanging gradients or model information. Code: +https://github.com/JayParanjape/blackfed/tree/master + +
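The client-side update in BlackFed relies on zeroth-order optimization, which estimates a gradient purely from loss evaluations, so no backpropagation through (or knowledge of) the rest of the pipeline is needed. A generic two-point estimator is sketched below under that reading; `client_loss` is a hypothetical stand-in for a forward pass through the black-box system, and the authors' exact ZOO variant may differ.

```python
import numpy as np

def zoo_gradient(loss_fn, w, mu=1e-3, n_dirs=20):
    """Two-point zeroth-order gradient estimate: average
    (L(w + mu*u) - L(w - mu*u)) / (2*mu) * u over random unit directions u.
    Only loss values are queried, never gradients."""
    grad = np.zeros_like(w)
    for _ in range(n_dirs):
        u = np.random.randn(*w.shape)
        u /= np.linalg.norm(u)
        grad += (loss_fn(w + mu * u) - loss_fn(w - mu * u)) / (2 * mu) * u
    return grad / n_dirs

# One black-box client step (client_loss is hypothetical):
# w -= lr * zoo_gradient(client_loss, w)
```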
+
+ comment: Accepted at NEURIPS 2024 +
+
+
+
+
+ + ☆ Redefining <Creative> in Dictionary: Towards an Enhanced Semantic + Understanding of Creative Generation + + +
+ Creativity, both in humans and diffusion models, remains an inherently +abstract concept; thus, simply adding "creative" to a prompt does not yield +reliable semantic recognition by the model. In this work, we concretize the +abstract notion of "creative" through the TP2O task, which aims to merge two +unrelated concepts, and introduce CreTok, redefining "creative" as the token +$\texttt{<CreTok>}$. This redefinition offers a more concrete and universally +adaptable representation for concept blending. This redefinition occurs +continuously, involving the repeated random sampling of text pairs with +different concepts and optimizing cosine similarity between target and constant +prompts. This approach enables $\texttt{<CreTok>}$ to learn a method for +creative concept fusion. Extensive experiments demonstrate that the creative +capability enabled by $\texttt{<CreTok>}$ substantially surpasses recent SOTA +diffusion models and achieves superior creative generation. CreTok exhibits +greater flexibility and reduced time overhead, as $\texttt{<CreTok>}$ can +function as a universal token for any concept, facilitating creative generation +without retraining. + +
+
+
+
+
+ + ☆ Scaling Concept With Text-Guided Diffusion Models + + +
+ Text-guided diffusion models have revolutionized generative tasks by +producing high-fidelity content from text descriptions. They have also enabled +an editing paradigm where concepts can be replaced through text conditioning +(e.g., a dog to a tiger). In this work, we explore a novel approach: instead of +replacing a concept, can we enhance or suppress the concept itself? Through an +empirical study, we identify a trend where concepts can be decomposed in +text-guided diffusion models. Leveraging this insight, we introduce +ScalingConcept, a simple yet effective method to scale decomposed concepts up +or down in real input without introducing new elements. To systematically +evaluate our approach, we present the WeakConcept-10 dataset, where concepts +are imperfect and need to be enhanced. More importantly, ScalingConcept enables +a variety of novel zero-shot applications across image and audio domains, +including tasks such as canonical pose generation and generative sound +highlighting or removal. + +
+
+ comment: Project page: https://wikichao.github.io/ScalingConcept/ +
+
+
+
+
+ + ☆ Exploring Vision Language Models for Facial Attribute Recognition: + Emotion, Race, Gender, and Age + + +
+ Technologies for recognizing facial attributes like race, gender, age, and +emotion have several applications, such as surveillance, advertising content, +sentiment analysis, and the study of demographic trends and social behaviors. +Analyzing demographic characteristics from images and analyzing facial +expressions are challenging problems due to the complexity of human facial +attributes. Traditional approaches have employed CNNs and various other deep +learning techniques, trained on extensive collections of labeled images. While +these methods demonstrated effective performance, there remains potential for +further enhancements. In this paper, we propose to utilize vision language +models (VLMs) such as generative pre-trained transformer (GPT), GEMINI, large +language and vision assistant (LLAVA), PaliGemma, and Microsoft Florence2 to +recognize facial attributes such as race, gender, age, and emotion from images +with human faces. Various datasets like FairFace, AffectNet, and UTKFace have +been utilized to evaluate the solutions. The results show that VLMs are +competitive with, if not superior to, traditional techniques. Additionally, we +propose "FaceScanPaliGemma"--a fine-tuned PaliGemma model--for race, gender, +age, and emotion recognition. The results show an accuracy of 81.1%, 95.8%, 80%, +and 59.4% for race, gender, age group, and emotion classification, respectively, +outperforming the pre-trained version of PaliGemma, other VLMs, and SotA methods. +Finally, we propose "FaceScanGPT", which is a GPT-4o model to recognize the +above attributes when several individuals are present in the image, using a +prompt engineered for a person with specific facial and/or physical attributes. +The results underscore the superior multitasking capability of FaceScanGPT to +detect the individual's attributes like haircut, clothing color, posture, +etc., using only a prompt to drive the detection and recognition tasks. + +
+
+ comment: 52 pages, 13 figures +
+
+
+
+
+ + ☆ HoloChrome: Polychromatic Illumination for Speckle Reduction in + Holographic Near-Eye Displays + + +
+ Holographic displays hold the promise of providing authentic depth cues, +resulting in enhanced immersive visual experiences for near-eye applications. +However, current holographic displays are hindered by speckle noise, which +limits accurate reproduction of color and texture in displayed images. We +present HoloChrome, a polychromatic holographic display framework designed to +mitigate these limitations. HoloChrome utilizes an ultrafast, +wavelength-adjustable laser and a dual-Spatial Light Modulator (SLM) +architecture, enabling the multiplexing of a large set of discrete wavelengths +across the visible spectrum. By leveraging spatial separation in our dual-SLM +setup, we independently manipulate speckle patterns across multiple +wavelengths. This novel approach effectively reduces speckle noise through +incoherent averaging achieved by wavelength multiplexing. Our method is +complementary to existing speckle reduction techniques, offering a new pathway +to address this challenge. Furthermore, the use of polychromatic illumination +broadens the achievable color gamut compared to traditional three-color primary +holographic displays. + Our simulations and tabletop experiments validate that HoloChrome +significantly reduces speckle noise and expands the color gamut. These +advancements enhance the performance of holographic near-eye displays, moving +us closer to practical, immersive next-generation visual experiences. + +
+
+
+
+
+ + ☆ COSNet: A Novel Semantic Segmentation Network using Enhanced Boundaries + in Cluttered Scenes WACV 2025 + + +
+ Automated waste recycling aims to efficiently separate the recyclable objects +from the waste by employing vision-based systems. However, the presence of +objects of varying shapes and material types makes it a challenging +problem, especially in cluttered environments. Existing segmentation methods +perform reasonably on many semantic segmentation datasets by employing +multi-contextual representations; however, their performance degrades when +utilized for waste object segmentation in cluttered scenarios. In addition, +plastic objects further increase the complexity of the problem due to their +translucent nature. To address these limitations, we introduce an efficacious +segmentation network, named COSNet, that uses boundary cues along with +multi-contextual information to accurately segment the objects in cluttered +scenes. COSNet introduces novel components including a feature sharpening block +(FSB) and a boundary enhancement module (BEM) for enhancing the features and +highlighting the boundary information of irregular waste objects in cluttered +environments. Extensive experiments on three challenging datasets including +ZeroWaste-f, SpectralWaste, and ADE20K demonstrate the effectiveness of the +proposed method. Our COSNet achieves significant gains of 1.8% on the ZeroWaste-f +and 2.1% on the SpectralWaste datasets, respectively, in terms of the mIoU metric. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ☆ AIDOVECL: AI-generated Dataset of Outpainted Vehicles for Eye-level + Classification and Localization + + +
+ Image labeling is a critical bottleneck in the development of computer vision +technologies, often constraining the potential of machine learning models due +to the time-intensive nature of manual annotations. This work introduces a +novel approach that leverages outpainting to address the problem of annotated +data scarcity by generating artificial contexts and annotations, significantly +reducing manual labeling efforts. We apply this technique to a particularly +acute challenge in autonomous driving, urban planning, and environmental +monitoring: the lack of diverse, eye-level vehicle images in desired classes. +Our dataset comprises AI-generated vehicle images obtained by detecting and +cropping vehicles from manually selected seed images, which are then outpainted +onto larger canvases to simulate varied real-world conditions. The outpainted +images include detailed annotations, providing high-quality ground truth data. +Advanced outpainting techniques and image quality assessments ensure visual +fidelity and contextual relevance. Augmentation with outpainted vehicles +improves overall performance metrics by up to 8\% and enhances prediction of +underrepresented classes by up to 20\%. This approach, exemplifying outpainting +as a self-annotating paradigm, presents a solution that enhances dataset +versatility across multiple domains of machine learning. The code and links to +datasets used in this study are available for further research and replication +at https://github.com/amir-kazemi/aidovecl. + +
+
+ comment: 19 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Nearest Neighbor Normalization Improves Multimodal Retrieval + + +
+ Multimodal models leverage large-scale pre-training to achieve strong but +still imperfect performance on tasks such as image captioning, visual question +answering, and cross-modal retrieval. In this paper, we present a simple and +efficient method for correcting errors in trained contrastive image-text +retrieval models with no additional training, called Nearest Neighbor +Normalization (NNN). We show an improvement on retrieval metrics in both text +retrieval and image retrieval for all of the contrastive models that we tested +(CLIP, BLIP, ALBEF, SigLIP, BEiT) and for both of the datasets that we used +(MS-COCO and Flickr30k). NNN requires a reference database, but does not +require any training on this database, and can even increase the retrieval +accuracy of a model after finetuning. + +
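The correction NNN applies is training-free: each query-candidate score is adjusted by a bias estimated from a reference query set. The sketch below reflects one plausible reading of the idea, subtracting a scaled mean of each gallery item's top-k similarities to reference queries; the exact formulation, and the `k` and `alpha` values here, are assumptions, so consult the paper before relying on them.

```python
import numpy as np

def nnn_scores(query_emb, gallery_emb, reference_emb, k=16, alpha=0.75):
    """Debias retrieval scores: estimate a per-gallery-item 'hubness' bias
    as the mean of its k highest similarities to a reference query set,
    then subtract alpha * bias from the raw scores. All embeddings are
    assumed L2-normalized, so dot products are cosine similarities."""
    raw = query_emb @ gallery_emb.T          # (Q, G) raw scores
    ref = reference_emb @ gallery_emb.T      # (R, G) reference scores
    topk = np.sort(ref, axis=0)[-k:]         # k largest per gallery item
    bias = topk.mean(axis=0)                 # (G,)
    return raw - alpha * bias
```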
+
+
+
+
+ + ☆ Parameter choices in HaarPSI for IQA with medical images + + +
+ When developing machine learning models, image quality assessment (IQA) +measures are a crucial component for evaluation. However, commonly used IQA +measures have been primarily developed and optimized for natural images. In +many specialized settings, such as medical images, this poses an +often-overlooked problem regarding suitability. In previous studies, the IQA +measure HaarPSI showed promising behavior for natural and medical images. +HaarPSI is based on Haar wavelet representations and the framework allows +optimization of two parameters. So far, these parameters have been aligned for +natural images. Here, we optimize these parameters for two annotated medical +data sets, a photoacoustic and a chest X-Ray data set. We observe that they are +more sensitive to the parameter choices than the employed natural images, and +on the other hand both medical data sets lead to similar parameter values when +optimized. We denote the optimized setting, which improves the performance for +the medical images notably, by HaarPSI$_{MED}$. The results suggest that +adapting common IQA measures within their frameworks for medical images can +provide a valuable, generalizable addition to the employment of more specific +task-based measures. + +
+
+ comment: 5 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Identifying Spatio-Temporal Drivers of Extreme Events NeurIPS 2024 + + +
+ The spatio-temporal relations of impacts of extreme events and their drivers +in climate data are not fully understood, and there is a need for machine +learning approaches to identify such spatio-temporal relations from data. The +task, however, is very challenging since there are time delays between extremes +and their drivers, and the spatial response of such drivers is inhomogeneous. +In this work, we propose a first approach and benchmarks to tackle this +challenge. Our approach is trained end-to-end to jointly predict spatio-temporal +extremes and spatio-temporal drivers in the physical input variables. +By forcing the network to predict extremes from spatio-temporal binary masks +of identified drivers, the network successfully identifies drivers that are +correlated with extremes. We evaluate our approach on three newly created +synthetic benchmarks, two of which are based on remote sensing or +reanalysis climate data, and on two real-world reanalysis datasets. The source +code and datasets are publicly available at the project page +https://hakamshams.github.io/IDE. + +
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ Understanding Generalizability of Diffusion Models Requires Rethinking + the Hidden Gaussian Structure + + +
+ In this work, we study the generalizability of diffusion models by looking +into the hidden properties of the learned score functions, which are +essentially a series of deep denoisers trained on various noise levels. We +observe that as diffusion models transition from memorization to +generalization, their corresponding nonlinear diffusion denoisers exhibit +increasing linearity. This discovery leads us to investigate the linear +counterparts of the nonlinear diffusion models, which are a series of linear +models trained to match the function mappings of the nonlinear diffusion +denoisers. Surprisingly, these linear denoisers are approximately the optimal +denoisers for a multivariate Gaussian distribution characterized by the +empirical mean and covariance of the training dataset. This finding implies +that diffusion models have the inductive bias towards capturing and utilizing +the Gaussian structure (covariance information) of the training dataset for +data generation. We empirically demonstrate that this inductive bias is a +unique property of diffusion models in the generalization regime, which becomes +increasingly evident when the model's capacity is relatively small compared to +the training dataset size. In the case that the model is highly +overparameterized, this inductive bias emerges during the initial training +phases before the model fully memorizes its training data. Our study provides +crucial insights into understanding the notable strong generalization +phenomenon recently observed in real-world diffusion models. + +
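The "hidden Gaussian structure" claim is easy to make concrete: if the data really were Gaussian, the optimal (posterior-mean) denoiser would be linear with a closed form, and this is the object the paper says the learned linear denoisers approximate. A small sketch using an empirical mean and covariance:

```python
import numpy as np

def optimal_gaussian_denoiser(Y, mu, Sigma, sigma_noise):
    """Posterior mean E[x | y] for x ~ N(mu, Sigma) and y = x + N(0, s^2 I):
    x_hat = mu + Sigma (Sigma + s^2 I)^{-1} (y - mu). Y is an (n, d) batch."""
    d = mu.shape[0]
    W = Sigma @ np.linalg.inv(Sigma + sigma_noise**2 * np.eye(d))
    return mu + (Y - mu) @ W.T

# mu and Sigma estimated from the training set X (shape (n, d)):
# mu = X.mean(axis=0); Sigma = np.cov(X, rowvar=False)
```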
+
+
+
+
+ + ☆ Advanced Predictive Quality Assessment for Ultrasonic Additive + Manufacturing with Deep Learning Model + + +
+ Ultrasonic Additive Manufacturing (UAM) employs ultrasonic welding to bond +similar or dissimilar metal foils to a substrate, resulting in solid, +consolidated metal components. However, certain processing conditions can lead +to inter-layer defects, affecting the final product's quality. This study +develops a method to monitor in-process quality using deep learning-based +convolutional neural networks (CNNs). The CNN models were evaluated on their +ability to classify samples with and without embedded thermocouples across five +power levels (300W, 600W, 900W, 1200W, 1500W) using thermal images with +supervised labeling. Four distinct CNN classification models were created for +different scenarios including without (baseline) and with thermocouples, only +without thermocouples across power levels, only with thermocouples across power +levels, and combined without and with thermocouples across power levels. The +models achieved 98.29% accuracy on combined baseline and thermocouple images, +97.10% for baseline images across power levels, 97.43% for thermocouple images, +and 97.27% for both types across power levels. The high accuracy, above 97%, +demonstrates the system's effectiveness in identifying and classifying +conditions within the UAM process, providing a reliable tool for quality +assurance and process control in manufacturing environments. + +
+
+
+
+
+ + ☆ Deep Learning with HM-VGG: AI Strategies for Multi-modal Image Analysis + + +
+ This study introduces the Hybrid Multi-modal VGG (HM-VGG) model, a +cutting-edge deep learning approach for the early diagnosis of glaucoma. The +HM-VGG model utilizes an attention mechanism to process Visual Field (VF) data, +enabling the extraction of key features that are vital for identifying early +signs of glaucoma. Despite the common reliance on large annotated datasets, the +HM-VGG model excels in scenarios with limited data, achieving remarkable +results with small sample sizes. The model's performance is underscored by its +high metrics in Precision, Accuracy, and F1-Score, indicating its potential for +real-world application in glaucoma detection. The paper also discusses the +challenges associated with ophthalmic image analysis, particularly the +difficulty of obtaining large volumes of annotated data. It highlights the +importance of moving beyond single-modality data, such as VF or Optical +Coherence Tomography (OCT) images alone, to a multimodal approach that can +provide a richer, more comprehensive dataset. This integration of different +data types is shown to significantly enhance diagnostic accuracy. The HM-VGG +model offers a promising tool for doctors, streamlining the diagnostic process +and improving patient outcomes. Furthermore, its applicability extends to +telemedicine and mobile healthcare, making diagnostic services more accessible. +The research presented in this paper is a significant step forward in the field +of medical image processing and has profound implications for clinical +ophthalmology. + +
+
+
+
+
+ + ☆ TPC: Test-time Procrustes Calibration for Diffusion-based Human Image + Animation NeurIPS 2024 + + +
+ Human image animation aims to generate a human motion video from the inputs +of a reference human image and a target motion video. Current diffusion-based +image animation systems exhibit high precision in transferring human identity +into targeted motion, yet they still exhibit irregular quality in their +outputs. Their optimal precision is achieved only when the physical +compositions (i.e., scale and rotation) of the human shapes in the reference +image and target pose frame are aligned. In the absence of such alignment, +there is a noticeable decline in fidelity and consistency. Especially, in +real-world environments, this compositional misalignment commonly occurs, +posing significant challenges to the practical usage of current systems. To +this end, we propose Test-time Procrustes Calibration (TPC), which enhances the +robustness of diffusion-based image animation systems by maintaining optimal +performance even when faced with compositional misalignment, effectively +addressing real-world scenarios. The TPC provides a calibrated reference image +for the diffusion model, enhancing its capability to understand the +correspondence between human shapes in the reference and target images. Our +method is simple and can be applied to any diffusion-based image animation +system in a model-agnostic manner, improving the effectiveness at test time +without additional training. + +
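The Procrustes calibration that gives TPC its name is a classical closed-form alignment of two point sets in scale and rotation. Below is a generic 2D similarity Procrustes fit between matched keypoints (e.g., detected body joints); it illustrates the underlying math rather than the authors' exact test-time procedure, and it does not guard against reflections.

```python
import numpy as np

def procrustes_align(src, dst):
    """Find scale s, rotation R, and translation t minimizing
    ||s * (R @ src_i) + t - dst_i||^2 over matched 2D points (N, 2)."""
    src_c = src - src.mean(axis=0)
    dst_c = dst - dst.mean(axis=0)
    U, S, Vt = np.linalg.svd(dst_c.T @ src_c)
    R = U @ Vt                           # optimal rotation (up to reflection)
    s = S.sum() / (src_c ** 2).sum()     # optimal isotropic scale
    t = dst.mean(axis=0) - s * (R @ src.mean(axis=0))
    return s, R, t
```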
+
+ comment: 24 pages, 16 figures, NeurIPS 2024 +
+
+
+
+
+ + ☆ Handwriting Recognition in Historical Documents with Multimodal LLM + + +
+ There is an immense quantity of historical and cultural documentation that +exists only as handwritten manuscripts. At the same time, performing OCR across +scripts and different handwriting styles has proven to be an enormously +difficult problem relative to the process of digitizing print. While recent +Transformer-based models have achieved relatively strong performance, they rely +heavily on manually transcribed training data and have difficulty generalizing +across writers. Multimodal LLMs, such as GPT-4V and Gemini, have demonstrated +effectiveness in performing OCR and computer vision tasks with few-shot +prompting. In this paper, I evaluate the accuracy of handwritten document +transcriptions generated by Gemini against the current state-of-the-art +Transformer-based methods. + Keywords: Optical Character Recognition, Multimodal Language Models, Cultural +Preservation, Mass digitization, Handwriting Recognition + +
+
+
+
+
+ + ☆ A Multi-Modal Approach for Face Anti-Spoofing in Non-Calibrated Systems + using Disparity Maps + + +
+ Face recognition technologies are increasingly used in various applications, +yet they are vulnerable to face spoofing attacks. These spoofing attacks often +involve unique 3D structures, such as printed papers or mobile device screens. +Although stereo-depth cameras can detect such attacks effectively, their +high cost limits their widespread adoption. Conversely, two-sensor systems +without extrinsic calibration offer a cost-effective alternative but are unable +to calculate depth using stereo techniques. In this work, we propose a method +to overcome this challenge by leveraging facial attributes to derive disparity +information and estimate relative depth for anti-spoofing purposes, using +non-calibrated systems. We introduce a multi-modal anti-spoofing model, coined +Disparity Model, that incorporates created disparity maps as a third modality +alongside the two original sensor modalities. We demonstrate the effectiveness +of the Disparity Model in countering various spoof attacks using a +comprehensive dataset collected from the Intel RealSense ID Solution F455. Our +method outperformed existing methods in the literature, achieving an Equal +Error Rate (EER) of 1.71% and a False Negative Rate (FNR) of 2.77% at a False +Positive Rate (FPR) of 1%. These errors are lower by 2.45% and 7.94% than the +errors of the best comparison method, respectively. Additionally, we introduce +a model ensemble that addresses 3D spoof attacks as well, achieving an EER of +2.04% and an FNR of 3.83% at an FPR of 1%. Overall, our work provides a +state-of-the-art solution for the challenging task of anti-spoofing in +non-calibrated systems that lack depth information. + +
+
+
+
+
+ + ☆ Bayesian-guided Label Mapping for Visual Reprogramming + + +
+ Visual reprogramming (VR) leverages the intrinsic capabilities of pretrained +vision models by adapting their input or output interfaces to solve downstream +tasks whose labels (i.e., downstream labels) might be totally different from +the labels associated with the pretrained models (i.e., pretrained labels). +When adapting the output interface, label mapping methods transform the +pretrained labels to downstream labels by establishing a gradient-free +one-to-one correspondence between the two sets of labels. However, in this +paper, we reveal that one-to-one mappings may overlook the complex relationship +between pretrained and downstream labels. Motivated by this observation, we +propose a Bayesian-guided Label Mapping (BLM) method. BLM constructs an +iteratively-updated probabilistic label mapping matrix, with each element +quantifying a pairwise relationship between pretrained and downstream labels. +The assignment of values to the constructed matrix is guided by Bayesian +conditional probability, considering the joint distribution of the downstream +labels and the labels predicted by the pretrained model on downstream samples. +Experiments conducted on both pretrained vision models (e.g., ResNeXt) and +vision-language models (e.g., CLIP) demonstrate the superior performance of BLM +over existing label mapping methods. The success of BLM also offers a +probabilistic lens through which to understand and analyze the effectiveness of +VR. Our code is available at https://github.com/tmlr-group/BayesianLM. + +
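A single step of the probabilistic mapping BLM builds can be written down directly: estimate the joint distribution of pretrained-model predictions and downstream labels on downstream samples, then map via the conditional probability. The sketch below is this one-step version only; the iterative updates the paper performs are omitted.

```python
import numpy as np

def bayesian_label_map(pre_preds, down_labels, n_pre, n_down, eps=1e-8):
    """Probabilistic label mapping P(downstream | pretrained) from joint
    counts of pretrained predictions and downstream ground-truth labels."""
    joint = np.zeros((n_pre, n_down))
    for p, y in zip(pre_preds, down_labels):
        joint[p, y] += 1
    return joint / (joint.sum(axis=1, keepdims=True) + eps)

# Downstream class probabilities for a new sample (soft, many-to-many):
# probs_down = probs_pretrained @ mapping
```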
+
+
+
+
+ + ☆ Unveiling Synthetic Faces: How Synthetic Datasets Can Expose Real + Identities NeurIPS 2024 + + +
+ Synthetic data generation is gaining increasing popularity in different +computer vision applications. Existing state-of-the-art face recognition models +are trained using large-scale face datasets, which are crawled from the +Internet and raise privacy and ethical concerns. To address such concerns, +several works have proposed generating synthetic face datasets to train face +recognition models. However, these methods depend on generative models, which +are trained on real face images. In this work, we design a simple yet effective +membership inference attack to systematically study if any of the existing +synthetic face recognition datasets leak any information from the real data +used to train the generator model. We provide an extensive study on 6 +state-of-the-art synthetic face recognition datasets, and show that in all +these synthetic datasets, several samples from the original real dataset are +leaked. To our knowledge, this paper is the first work which shows the leakage +from training data of generator models into the generated synthetic face +recognition datasets. Our study demonstrates privacy pitfalls in synthetic face +recognition datasets and paves the way for future studies on generating +responsible synthetic face datasets. + +
+
+ comment: Accepted in NeurIPS 2024 Workshop on New Frontiers in Adversarial + Machine Learning +
+
+
+
+
+ + ☆ Re-assembling the past: The RePAIR dataset and benchmark for real world + 2D and 3D puzzle solving NeurIPS 2024 + + +
+ This paper proposes the RePAIR dataset that represents a challenging +benchmark to test modern computational and data driven methods for +puzzle-solving and reassembly tasks. Our dataset has unique properties that are +uncommon to current benchmarks for 2D and 3D puzzle solving. The fragments and +fractures are realistic, caused by a collapse of a fresco during a World War II +bombing at the Pompeii archaeological park. The fragments are also eroded and +have missing pieces with irregular shapes and different dimensions, challenging +further the reassembly algorithms. The dataset is multi-modal providing high +resolution images with characteristic pictorial elements, detailed 3D scans of +the fragments and meta-data annotated by the archaeologists. Ground truth has +been generated through several years of unceasing fieldwork, including the +excavation and cleaning of each fragment, followed by manual puzzle solving by +archaeologists of a subset of approx. 1000 pieces among the 16000 available. +After digitizing all the fragments in 3D, a benchmark was prepared to challenge +current reassembly and puzzle-solving methods that often solve more simplistic +synthetic scenarios. The tested baselines show that there clearly exists a gap +to fill in solving this computationally complex problem. + +
+
+ comment: NeurIPS 2024, Track Datasets and Benchmarks, 10 pages +
+
+
+
+
+ + ☆ DiffPAD: Denoising Diffusion-based Adversarial Patch Decontamination WACV + + +
+ In the ever-evolving adversarial machine learning landscape, developing +effective defenses against patch attacks has become a critical challenge, +necessitating reliable solutions to safeguard real-world AI systems. Although +diffusion models have shown remarkable capacity in image synthesis and have +been recently utilized to counter $\ell_p$-norm bounded attacks, their +potential in mitigating localized patch attacks remains largely underexplored. +In this work, we propose DiffPAD, a novel framework that harnesses the power of +diffusion models for adversarial patch decontamination. DiffPAD first performs +super-resolution restoration on downsampled input images, then adopts +binarization, a dynamic thresholding scheme, and a sliding window for effective +localization of adversarial patches. Such a design is inspired by the +theoretically derived correlation between patch size and diffusion restoration +error that is generalized across diverse patch attack scenarios. Finally, +DiffPAD applies inpainting techniques to the original input images with the +estimated patch region being masked. By integrating closed-form solutions for +super-resolution restoration and image inpainting into the conditional reverse +sampling process of a pre-trained diffusion model, DiffPAD obviates the need +for text guidance or fine-tuning. Through comprehensive experiments, we +demonstrate that DiffPAD not only achieves state-of-the-art adversarial +robustness against patch attacks but also excels in recovering naturalistic +images without patch remnants. + +
+
+ comment: Accepted to 2025 IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV) +
+
+
+
+
+ + ☆ Assessing the Efficacy of Classical and Deep Neuroimaging Biomarkers in + Early Alzheimer's Disease Diagnosis SP + + +
+ Alzheimer's disease (AD) is the leading cause of dementia, and its early +detection is crucial for effective intervention, yet current diagnostic methods +often fall short in sensitivity and specificity. This study aims to detect +significant indicators of early AD by extracting and integrating various +imaging biomarkers, including radiomics, hippocampal texture descriptors, +cortical thickness measurements, and deep learning features. We analyze +structural magnetic resonance imaging (MRI) scans from the Alzheimer's Disease +Neuroimaging Initiative (ADNI) cohorts, utilizing comprehensive image analysis +and machine learning techniques. Our results show that combining multiple +biomarkers significantly improves detection accuracy. Radiomics and texture +features emerged as the most effective predictors for early AD, achieving AUCs +of 0.88 and 0.72 for AD and MCI detection, respectively. Although deep learning +features proved to be less effective than traditional approaches, incorporating +age with other biomarkers notably enhanced MCI detection performance. +Additionally, our findings emphasize the continued importance of classical +imaging biomarkers in the face of modern deep-learning approaches, providing a +robust framework for early AD diagnosis. + +
+
+ comment: SPIE Medical Imaging (MI25) +
+
+
+
+
+ + ☆ ImOV3D: Learning Open-Vocabulary Point Clouds 3D Object Detection from + Only 2D Images NeurIPS 2024 + + +
+ Open-vocabulary 3D object detection (OV-3Det) aims to generalize beyond the +limited number of base categories labeled during the training phase. The +biggest bottleneck is the scarcity of annotated 3D data, whereas 2D image +datasets are abundant and richly annotated. Consequently, it is intuitive to +leverage the wealth of annotations in 2D images to alleviate the inherent data +scarcity in OV-3Det. In this paper, we push the task setup to its limits by +exploring the potential of using solely 2D images to learn OV-3Det. The major +challenge for this setup is the modality gap between training images and +testing point clouds, which prevents effective integration of 2D knowledge into +OV-3Det. To address this challenge, we propose a novel framework, ImOV3D, to +leverage pseudo multimodal representation containing both images and point +clouds (PC) to close the modality gap. The key to ImOV3D lies in flexible +modality conversion where 2D images can be lifted into 3D using monocular depth +estimation and can also be derived from 3D scenes through rendering. This +allows unifying both training images and testing point clouds into a common +image-PC representation, encompassing a wealth of 2D semantic information and +also incorporating the depth and structural characteristics of 3D spatial data. +We carefully conduct such conversion to minimize the domain gap between +training and test cases. Extensive experiments on two benchmark datasets, +SUNRGBD and ScanNet, show that ImOV3D significantly outperforms existing +methods, even in the absence of ground truth 3D training data. With the +inclusion of a minimal amount of real 3D data for fine-tuning, the performance +also significantly surpasses the previous state-of-the-art. Codes and pre-trained +models are released at https://github.com/yangtiming/ImOV3D. + +
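The 2D-to-3D lifting that ImOV3D depends on is, at its core, pinhole unprojection of a monocular depth map. A minimal version is shown below; `fx`, `fy`, `cx`, `cy` are camera intrinsics, and the real pipeline adds the reverse rendering direction plus the domain-gap handling that this sketch omits.

```python
import numpy as np

def depth_to_pointcloud(depth, fx, fy, cx, cy):
    """Unproject an (H, W) metric depth map into an (H*W, 3) point cloud:
    X = (u - cx) * z / fx, Y = (v - cy) * z / fy, Z = z."""
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))  # pixel coordinates
    x = (u - cx) * depth / fx
    y = (v - cy) * depth / fy
    return np.stack([x, y, depth], axis=-1).reshape(-1, 3)
```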
+
+ comment: Accepted by NeurIPS 2024. Code link + https://github.com/yangtiming/ImOV3D +
+
+
+
+
+ + ☆ Localization, balance and affinity: a stronger multifaceted + collaborative salient object detector in remote sensing images + + +
+ Despite significant advancements in salient object detection (SOD) in optical +remote sensing images (ORSIs), challenges persist due to the intricate edge +structures of ORSIs and the complexity of their contextual relationships. +Current deep learning approaches encounter difficulties in accurately +identifying boundary features and lack efficiency in collaboratively modeling +the foreground and background by leveraging contextual features. To address +these challenges, we propose a stronger multifaceted collaborative salient +object detector in ORSIs, termed LBA-MCNet, which incorporates aspects of +localization, balance, and affinity. The network focuses on accurately locating +targets, balancing detailed features, and modeling image-level global context +information. Specifically, we design the Edge Feature Adaptive Balancing and +Adjusting (EFABA) module for precise edge localization, using edge features to +guide attention to boundaries and preserve spatial details. Moreover, we design +the Global Distributed Affinity Learning (GDAL) module to model global context. +It captures global context by generating an affinity map from the encoder's +final layer, ensuring effective modeling of global patterns. Additionally, deep +supervision during deconvolution further enhances feature representation. +Finally, we compare with 28 state-of-the-art approaches on three publicly +available datasets. The results clearly demonstrate the superiority of our +method. + +
+
+
+
+
+ + ☆ JEMA: A Joint Embedding Framework for Scalable Co-Learning with + Multimodal Alignment + + +
+ This work introduces JEMA (Joint Embedding with Multimodal Alignment), a +novel co-learning framework tailored for laser metal deposition (LMD), a +pivotal process in metal additive manufacturing. As Industry 5.0 gains traction +in industrial applications, efficient process monitoring becomes increasingly +crucial. However, limited data and the opaque nature of AI present challenges +for its application in an industrial setting. JEMA addresses these challenges by +leveraging multimodal data, including multi-view images and metadata such as +process parameters, to learn transferable semantic representations. By applying +a supervised contrastive loss function, JEMA enables robust learning and +subsequent process monitoring using only the primary modality, simplifying +hardware requirements and computational overhead. We investigate the +effectiveness of JEMA in LMD process monitoring, focusing specifically on its +generalization to downstream tasks such as melt pool geometry prediction, +achieved without extensive fine-tuning. Our empirical evaluation demonstrates +the high scalability and performance of JEMA, particularly when combined with +Vision Transformer models. We report an 8% increase in performance in +multimodal settings and a 1% improvement in unimodal settings compared to +supervised contrastive learning. Additionally, the learned embedding +representation enables the prediction of metadata, enhancing interpretability +and making possible the assessment of the added metadata's contributions. Our +framework lays the foundation for integrating multisensor data with metadata, +enabling diverse downstream tasks within the LMD domain and beyond. + +
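JEMA is trained with a supervised contrastive objective over the learned embeddings. As a reference point, a standard supervised contrastive loss (in the spirit of Khosla et al.) is sketched below; JEMA's multimodal variant additionally aligns embeddings across modalities, which this single-view sketch does not capture.

```python
import torch
import torch.nn.functional as F

def supcon_loss(features, labels, tau=0.1):
    """Supervised contrastive loss: each anchor is pulled toward all other
    samples sharing its label and pushed away from the rest.
    features: (N, D) embeddings; labels: (N,) integer class ids."""
    z = F.normalize(features, dim=1)
    sim = z @ z.T / tau
    diag = torch.eye(z.size(0), dtype=torch.bool, device=z.device)
    sim = sim.masked_fill(diag, float('-inf'))   # exclude self-pairs
    pos = ((labels[:, None] == labels[None, :]) & ~diag).float()
    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
    return -((log_prob * pos).sum(1) / pos.sum(1).clamp(min=1)).mean()
```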
+
+ comment: 26 pages, 14 figures +
+
+
+
+
+ + ☆ TrAct: Making First-layer Pre-Activations Trainable NeurIPS 2024 + + +
+ We consider the training of the first layer of vision models and notice the +clear relationship between pixel values and gradient update magnitudes: the +gradients arriving at the weights of a first layer are by definition directly +proportional to (normalized) input pixel values. Thus, an image with low +contrast has a smaller impact on learning than an image with higher contrast, +and a very bright or very dark image has a stronger impact on the weights than +an image with moderate brightness. In this work, we propose performing gradient +descent on the embeddings produced by the first layer of the model. However, +switching to discrete inputs with an embedding layer is not a reasonable option +for vision models. Thus, we propose the conceptual procedure of (i) a gradient +descent step on first layer activations to construct an activation proposal, +and (ii) finding the optimal weights of the first layer, i.e., those weights +which minimize the squared distance to the activation proposal. We provide a +closed form solution of the procedure and adjust it for robust stochastic +training while computing everything efficiently. Empirically, we find that +TrAct (Training Activations) speeds up training by factors between 1.25x and 4x +while requiring only a small computational overhead. We demonstrate the utility +of TrAct with different optimizers for a range of different vision models +including convolutional and transformer architectures. + +
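The abstract describes the two-step procedure quite precisely: take a gradient step on the first-layer activations, then choose the weights that best reproduce that proposal in least squares. The numpy sketch below follows this description literally for a linear first layer; the ridge term `lam` is an addition for numerical stability, and the paper derives a more efficient closed form suited to stochastic training.

```python
import numpy as np

def tract_step(X, W, grad_A, lr=0.1, lam=1e-4):
    """One TrAct-style first-layer update (a sketch of the described
    procedure). X: (N, d_in) inputs; W: (d_in, d_out) weights;
    grad_A: dLoss/dA at the current activations A = X @ W."""
    A_star = X @ W - lr * grad_A                 # (i) activation proposal
    # (ii) weights closest to reproducing A*, via ridge least squares:
    #      argmin_W ||X W - A*||^2 + lam ||W||^2
    G = X.T @ X + lam * np.eye(X.shape[1])
    return np.linalg.solve(G, X.T @ A_star)
```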
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ☆ Image Synthesis with Class-Aware Semantic Diffusion Models for Surgical + Scene Segmentation + + +
+ Surgical scene segmentation is essential for enhancing surgical precision, +yet it is frequently compromised by the scarcity and imbalance of available +data. To address these challenges, semantic image synthesis methods based on +generative adversarial networks and diffusion models have been developed. +However, these models often yield non-diverse images and fail to capture small, +critical tissue classes, limiting their effectiveness. In response, we propose +the Class-Aware Semantic Diffusion Model (CASDM), a novel approach which +utilizes segmentation maps as conditions for image synthesis to tackle data +scarcity and imbalance. Novel class-aware mean squared error and class-aware +self-perceptual loss functions have been defined to prioritize critical, less +visible classes, thereby enhancing image quality and relevance. Furthermore, to +our knowledge, we are the first to generate multi-class segmentation maps using +text prompts in a novel fashion to specify their contents. These maps are then +used by CASDM to generate surgical scene images, enhancing datasets for +training and validating segmentation models. Our evaluation, which assesses +both image quality and downstream segmentation performance, demonstrates the +strong effectiveness and generalisability of CASDM in producing realistic +image-map pairs, significantly advancing surgical scene segmentation across +diverse and challenging datasets. + +
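The class-aware MSE mentioned above can be pictured as a per-pixel reweighting of the reconstruction error by class rarity. The sketch below is one plausible instantiation; the inverse-frequency weighting and the normalization are assumptions, not the paper's exact definition.

```python
import torch

def class_aware_mse(pred, target, seg_map, class_freq, eps=1e-6):
    """MSE where each pixel's squared error is scaled inversely to the
    training-set frequency of its semantic class, emphasizing small, rare
    tissue classes. pred, target: (B, C, H, W); seg_map: (B, H, W) class
    ids; class_freq: (num_classes,) tensor of pixel frequencies."""
    weights = 1.0 / (class_freq[seg_map] + eps)   # (B, H, W)
    weights = weights / weights.mean()            # keep the loss scale stable
    sq_err = ((pred - target) ** 2).mean(dim=1)   # average over channels
    return (weights * sq_err).mean()
```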
+
+
+
+
+ + ☆ MV-CC: Mask Enhanced Video Model for Remote Sensing Change Caption + + +
+ Remote sensing image change caption (RSICC) aims to provide natural language +descriptions for bi-temporal remote sensing images. Since Change Caption (CC) +task requires both spatial and temporal features, previous works follow an +encoder-fusion-decoder architecture. They use an image encoder to extract +spatial features and the fusion module to integrate spatial features and +extract temporal features, which leads to increasingly complex manual design of +the fusion module. In this paper, we introduce a novel video model-based +paradigm without design of the fusion module and propose a Mask-enhanced Video +model for Change Caption (MV-CC). Specifically, we use the off-the-shelf video +encoder to simultaneously extract the temporal and spatial features of +bi-temporal images. Furthermore, the types of changes in the CC are set based +on specific task requirements, and to enable the model to better focus on the +regions of interest, we employ masks obtained from the Change Detection (CD) +method to explicitly guide the CC model. Experimental results demonstrate that +our proposed method can obtain better performance compared with other +state-of-the-art RSICC methods. The code is available at +https://github.com/liuruixun/MV-CC. + +
+
+
+
+
+ + ☆ Manipulating Vehicle 3D Shapes through Latent Space Editing + + +
+ Although 3D object editing has the potential to significantly influence +various industries, recent research in 3D generation and editing has primarily +focused on converting text and images into 3D models, often overlooking the +need for fine-grained control over the editing of existing 3D objects. This +paper introduces a framework that employs a pre-trained regressor, enabling +continuous, precise, attribute-specific modifications to both the stylistic and +geometric attributes of vehicle 3D models. Our method not only preserves the +inherent identity of vehicle 3D objects, but also supports multi-attribute +editing, allowing for extensive customization without compromising the model's +structural integrity. Experimental results demonstrate the efficacy of our +approach in achieving detailed edits on various vehicle 3D models. + +
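+ A hedged sketch of regressor-guided latent editing: push the latent code
along the gradient of a pretrained attribute regressor. The linear stand-in
regressor and step sizes are illustrative assumptions, not the paper's exact
procedure:

```python
import torch

def edit_latent(z, regressor, direction=1.0, step=0.05, n_steps=20):
    """Move a latent code along the regressor's gradient so the predicted
    attribute increases (direction=+1) or decreases (direction=-1)."""
    z = z.clone().requires_grad_(True)
    for _ in range(n_steps):
        regressor(z).sum().backward()        # scalar attribute prediction
        with torch.no_grad():
            z += step * direction * z.grad   # continuous attribute edit
            z.grad.zero_()
    return z.detach()

# toy usage with a stand-in linear attribute regressor
regressor = torch.nn.Linear(128, 1)
z_edited = edit_latent(torch.randn(1, 128), regressor)
```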
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ BitStack: Fine-Grained Size Control for Compressed Large Language Models + in Variable Memory Environments + + +
+ Large language models (LLMs) have revolutionized numerous applications, yet
their deployment remains challenged by memory constraints on local devices.
While scaling laws have enhanced LLM capabilities, the primary bottleneck has
shifted from capability to availability, emphasizing the need for efficient
memory management. Traditional compression methods, such as quantization, often
require predefined compression ratios and separate compression processes for
each setting, complicating deployment in variable memory environments. In this
paper, we introduce BitStack, a novel, training-free weight compression
approach that enables megabyte-level trade-offs between memory usage and model
performance. By leveraging weight decomposition, BitStack can dynamically
adjust the model size with minimal transmission between running memory and
storage devices. Our approach iteratively decomposes weight matrices while
considering the significance of each parameter, resulting in an approximately
1-bit per parameter residual block in each decomposition iteration. These
blocks are sorted and stacked in storage as basic transmission units, with
different quantities loaded based on current memory availability. Extensive
experiments across a wide range of tasks demonstrate that, despite offering
fine-grained size control, BitStack consistently matches or surpasses strong
quantization baselines, particularly at extreme compression ratios. To the best
of our knowledge, this is the first decomposition-based method that effectively
bridges the gap to practical compression techniques like quantization. Code is
available at https://github.com/xinghaow99/BitStack.
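+ A toy sketch of iterative ~1-bit residual decomposition: each pass stores a
sign matrix plus a scale, and loading more blocks recovers more accuracy. The
paper's significance-aware sorting is omitted, and the single per-block scale
is an assumption:

```python
import torch

def residual_binarize(W, n_iters=4):
    """Decompose W into a stack of (scale, sign-matrix) residual blocks."""
    blocks, R = [], W.clone()
    for _ in range(n_iters):
        B = torch.sign(R)
        alpha = R.abs().mean()       # one scale per block (an assumption)
        blocks.append((alpha, B))
        R = R - alpha * B            # residual passed to the next iteration
    return blocks

def reconstruct(blocks, k):
    """Load only the first k blocks, trading accuracy for memory."""
    return sum(alpha * B for alpha, B in blocks[:k])

W = torch.randn(64, 64)
blocks = residual_binarize(W)
for k in (1, 2, 4):
    err = (W - reconstruct(blocks, k)).norm() / W.norm()
    print(f"{k} blocks loaded: relative error {err:.3f}")
```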
+
+
+
+
+ + ☆ Uncertainty Estimation for 3D Object Detection via Evidential Learning + + +
+ 3D object detection is an essential task for computer vision applications in
autonomous vehicles and robotics. However, models often struggle to quantify
detection reliability, leading to poor performance on unfamiliar scenes. We
introduce a framework for quantifying uncertainty in 3D object detection by
leveraging an evidential learning loss on Bird's Eye View representations in
the 3D detector. These uncertainty estimates require minimal computational
overhead and are generalizable across different architectures. We demonstrate
both the efficacy and importance of these uncertainty estimates in identifying
out-of-distribution scenes, poorly localized objects, and missing (false
negative) detections; our framework consistently improves over baselines by
10-20% on average. Finally, we integrate this suite of tasks into a system
where a 3D object detector auto-labels driving scenes, and our uncertainty
estimates verify label correctness before the labels are used to train a second
model. Here, our uncertainty-driven verification results in a 1% improvement in
mAP and a 1-2% improvement in NDS.
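+ For concreteness, a generic evidential (Dirichlet) loss in the style of
Sensoy et al.; how the paper attaches such a loss to BEV features inside the
detector is not reproduced here:

```python
import torch

def evidential_loss(logits, onehot):
    """Expected squared error under a Dirichlet over class probabilities.
    logits: (N, K) raw scores; onehot: (N, K) targets."""
    alpha = torch.relu(logits) + 1.0          # Dirichlet concentration
    S = alpha.sum(dim=1, keepdim=True)        # total evidence (strength)
    p = alpha / S                             # expected class probabilities
    err = ((onehot - p) ** 2).sum(dim=1)      # error term
    var = (p * (1 - p) / (S + 1)).sum(dim=1)  # variance term
    return (err + var).mean()

def predictive_uncertainty(logits):
    """Vacuity-style uncertainty: K divided by total Dirichlet strength."""
    alpha = torch.relu(logits) + 1.0
    return alpha.shape[1] / alpha.sum(dim=1)

logits = torch.randn(8, 3)
targets = torch.eye(3)[torch.randint(0, 3, (8,))]
print(evidential_loss(logits, targets), predictive_uncertainty(logits))
```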
+
+
+
+
+ + ☆ From Web Data to Real Fields: Low-Cost Unsupervised Domain Adaptation + for Agricultural Robots + + +
+ In precision agriculture, vision models often struggle with new, unseen +fields where crops and weeds have been influenced by external factors, +resulting in compositions and appearances that differ from the learned +distribution. This paper aims to adapt to specific fields at low cost using +Unsupervised Domain Adaptation (UDA). We explore a novel domain shift from a +diverse, large pool of internet-sourced data to a small set of data collected +by a robot at specific locations, minimizing the need for extensive on-field +data collection. Additionally, we introduce a novel module -- the Multi-level +Attention-based Adversarial Discriminator (MAAD) -- which can be integrated at +the feature extractor level of any detection model. In this study, we +incorporate MAAD with CenterNet to simultaneously detect leaf, stem, and vein +instances. Our results show significant performance improvements in the +unlabeled target domain compared to baseline models, with a 7.5% increase in +object detection accuracy and a 5.1% improvement in keypoint detection. + +
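+ Feature-level adversarial discriminators of this kind are commonly built on a
gradient reversal layer; the sketch below shows only that mechanism, not MAAD's
multi-level attention:

```python
import torch
import torch.nn as nn

class GradReverse(torch.autograd.Function):
    """Identity in the forward pass; negated, scaled gradient in the backward."""
    @staticmethod
    def forward(ctx, x, lam):
        ctx.lam = lam
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_out):
        return -ctx.lam * grad_out, None

class DomainDiscriminator(nn.Module):
    """Feature-level domain classifier trained adversarially via the GRL."""
    def __init__(self, dim, lam=0.1):
        super().__init__()
        self.lam = lam
        self.head = nn.Sequential(nn.Linear(dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, feats):
        return self.head(GradReverse.apply(feats, self.lam))

disc = DomainDiscriminator(256)
feats = torch.randn(4, 256, requires_grad=True)
domain_logit = disc(feats)   # train with BCE: source = 0, target = 1
```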
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Text-DiFuse: An Interactive Multi-Modal Image Fusion Framework based on + Text-modulated Diffusion Model NeurIPS 2024 + + +
+ Existing multi-modal image fusion methods fail to address the compound
degradations present in source images, resulting in fusion images plagued by
noise, color bias, improper exposure, etc. Additionally, these methods often
overlook the specificity of foreground objects, weakening the salience of the
objects of interest within the fused images. To address these challenges, this
study proposes a novel interactive multi-modal image fusion framework based on
a text-modulated diffusion model, called Text-DiFuse. First, this framework
integrates feature-level information integration into the diffusion process,
allowing adaptive degradation removal and multi-modal information fusion. This
is the first attempt to deeply and explicitly embed information fusion within
the diffusion process, effectively addressing compound degradation in image
fusion. Second, by embedding the combination of text and a zero-shot location
model into the diffusion fusion process, a text-controlled fusion re-modulation
strategy is developed. This enables user-customized text control to improve
fusion performance and highlight foreground objects in the fused images.
Extensive experiments on diverse public datasets show that our Text-DiFuse
achieves state-of-the-art fusion performance across various scenarios with
complex degradation. Moreover, semantic segmentation experiments validate the
significant enhancement in semantic performance achieved by our text-controlled
fusion re-modulation strategy. The code is publicly available at
https://github.com/Leiii-Cao/Text-DiFuse.
+
+ comment: Accepted by the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ EZ-HOI: VLM Adaptation via Guided Prompt Learning for Zero-Shot HOI + Detection NeurIPS 2024 + + +
+ Detecting Human-Object Interactions (HOI) in zero-shot settings, where models
must handle unseen classes, poses significant challenges. Existing methods that
rely on aligning visual encoders with large Vision-Language Models (VLMs) to
tap into their extensive knowledge require large, computationally expensive
models and encounter training difficulties. Adapting VLMs with prompt learning
offers an alternative to direct alignment. However, fine-tuning on
task-specific datasets often leads to overfitting to seen classes and
suboptimal performance on unseen classes, due to the absence of unseen class
labels. To address these challenges, we introduce a novel prompt learning-based
framework for Efficient Zero-Shot HOI detection (EZ-HOI). First, we introduce
Large Language Model (LLM) and VLM guidance for learnable prompts, integrating
detailed HOI descriptions and visual semantics to adapt VLMs to HOI tasks.
However, because training datasets contain seen-class labels alone, fine-tuning
VLMs on such datasets tends to optimize learnable prompts for seen classes
instead of unseen ones. Therefore, we design prompt learning for unseen classes
using information from related seen classes, with LLMs utilized to highlight
the differences between unseen and related seen classes. Quantitative
evaluations on benchmark datasets demonstrate that our EZ-HOI achieves
state-of-the-art performance across various zero-shot settings with only 10.35%
to 33.95% of the trainable parameters compared to existing methods. Code is
available at https://github.com/ChelsieLei/EZ-HOI.
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ AllClear: A Comprehensive Dataset and Benchmark for Cloud Removal in + Satellite Imagery NeurIPS 2024 + + +
+ Clouds in satellite imagery pose a significant challenge for downstream
applications. A major challenge in current cloud removal research is the
absence of a comprehensive benchmark and a sufficiently large and diverse
training dataset. To address this problem, we introduce the largest public
dataset for cloud removal, AllClear, featuring 23,742 globally distributed
regions of interest (ROIs) with diverse land-use patterns, comprising 4 million
images in total. Each ROI includes complete temporal captures from the year
2022, with (1) multi-spectral optical imagery from Sentinel-2 and Landsat 8/9,
(2) synthetic aperture radar (SAR) imagery from Sentinel-1, and (3) auxiliary
remote sensing products such as cloud masks and land cover maps. We validate
the effectiveness of our dataset by benchmarking performance, demonstrating the
scaling law -- the PSNR rises from 28.47 to 33.87 with 30x more data -- and
conducting ablation studies on the temporal length and the importance of
individual modalities. This dataset aims to provide comprehensive coverage of
the Earth's surface and promote better cloud removal results.
+
+ comment: Accepted at NeurIPS 2024 Datasets and Benchmarks Track. Code and data + available at https://allclear.cs.cornell.edu/ +
+
+
+
+
+ + ☆ Airway Labeling Meets Clinical Applications: Reflecting Topology + Consistency and Outliers via Learnable Attentions + + +
+ Accurate airway anatomical labeling is crucial for clinicians to identify and
navigate complex bronchial structures during bronchoscopy. Automatic airway
anatomical labeling is challenging due to significant individual variability
and anatomical variations. Previous methods are prone to generating
inconsistent predictions, which is harmful to preoperative planning and
intraoperative navigation. This paper aims to address these challenges by
proposing a novel method that enhances topological consistency and improves the
detection of abnormal airway branches.
 + We propose a novel approach incorporating two modules: the Soft Subtree
Consistency (SSC) and the Abnormal Branch Saliency (ABS). The SSC module
constructs a soft subtree to capture clinically relevant topological
relationships, allowing for flexible feature aggregation within and across
subtrees. The ABS module facilitates the interaction between node features and
prototypes to distinguish abnormal branches, preventing the erroneous
aggregation of features between normal and abnormal nodes.
 + Evaluated on a challenging dataset characterized by severe airway distortion
and atrophy, our method achieves superior performance compared to
state-of-the-art approaches. Specifically, it attains a 91.4% accuracy at the
segmental level and an 83.7% accuracy at the subsegmental level, representing a
1.4% increase in subsegmental accuracy and a 3.1% increase in topological
consistency. Notably, the method demonstrates reliable performance in cases
with disease-induced airway deformities, ensuring consistent and accurate
labeling.
+
+
+
+
+ + ☆ Stereo-Talker: Audio-driven 3D Human Synthesis with Prior-Guided + Mixture-of-Experts + + +
+ This paper introduces Stereo-Talker, a novel one-shot audio-driven human +video synthesis system that generates 3D talking videos with precise lip +synchronization, expressive body gestures, temporally consistent +photo-realistic quality, and continuous viewpoint control. The process follows +a two-stage approach. In the first stage, the system maps audio input to +high-fidelity motion sequences, encompassing upper-body gestures and facial +expressions. To enrich motion diversity and authenticity, large language model +(LLM) priors are integrated with text-aligned semantic audio features, +leveraging LLMs' cross-modal generalization power to enhance motion quality. In +the second stage, we improve diffusion-based video generation models by +incorporating a prior-guided Mixture-of-Experts (MoE) mechanism: a view-guided +MoE focuses on view-specific attributes, while a mask-guided MoE enhances +region-based rendering stability. Additionally, a mask prediction module is +devised to derive human masks from motion data, enhancing the stability and +accuracy of masks and enabling mask guiding during inference. We also introduce +a comprehensive human video dataset with 2,203 identities, covering diverse +body gestures and detailed annotations, facilitating broad generalization. The +code, data, and pre-trained models will be released for research purposes. + +
+
+
+
+
+ + ☆ Counterfactual MRI Data Augmentation using Conditional Denoising + Diffusion Generative Models + + +
+ Deep learning (DL) models in medical imaging face challenges in
generalizability and robustness due to variations in image acquisition
parameters (IAP). In this work, we introduce a novel method using conditional
denoising diffusion generative models (cDDGMs) to generate counterfactual
magnetic resonance (MR) images that simulate different IAP without altering
patient anatomy. We demonstrate that using these counterfactual images for data
augmentation can improve segmentation accuracy, particularly in
out-of-distribution settings, enhancing the overall generalizability and
robustness of DL models across diverse imaging conditions. Our approach shows
promise in addressing domain and covariate shifts in medical imaging. The code
is publicly available at
https://github.com/pedromorao/Counterfactual-MRI-Data-Augmentation
+
+
+
+
+ + ♻ ☆ RACCooN: A Versatile Instructional Video Editing Framework with + Auto-Generated Narratives + + +
+ Recent video generative models primarily rely on carefully written text
prompts for specific tasks, like inpainting or style editing. They require
labor-intensive textual descriptions of input videos, which hinders their
flexibility in adapting personal/raw videos to user specifications. This paper
proposes RACCooN, a versatile and user-friendly video-to-paragraph-to-video
generative framework that supports multiple video editing capabilities, such as
removal, addition, and modification, through a unified pipeline. RACCooN
consists of two principal stages: Video-to-Paragraph (V2P) and
Paragraph-to-Video (P2V). In the V2P stage, we automatically describe video
scenes in well-structured natural language, capturing both the holistic context
and focused object details. Subsequently, in the P2V stage, users can
optionally refine these descriptions to guide the video diffusion model,
enabling various modifications to the input video, such as removing or changing
subjects and/or adding new objects. The proposed approach stands out from
other methods through several significant contributions: (1) RACCooN proposes a
multi-granular spatiotemporal pooling strategy to generate well-structured
video descriptions, capturing both the broad context and object details without
requiring complex human annotations, and simplifying precise text-based video
content editing for users. (2) Our video generative model incorporates
auto-generated narratives or instructions to enhance the quality and accuracy
of the generated content. (3) RACCooN can also plan the insertion of new
objects into a given video, so users simply prompt the model to receive a
detailed editing plan for complex video edits. The proposed framework
demonstrates impressively versatile capabilities in video-to-paragraph
generation and video content editing, and can be incorporated into other SoTA
video generative models for further enhancement.
+
+ comment: The first two authors contribute equally. Project Page: + https://raccoon-mllm-gen.github.io/ +
+
+
+
+
+ + ♻ ☆ VILA$^2$: VILA Augmented VILA + + +
+ While visual language model architectures and training infrastructures
advance rapidly, data curation remains under-explored, with data quantity and
quality becoming a bottleneck. Existing work either crawls extra Internet data
with loose quality guarantees or distills from black-box proprietary models,
e.g., GPT-4V / Gemini, which are bounded by API rate limits and performance.
This work enables a VLM to improve itself via data enhancement, exploiting its
generative nature. We introduce a simple yet effective VLM augmentation scheme
that includes a self-augment step and a specialist-augment step to iteratively
improve data quality and, hence, model performance. In the self-augment step,
the instruction-finetuned VLM recaptions its pretraining caption datasets and
then retrains from scratch on the refined data. Without any expensive
human-in-the-loop annotation, we observe improvements in data quality and
downstream accuracy boosts with three self-augmentation rounds -- a viable free
lunch for the current VLM training recipe. When self-augmentation saturates, we
augment the caption diversity by leveraging specialty skills picked up from
instruction finetuning. We finetune VLM specialists from the self-augmented VLM
with domain-specific experts, including spatial, grounding, and OCR, to fuse
task-aware synthetic data into the pretraining stage. Data quality improvements
and hallucination reductions are cross-checked by VLM (GPT-4V, Gemini) and
human judges. Combining self-augmentation and specialist-augmented training,
VILA$^2$ consistently improves the accuracy on a wide range of benchmarks over
the prior art, producing a reusable pretraining dataset that is 300x more
cost-efficient than human labeling.
+
+
+
+
+ + ♻ ☆ Exploring Behavior-Relevant and Disentangled Neural Dynamics with + Generative Diffusion Models + + +
+ Understanding the neural basis of behavior is a fundamental goal in
neuroscience. Current research in large-scale neuro-behavioral data analysis
often relies on decoding models, which quantify behavioral information in
neural data but lack details on behavior encoding. This raises an intriguing
scientific question: "how can we enable in-depth exploration of neural
representations in behavioral tasks, revealing interpretable neural dynamics
associated with behaviors". However, addressing this issue is challenging due
to the varied behavioral encoding across different brain regions and mixed
selectivity at the population level. To tackle this limitation, our approach,
named "BeNeDiff", first identifies a fine-grained and disentangled neural
subspace using a behavior-informed latent variable model. It then employs
state-of-the-art generative diffusion models to synthesize behavior videos that
interpret the neural dynamics of each latent factor. We validate the method on
multi-session datasets containing widefield calcium imaging recordings across
the dorsal cortex. Through guiding the diffusion model to activate individual
latent factors, we verify that the neural dynamics of latent factors in the
disentangled neural subspace provide interpretable quantifications of the
behaviors of interest. At the same time, the neural subspace in BeNeDiff
demonstrates high disentanglement and neural reconstruction quality.
+
+
+
+
+ + ♻ ☆ Elliptical Attention NeurIPS 2024 + + +
+ Pairwise dot-product self-attention is key to the success of transformers +that achieve state-of-the-art performance across a variety of applications in +language and vision. This dot-product self-attention computes attention weights +among the input tokens using Euclidean distance, which makes the model prone to +representation collapse and vulnerable to contaminated samples. In this paper, +we propose using a Mahalanobis distance metric for computing the attention +weights to stretch the underlying feature space in directions of high +contextual relevance. In particular, we define a hyper-ellipsoidal neighborhood +around each query to increase the attention weights of the tokens lying in the +contextually important directions. We term this novel class of attention +Elliptical Attention. Our Elliptical Attention provides two benefits: 1) +reducing representation collapse and 2) enhancing the model's robustness as +Elliptical Attention pays more attention to contextually relevant information +rather than focusing on some small subset of informative features. We +empirically demonstrate the advantages of Elliptical Attention over the +baseline dot-product attention and state-of-the-art attention methods on +various practical tasks, including object classification, image segmentation, +and language modeling across different data modalities. + +
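+ A minimal sketch with a diagonal Mahalanobis metric: attention weights come
from stretched squared distances instead of plain dot products. How the
per-coordinate scales are estimated in the paper is omitted here:

```python
import torch

def elliptical_attention(q, k, v, m):
    """q, k, v: (B, T, D); m: (D,) positive per-coordinate metric weights."""
    diff = q.unsqueeze(2) - k.unsqueeze(1)    # (B, T, T, D) query-key differences
    dist2 = (diff ** 2 * m).sum(-1)           # squared Mahalanobis distance
    attn = torch.softmax(-dist2 / q.shape[-1] ** 0.5, dim=-1)
    return attn @ v

B, T, D = 2, 5, 16
q, k, v = (torch.randn(B, T, D) for _ in range(3))
out = elliptical_attention(q, k, v, torch.ones(D))  # uniform metric = baseline
```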
+
+ comment: 10 pages in the main text. Published at NeurIPS 2024. The code is + available at https://github.com/stefvk/Elliptical-Attention +
+
+
+
+
+ + ♻ ☆ NASM: Neural Anisotropic Surface Meshing SIGGRAPH + + +
+ This paper introduces a new learning-based method, NASM, for anisotropic
surface meshing. Our key idea is to use a graph neural network to embed an
input mesh into a high-dimensional (high-d) Euclidean embedding space that
preserves a curvature-based anisotropic metric, using a dot product loss
between high-d edge vectors. This dramatically reduces computational time and
increases scalability. Then, we propose a novel feature-sensitive remeshing on
the generated high-d embedding to automatically capture sharp geometric
features. We define a high-d normal metric and then derive an automatic
differentiation on a high-d centroidal Voronoi tessellation (CVT) optimization
with the normal metric to simultaneously preserve the geometric features and
curvature anisotropy exhibited by the original 3D shapes. To our knowledge,
this is the first time that a deep learning framework and a large dataset are
proposed to construct a high-d Euclidean embedding space for 3D anisotropic
surface meshing. Experimental results are evaluated and compared with the
state-of-the-art in anisotropic surface meshing on a large number of surface
models from the Thingi10K dataset, as well as on extensive unseen 3D shapes
from the Multi-Garment Network dataset and the FAUST human dataset.
+
+ comment: SIGGRAPH Asia 2024 (Conference Track) +
+
+
+
+
+ + ♻ ☆ SRA: A Novel Method to Improve Feature Embedding in Self-supervised + Learning for Histopathological Images + + +
+ Self-supervised learning has become a cornerstone in various areas, +particularly histopathological image analysis. Image augmentation plays a +crucial role in self-supervised learning, as it generates variations in image +samples. However, traditional image augmentation techniques often overlook the +unique characteristics of histopathological images. In this paper, we propose a +new histopathology-specific image augmentation method called stain +reconstruction augmentation (SRA). We integrate our SRA with MoCo v3, a leading +model in self-supervised contrastive learning, along with our additional +contrastive loss terms, and call the new model SRA-MoCo v3. We demonstrate that +our SRA-MoCo v3 always outperforms the standard MoCo v3 across various +downstream tasks and achieves comparable or superior performance to other +foundation models pre-trained on significantly larger histopathology datasets. + +
+
+ comment: Hamid Manoochehri and Bodong Zhang contributed equally to this work +
+
+
+
+
+ + ♻ ☆ CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale + + +
+ Measuring biodiversity is crucial for understanding ecosystem health. While +prior works have developed machine learning models for taxonomic classification +of photographic images and DNA separately, in this work, we introduce a +multimodal approach combining both, using CLIP-style contrastive learning to +align images, barcode DNA, and text-based representations of taxonomic labels +in a unified embedding space. This allows for accurate classification of both +known and unknown insect species without task-specific fine-tuning, leveraging +contrastive learning for the first time to fuse DNA and image data. Our method +surpasses previous single-modality approaches in accuracy by over 8% on +zero-shot learning tasks, showcasing its effectiveness in biodiversity studies. + +
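+ A sketch of CLIP-style tri-modal alignment: symmetric InfoNCE between each
pair of modalities, averaged. The equal pairwise weighting is an assumption,
not necessarily the paper's exact objective:

```python
import torch
import torch.nn.functional as F

def clip_style_loss(a, b, temperature=0.07):
    """Symmetric InfoNCE between two batches of aligned embeddings."""
    a, b = F.normalize(a, dim=-1), F.normalize(b, dim=-1)
    logits = a @ b.T / temperature
    labels = torch.arange(a.shape[0])
    return (F.cross_entropy(logits, labels)
            + F.cross_entropy(logits.T, labels)) / 2

def trimodal_loss(img, dna, txt):
    """Align image, DNA-barcode, and taxonomy-text embeddings pairwise."""
    return (clip_style_loss(img, dna) + clip_style_loss(img, txt)
            + clip_style_loss(dna, txt)) / 3

img, dna, txt = (torch.randn(8, 128) for _ in range(3))
loss = trimodal_loss(img, dna, txt)
```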
+
+ comment: 25 pages with 11 figures +
+
+
+
+
+ + ♻ ☆ Domain-Adaptive Pre-training of Self-Supervised Foundation Models for + Medical Image Classification in Gastrointestinal Endoscopy + + +
+ Video capsule endoscopy has transformed gastrointestinal endoscopy (GIE)
diagnostics by offering a non-invasive method for capturing detailed images of
the gastrointestinal tract, enabling early disease detection. However, its
potential is limited by the sheer volume of images generated during the imaging
procedure, which can take from 6 to 8 hours and often produces up to 1 million
images, necessitating automated analysis. Additionally, the variability of
these images, combined with the need for expert annotations and the scarcity of
large, high-quality labeled datasets, constrains the effectiveness of current
medical image analysis models. To address this, we introduce a novel large
gastrointestinal endoscopy dataset, called EndoExtend24, created by merging and
re-stratifying the train/test splits of ten existing public and private
datasets, ensuring no overlap of patient data across splits. EndoExtend24
includes over 226,000 labeled images, as well as dynamic class mappings, which
allow unified training across datasets with differing labeling granularity,
supporting up to 123 distinct pathological findings. Further, we propose
domain-adaptive pre-training to adapt self-supervised computer vision
foundation models, trained on generic image data, to the task of GIE medical
diagnosis. Specifically, the EVA-02 model, which is based on the vision
transformer architecture and was trained on ImageNet-22k with masked image
modeling (using EVA-CLIP as a MIM teacher), is pre-trained on the novel
EndoExtend24 dataset to achieve domain adaptation, and finally trained on the
Capsule Endoscopy 2024 Challenge dataset. Experimental results demonstrate
strong performance, with an F1 score of 0.88, an improvement of about 39% over
the baseline model's F1 score of 0.49. Additionally, the model achieved a macro
AUC score of 0.993 and a balanced accuracy of 89.3%.
+
+
+
+
+ + ♻ ☆ SegLLM: Multi-round Reasoning Segmentation + + +
+ We present SegLLM, a novel multi-round interactive reasoning segmentation +model that enhances LLM-based segmentation by exploiting conversational memory +of both visual and textual outputs. By leveraging a mask-aware multimodal LLM, +SegLLM re-integrates previous segmentation results into its input stream, +enabling it to reason about complex user intentions and segment objects in +relation to previously identified entities, including positional, +interactional, and hierarchical relationships, across multiple interactions. +This capability allows SegLLM to respond to visual and text queries in a +chat-like manner. Evaluated on the newly curated MRSeg benchmark, SegLLM +outperforms existing methods in multi-round interactive reasoning segmentation +by over 20%. Additionally, we observed that training on multi-round reasoning +segmentation data enhances performance on standard single-round referring +segmentation and localization tasks, resulting in a 5.5% increase in cIoU for +referring expression segmentation and a 4.5% improvement in Acc@0.5 for +referring expression localization. + +
+
+ comment: 22 pages, 10 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Tensor-Based Synchronization and the Low-Rankness of the Block Trifocal + Tensor NeurIPS 2024 + + +
+ The block tensor of trifocal tensors provides crucial geometric information +on the three-view geometry of a scene. The underlying synchronization problem +seeks to recover camera poses (locations and orientations up to a global +transformation) from the block trifocal tensor. We establish an explicit Tucker +factorization of this tensor, revealing a low multilinear rank of $(6,4,4)$ +independent of the number of cameras under appropriate scaling conditions. We +prove that this rank constraint provides sufficient information for camera +recovery in the noiseless case. The constraint motivates a synchronization +algorithm based on the higher-order singular value decomposition of the block +trifocal tensor. Experimental comparisons with state-of-the-art global +synchronization methods on real datasets demonstrate the potential of this +algorithm for significantly improving location estimation accuracy. Overall +this work suggests that higher-order interactions in synchronization problems +can be exploited to improve performance, beyond the usual pairwise-based +approaches. + +
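+ The HOSVD-based truncation at the core of such an algorithm fits in a few
lines; tensor sizes below are toy stand-ins, with the multilinear rank
(6, 4, 4) taken from the paper:

```python
import numpy as np

def hosvd_truncate(T, ranks):
    """Project each mode of T onto its top singular vectors (truncated HOSVD)."""
    factors = []
    for mode, r in enumerate(ranks):
        unf = np.moveaxis(T, mode, 0).reshape(T.shape[mode], -1)  # mode-unfolding
        U, _, _ = np.linalg.svd(unf, full_matrices=False)
        factors.append(U[:, :r])
    core = T
    for mode, U in enumerate(factors):   # core = T x_0 U0^T x_1 U1^T x_2 U2^T
        core = np.tensordot(U.T, np.moveaxis(core, mode, 0), axes=1)
        core = np.moveaxis(core, 0, mode)
    return core, factors

T = np.random.randn(18, 12, 12)          # toy stand-in for stacked 3x3 blocks
core, factors = hosvd_truncate(T, (6, 4, 4))
print(core.shape)                        # (6, 4, 4)
```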
+
+ comment: 33 pages, 3 figures. Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Probabilistic Conceptual Explainers: Trustworthy Conceptual Explanations + for Vision Foundation Models ICML 2024 + + +
+ Vision transformers (ViTs) have emerged as a significant area of focus, +particularly for their capacity to be jointly trained with large language +models and to serve as robust vision foundation models. Yet, the development of +trustworthy explanation methods for ViTs has lagged, particularly in the +context of post-hoc interpretations of ViT predictions. Existing sub-image +selection approaches, such as feature-attribution and conceptual models, fall +short in this regard. This paper proposes five desiderata for explaining ViTs +-- faithfulness, stability, sparsity, multi-level structure, and parsimony -- +and demonstrates the inadequacy of current methods in meeting these criteria +comprehensively. We introduce a variational Bayesian explanation framework, +dubbed ProbAbilistic Concept Explainers (PACE), which models the distributions +of patch embeddings to provide trustworthy post-hoc conceptual explanations. +Our qualitative analysis reveals the distributions of patch-level concepts, +elucidating the effectiveness of ViTs by modeling the joint distribution of +patch embeddings and ViT's predictions. Moreover, these patch-level +explanations bridge the gap between image-level and dataset-level explanations, +thus completing the multi-level structure of PACE. Through extensive +experiments on both synthetic and real-world datasets, we demonstrate that PACE +surpasses state-of-the-art methods in terms of the defined desiderata. + +
+
+ comment: Proceedings of the 41st International Conference on Machine Learning + (ICML 2024) +
+
+
+
+
+ + ♻ ☆ Adaptive Aggregation Weights for Federated Segmentation of Pancreas MRI + + +
+ Federated learning (FL) enables collaborative model training across +institutions without sharing sensitive data, making it an attractive solution +for medical imaging tasks. However, traditional FL methods, such as Federated +Averaging (FedAvg), face difficulties in generalizing across domains due to +variations in imaging protocols and patient demographics across institutions. +This challenge is particularly evident in pancreas MRI segmentation, where +anatomical variability and imaging artifacts significantly impact performance. +In this paper, we conduct a comprehensive evaluation of FL algorithms for +pancreas MRI segmentation and introduce a novel approach that incorporates +adaptive aggregation weights. By dynamically adjusting the contribution of each +client during model aggregation, our method accounts for domain-specific +differences and improves generalization across heterogeneous datasets. +Experimental results demonstrate that our approach enhances segmentation +accuracy and reduces the impact of domain shift compared to conventional FL +methods while maintaining privacy-preserving capabilities. Significant +performance improvements are observed across multiple hospitals (centers). + +
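+ A minimal sketch of aggregation with adaptive weights: clients whose updates
score better contribute more to the global model. The softmax-over-scores rule
is an illustrative assumption, not the paper's exact criterion:

```python
import torch

def adaptive_aggregate(client_states, client_scores, temperature=1.0):
    """Weighted average of client state dicts; weights follow client scores."""
    w = torch.softmax(torch.tensor(client_scores) / temperature, dim=0)
    return {key: sum(wi * state[key] for wi, state in zip(w, client_states))
            for key in client_states[0]}

# toy usage with two "clients" perturbed from a shared model
net = torch.nn.Linear(4, 2)
states = [{k: v + 0.1 * torch.randn_like(v) for k, v in net.state_dict().items()}
          for _ in range(2)]
global_state = adaptive_aggregate(states, client_scores=[0.8, 0.2])
net.load_state_dict(global_state)
```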
+
+
+
+
+ + ♻ ☆ RGB2Point: 3D Point Cloud Generation from Single RGB Images WACV + + +
+ We introduce RGB2Point, a Transformer-based method that generates a dense 3D
point cloud from a single unposed RGB image of an object. Contrary to prior
works based on CNN layers and diffusion denoising approaches, we use
pre-trained Transformer layers that are fast and generate high-quality point
clouds with consistent quality over available categories. Our generated point
clouds demonstrate high quality on a real-world dataset, as evidenced by
improved Chamfer distance (51.15%) and Earth Mover's distance (45.96%) metrics
compared to the current state-of-the-art. Additionally, our approach shows
better quality on a synthetic dataset, achieving better Chamfer distance
(39.26%), Earth Mover's distance (26.95%), and F-score (47.16%). Moreover, our
method produces 63.1% more consistent high-quality results across various
object categories compared to prior works. Furthermore, RGB2Point is
computationally efficient, requiring only 2.3GB of VRAM to reconstruct a 3D
point cloud from a single RGB image, and our implementation generates the
results 15,133x faster than a SOTA diffusion-based model.
+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ♻ ☆ Continuous Spatio-Temporal Memory Networks for 4D Cardiac Cine MRI + Segmentation WACV 2025 + + +
+ Current cardiac cine magnetic resonance image (cMR) studies focus on the end
diastole (ED) and end systole (ES) phases, while ignoring the abundant temporal
information in the whole image sequence. This is because whole-sequence
segmentation is currently tedious and inaccurate. Conventional whole-sequence
segmentation approaches first estimate the motion field between frames, which
is then used to propagate the mask along the temporal axis. However, the mask
propagation results can be prone to error, especially for the basal and apex
slices, where through-plane motion leads to significant morphology and
structural change during the cardiac cycle. Inspired by recent advances in
video object segmentation (VOS) based on spatio-temporal memory (STM) networks,
we propose a continuous STM (CSTM) network for semi-supervised whole-heart and
whole-sequence cMR segmentation. Our CSTM network takes full advantage of the
spatial, scale, temporal and through-plane continuity priors of the underlying
heart anatomy structures to achieve accurate and fast 4D segmentation. Results
of extensive experiments across multiple cMR datasets show that our method can
improve the 4D cMR segmentation performance, especially for the hard-to-segment
regions.
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Multi-Object Hallucination in Vision-Language Models NeurIPS 2024 + + +
+ Large vision language models (LVLMs) often suffer from object hallucination,
producing objects not present in the given images. While current benchmarks for
object hallucination primarily concentrate on the presence of a single object
class rather than individual entities, this work systematically investigates
multi-object hallucination, examining how models misperceive (e.g., invent
nonexistent objects or become distracted) when tasked with focusing on multiple
objects simultaneously. We introduce Recognition-based Object Probing
Evaluation (ROPE), an automated evaluation protocol that considers the
distribution of object classes within a single image during testing and uses
visual referring prompts to eliminate ambiguity. With comprehensive empirical
studies and analysis of potential factors leading to multi-object
hallucination, we found that (1) LVLMs suffer more hallucinations when focusing
on multiple objects than on a single object; (2) the tested object class
distribution affects hallucination behaviors, indicating that LVLMs may follow
shortcuts and spurious correlations; and (3) hallucinatory behaviors are
influenced by data-specific factors, salience and frequency, and
model-intrinsic behaviors. We hope to enable LVLMs to recognize and reason
about multiple objects that often occur in realistic visual scenes, provide
insights, and quantify our progress towards mitigating the issues.
+
+ comment: Accepted to NeurIPS 2024 | Project page: + https://multi-object-hallucination.github.io/ +
+
+
+
+
+ + ♻ ☆ LucidGrasp: Robotic Framework for Autonomous Manipulation of Laboratory + Equipment with Different Degrees of Transparency via 6D Pose Estimation + + +
+ Many modern robotic systems operate autonomously; however, they often lack
the ability to accurately analyze the environment and adapt to changing
external conditions, while teleoperation systems often require special operator
skills. In the field of laboratory automation, the number of automated
processes is growing, yet such systems are usually developed to perform
specific tasks. In addition, many of the objects used in this field are
transparent, making it difficult to analyze them using visual channels. The
contributions of this work include the development of a robotic framework with
an autonomous mode for manipulating liquid-filled objects with different
degrees of transparency in complex pose combinations. The conducted experiments
demonstrated the robustness of the designed visual perception system in
accurately estimating object poses for autonomous manipulation, and confirmed
the performance of the algorithms in dexterous operations such as liquid
dispensing. The proposed robotic framework can be applied to laboratory
automation, since it allows solving non-trivial manipulation tasks that involve
analyzing the poses of objects with varying degrees of transparency and liquid
levels, requiring high accuracy and repeatability.
+
+ comment: Accepted to the 2024 IEEE International Conference on Robotics and + Biomimetics (IEEE ROBIO 2024), 6 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video + Generation + + +
+ Human beings are endowed with a complementary learning system, which bridges +the slow learning of general world dynamics with fast storage of episodic +memory from a new experience. Previous video generation models, however, +primarily focus on slow learning by pre-training on vast amounts of data, +overlooking the fast learning phase crucial for episodic memory storage. This +oversight leads to inconsistencies across temporally distant frames when +generating longer videos, as these frames fall beyond the model's context +window. To this end, we introduce SlowFast-VGen, a novel dual-speed learning +system for action-driven long video generation. Our approach incorporates a +masked conditional video diffusion model for the slow learning of world +dynamics, alongside an inference-time fast learning strategy based on a +temporal LoRA module. Specifically, the fast learning process updates its +temporal LoRA parameters based on local inputs and outputs, thereby efficiently +storing episodic memory in its parameters. We further propose a slow-fast +learning loop algorithm that seamlessly integrates the inner fast learning loop +into the outer slow learning loop, enabling the recall of prior multi-episode +experiences for context-aware skill learning. To facilitate the slow learning +of an approximate world model, we collect a large-scale dataset of 200k videos +with language action annotations, covering a wide range of scenarios. Extensive +experiments show that SlowFast-VGen outperforms baselines across various +metrics for action-driven video generation, achieving an FVD score of 514 +compared to 782, and maintaining consistency in longer videos, with an average +of 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm +significantly enhances performances on long-horizon planning tasks as well. +Project Website: https://slowfast-vgen.github.io + +
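+ The fast-learning component builds on LoRA; below is a generic LoRA linear
layer (not the paper's exact temporal module) showing how a frozen "slow"
weight gains a small trainable low-rank update whose parameters can store new
experience:

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen base linear layer plus a trainable low-rank update."""
    def __init__(self, base: nn.Linear, rank=8, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False               # slow weights stay frozen
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(64, 64))
out = layer(torch.randn(2, 64))   # only A and B receive gradients
```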
+
+
+
+
+ + ♻ ☆ NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and + Benchmarking NeurIPS 2024 + + +
+ Benchmarking vision-based driving policies is challenging. On one hand, +open-loop evaluation with real data is easy, but these results do not reflect +closed-loop performance. On the other, closed-loop evaluation is possible in +simulation, but is hard to scale due to its significant computational demands. +Further, the simulators available today exhibit a large domain gap to real +data. This has resulted in an inability to draw clear conclusions from the +rapidly growing body of research on end-to-end autonomous driving. In this +paper, we present NAVSIM, a middle ground between these evaluation paradigms, +where we use large datasets in combination with a non-reactive simulator to +enable large-scale real-world benchmarking. Specifically, we gather +simulation-based metrics, such as progress and time to collision, by unrolling +bird's eye view abstractions of the test scenes for a short simulation horizon. +Our simulation is non-reactive, i.e., the evaluated policy and environment do +not influence each other. As we demonstrate empirically, this decoupling allows +open-loop metric computation while being better aligned with closed-loop +evaluations than traditional displacement errors. NAVSIM enabled a new +competition held at CVPR 2024, where 143 teams submitted 463 entries, resulting +in several new insights. On a large set of challenging scenarios, we observe +that simple methods with moderate compute requirements such as TransFuser can +match recent large-scale end-to-end driving architectures such as UniAD. Our +modular framework can potentially be extended with new datasets, data curation +strategies, and metrics, and will be continually maintained to host future +challenges. Our code is available at +https://github.com/autonomousvision/navsim. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Quantized neural network for complex hologram generation + + +
+ Computer-generated holography (CGH) is a promising technology for augmented
reality displays, such as head-mounted or head-up displays. However, its high
computational demand makes practical deployment difficult. Recent efforts to
integrate neural networks into CGH have successfully accelerated computing
speed, demonstrating the potential to overcome the trade-off between
computational cost and image quality. Nevertheless, deploying neural
network-based CGH algorithms on computationally limited embedded systems
requires more efficient models with lower computational cost, memory footprint,
and power consumption. In this study, we developed a lightweight model for
complex hologram generation by introducing neural network quantization.
Specifically, we built a model based on tensor holography and quantized it from
32-bit floating-point precision (FP32) to 8-bit integer precision (INT8). Our
performance evaluation shows that the proposed INT8 model achieves hologram
quality comparable to that of the FP32 model while reducing the model size by
approximately 70% and increasing the speed fourfold. Additionally, we
implemented the INT8 model on a system-on-module to demonstrate its
deployability on embedded platforms and high power efficiency.
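+ The FP32-to-INT8 step can be illustrated with simple symmetric per-tensor
quantization; a real deployment would use a proper quantization toolchain
rather than this sketch:

```python
import torch

def quantize_int8(w):
    """Symmetric per-tensor INT8 quantization of a weight tensor."""
    scale = w.abs().max() / 127.0
    q = torch.clamp(torch.round(w / scale), -128, 127).to(torch.int8)
    return q, scale

def dequantize(q, scale):
    return q.float() * scale

w = torch.randn(256, 256)
q, s = quantize_int8(w)
print("max abs error:", (w - dequantize(q, s)).abs().max().item())
# int8 storage is ~4x smaller than float32, roughly in line with the
# ~70% model-size reduction reported in the abstract
```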
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ ReNO: Enhancing One-step Text-to-Image Models through Reward-based Noise + Optimization NeurIPS 2024 + + +
+ Text-to-Image (T2I) models have made significant advancements in recent +years, but they still struggle to accurately capture intricate details +specified in complex compositional prompts. While fine-tuning T2I models with +reward objectives has shown promise, it suffers from "reward hacking" and may +not generalize well to unseen prompt distributions. In this work, we propose +Reward-based Noise Optimization (ReNO), a novel approach that enhances T2I +models at inference by optimizing the initial noise based on the signal from +one or multiple human preference reward models. Remarkably, solving this +optimization problem with gradient ascent for 50 iterations yields impressive +results on four different one-step models across two competitive benchmarks, +T2I-CompBench and GenEval. Within a computational budget of 20-50 seconds, +ReNO-enhanced one-step models consistently surpass the performance of all +current open-source Text-to-Image models. Extensive user studies demonstrate +that our model is preferred nearly twice as often compared to the popular SDXL +model and is on par with the proprietary Stable Diffusion 3 with 8B parameters. +Moreover, given the same computational resources, a ReNO-optimized one-step +model outperforms widely-used open-source models such as SDXL and +PixArt-$\alpha$, highlighting the efficiency and effectiveness of ReNO in +enhancing T2I model performance at inference time. Code is available at +https://github.com/ExplainableML/ReNO. + +
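+ A runnable sketch of reward-based noise optimization, with toy stand-ins for
the one-step generator and the (differentiable) reward model:

```python
import torch

def reno_optimize_noise(generator, reward_model, shape, steps=50, lr=0.05):
    """Gradient ascent on the initial noise against a reward signal."""
    noise = torch.randn(shape, requires_grad=True)
    opt = torch.optim.Adam([noise], lr=lr)
    for _ in range(steps):
        image = generator(noise)               # one-step sampling
        loss = -reward_model(image).mean()     # ascend the reward
        opt.zero_grad()
        loss.backward()
        opt.step()
    return noise.detach()

# toy stand-ins so the sketch runs end to end
generator = torch.nn.Linear(16, 16)
reward_model = lambda img: -(img ** 2).sum(dim=-1)   # toy "preference" reward
best_noise = reno_optimize_noise(generator, reward_model, (4, 16))
```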
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Invisible Image Watermarks Are Provably Removable Using Generative AI NeurIPS 2024 + + +
+ Invisible watermarks safeguard images' copyrights by embedding hidden +messages only detectable by owners. They also prevent people from misusing +images, especially those generated by AI models. We propose a family of +regeneration attacks to remove these invisible watermarks. The proposed attack +method first adds random noise to an image to destroy the watermark and then +reconstructs the image. This approach is flexible and can be instantiated with +many existing image-denoising algorithms and pre-trained generative models such +as diffusion models. Through formal proofs and extensive empirical evaluations, +we demonstrate that pixel-level invisible watermarks are vulnerable to this +regeneration attack. Our results reveal that, across four different pixel-level +watermarking schemes, the proposed method consistently achieves superior +performance compared to existing attack techniques, with lower detection rates +and higher image quality. However, watermarks that keep the image semantically +similar can be an alternative defense against our attacks. Our finding +underscores the need for a shift in research/industry emphasis from invisible +watermarks to semantic-preserving watermarks. Code is available at +https://github.com/XuandongZhao/WatermarkAttacker + +
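+ The attack reduces to "add noise, then reconstruct". The sketch below uses a
toy blur as the reconstruction step, where the paper instantiates it with real
image denoisers or pre-trained diffusion models:

```python
import torch

def regeneration_attack(image, denoiser, sigma=0.1):
    """Destroy a pixel-level watermark with Gaussian noise, then reconstruct."""
    noisy = image + sigma * torch.randn_like(image)
    return denoiser(noisy).clamp(0, 1)

# toy stand-in denoiser: a fixed 3x3 average blur per channel
blur = torch.nn.Conv2d(3, 3, kernel_size=3, padding=1, groups=3, bias=False)
torch.nn.init.constant_(blur.weight, 1.0 / 9.0)
img = torch.rand(1, 3, 64, 64)              # stand-in watermarked image
attacked = regeneration_attack(img, blur)
```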
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ MoVA: Adapting Mixture of Vision Experts to Multimodal Context NeurIPS 2024 + + +
+ As the key component in multimodal large language models (MLLMs), the ability
of the visual encoder greatly affects the MLLM's understanding of diverse image
content. Although some large-scale pretrained vision encoders, such as those in
CLIP and DINOv2, have brought promising performance, we find that no single
vision encoder dominates across varied image content: e.g., the CLIP vision
encoder leads to outstanding results on general image understanding but poor
performance on document or chart content. To alleviate the bias of the CLIP
vision encoder, we first delve into the inherent behavior of different
pre-trained vision encoders and then propose MoVA, a powerful and novel MLLM
that adaptively routes and fuses task-specific vision experts with a
coarse-to-fine mechanism. In the coarse-grained stage, we design a
context-aware expert routing strategy to dynamically select the most suitable
vision experts according to the user instruction, input image, and expertise of
the vision experts. This stage benefits from the large language model's (LLM)
strong ability to understand the function of each expert. In the fine-grained
stage, we design the mixture-of-vision-expert adapter (MoV-Adapter) to extract
and fuse task-specific knowledge from various experts. This coarse-to-fine
paradigm effectively leverages representations from experts based on multimodal
context and model expertise, further enhancing the generalization ability. We
conduct extensive experiments to evaluate the effectiveness of the proposed
approach. Without any bells and whistles, MoVA can achieve significant
performance gains over current state-of-the-art methods in a wide range of
challenging multimodal benchmarks.
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self + Attention at the Threadblock Level NeurIPS 2024 + + +
+ Neighborhood attention reduces the cost of self attention by restricting each
token's attention span to its nearest neighbors. This restriction,
parameterized by a window size and dilation factor, draws a spectrum of
possible attention patterns between linear projection and self attention.
Neighborhood attention, and more generally sliding window attention patterns,
have long been bounded by infrastructure, particularly in higher-rank spaces
(2-D and 3-D), calling for the development of custom kernels, which have been
limited in functionality or performance, if not both. In this work, we aim to
massively improve upon existing infrastructure by providing two new methods for
implementing neighborhood attention. We first show that neighborhood attention
can be represented as a batched GEMM problem, similar to standard attention,
and implement it for 1-D and 2-D neighborhood attention. These kernels on
average provide 895% and 272% improvement in full precision runtime compared to
existing naive CUDA kernels for 1-D and 2-D neighborhood attention
respectively. We find that aside from being heavily bound by memory bandwidth,
certain inherent inefficiencies exist in all unfused implementations of
neighborhood attention, which in most cases undo their theoretical efficiency
gain. Motivated by the progress made into fused dot-product attention kernels,
we developed fused neighborhood attention: an adaptation of fused dot-product
attention kernels that allows fine-grained control over attention across
different spatial axes. Known for reducing the quadratic time complexity of
self attention to a linear complexity, neighborhood attention can now enjoy a
reduced and constant memory footprint, and record-breaking half precision
runtime. We observe that our fused implementation successfully circumvents some
of the unavoidable inefficiencies in unfused implementations...
+
+ comment: To appear in 38th Conference on Neural Information Processing Systems + (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Visual place recognition for aerial imagery: A survey + + +
+ Aerial imagery and its direct application to visual localization is an
essential problem for many Robotics and Computer Vision tasks. While Global
Navigation Satellite Systems (GNSS) are the standard default solution for
solving the aerial localization problem, they are subject to a number of
limitations, such as signal instability and solution unreliability, that make
this option less desirable. Consequently, visual geolocalization is emerging as
a viable alternative. However, adapting the Visual Place Recognition (VPR) task
to aerial imagery presents significant challenges, including weather variations
and repetitive patterns. Current VPR reviews largely neglect the specific
context of aerial data. This paper introduces a methodology tailored for
evaluating VPR techniques specifically in the domain of aerial imagery,
providing a comprehensive assessment of various methods and their performance.
We not only compare various VPR methods, but also demonstrate the importance of
selecting appropriate zoom and overlap levels when constructing map tiles to
achieve maximum efficiency of VPR algorithms in the case of aerial imagery. The
code is available on our GitHub repository --
https://github.com/prime-slam/aero-vloc.
+
+
+
+
+ + ♻ ☆ Learning Cooperative Trajectory Representations for Motion Forecasting NeurIPS 2024 + + +
+ Motion forecasting is an essential task for autonomous driving, and utilizing +information from infrastructure and other vehicles can enhance forecasting +capabilities. Existing research mainly focuses on leveraging single-frame +cooperative information to enhance the limited perception capability of the ego +vehicle, while underutilizing the motion and interaction context of traffic +participants observed from cooperative devices. In this paper, we propose a +forecasting-oriented representation paradigm to utilize motion and interaction +features from cooperative information. Specifically, we present V2X-Graph, a +representative framework to achieve interpretable and end-to-end trajectory +feature fusion for cooperative motion forecasting. V2X-Graph is evaluated on +V2X-Seq in vehicle-to-infrastructure (V2I) scenarios. To further evaluate on +vehicle-to-everything (V2X) scenario, we construct the first real-world V2X +motion forecasting dataset V2X-Traj, which contains multiple autonomous +vehicles and infrastructure in every scenario. Experimental results on both +V2X-Seq and V2X-Traj show the advantage of our method. We hope both V2X-Graph +and V2X-Traj will benefit the further development of cooperative motion +forecasting. Find the project at https://github.com/AIR-THU/V2X-Graph. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ De-Confusing Pseudo-Labels in Source-Free Domain Adaptation + + +
+ Source-free domain adaptation aims to adapt a source-trained model to an
unlabeled target domain without access to the source data. It has attracted
growing attention in recent years, with existing approaches focusing on
self-training, which usually includes pseudo-labeling techniques. In this
paper, we introduce a novel noise-learning approach tailored to address the
noise distribution in domain adaptation settings and learn to de-confuse the
pseudo-labels. More specifically, we learn a noise transition matrix of the
pseudo-labels to capture the label corruption of each class and learn the
underlying true label distribution. Estimating the noise transition matrix
enables better true class-posterior estimation, resulting in better prediction
accuracy. We demonstrate the effectiveness of our approach when combined with
several source-free domain adaptation methods: SHOT, SHOT++, and AaD. We obtain
state-of-the-art results on three domain adaptation datasets: VisDA, DomainNet,
and OfficeHome.
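+ A sketch of a learnable noise transition matrix with forward correction; the
near-identity initialization and row-softmax parameterization are assumptions
for illustration:

```python
import torch
import torch.nn.functional as F

class NoiseTransition(torch.nn.Module):
    """T[i, j] ~ probability that true class i is observed as pseudo-label j."""
    def __init__(self, num_classes):
        super().__init__()
        # near-identity init; row-wise softmax keeps each row on the simplex
        self.logits = torch.nn.Parameter(torch.eye(num_classes) * 4.0)

    def forward(self, clean_probs):
        T = torch.softmax(self.logits, dim=1)  # (K, K)
        return clean_probs @ T                 # predicted noisy-label distribution

K = 5
trans = NoiseTransition(K)
clean = torch.softmax(torch.randn(8, K), dim=1)  # model's class posterior
pseudo = torch.randint(0, K, (8,))               # noisy pseudo-labels
loss = F.nll_loss(torch.log(trans(clean) + 1e-8), pseudo)  # forward correction
```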
+
+
+
+
+ + ♻ ☆ Measuring Sound Symbolism in Audio-visual Models + + +
+ Audio-visual pre-trained models have gained substantial attention recently +and demonstrated superior performance on various audio-visual tasks. This study +investigates whether pre-trained audio-visual models demonstrate non-arbitrary +associations between sounds and visual representations$\unicode{x2013}$known as +sound symbolism$\unicode{x2013}$which is also observed in humans. We developed +a specialized dataset with synthesized images and audio samples and assessed +these models using a non-parametric approach in a zero-shot setting. Our +findings reveal a significant correlation between the models' outputs and +established patterns of sound symbolism, particularly in models trained on +speech data. These results suggest that such models can capture sound-meaning +connections akin to human language processing, providing insights into both +cognitive architectures and machine learning strategies. + +
+
+ comment: Errors in the introduction may affect the integrity of the paper.
  Withdrawn for now; an updated version will be posted in the future
+
+
+
+
+ + ♻ ☆ Text-Aware Diffusion for Policy Learning + + +
+ Training an agent to achieve particular goals or perform desired behaviors is +often accomplished through reinforcement learning, especially in the absence of +expert demonstrations. However, supporting novel goals or behaviors through +reinforcement learning requires the ad-hoc design of appropriate reward +functions, which quickly becomes intractable. To address this challenge, we +propose Text-Aware Diffusion for Policy Learning (TADPoLe), which uses a +pretrained, frozen text-conditioned diffusion model to compute dense zero-shot +reward signals for text-aligned policy learning. We hypothesize that +large-scale pretrained generative models encode rich priors that can supervise +a policy to behave not only in a text-aligned manner, but also in alignment +with a notion of naturalness summarized from internet-scale training data. In +our experiments, we demonstrate that TADPoLe is able to learn policies for +novel goal-achievement and continuous locomotion behaviors specified by natural +language, in both Humanoid and Dog environments. The behaviors are learned +zero-shot without ground-truth rewards or expert demonstrations, and are +qualitatively more natural according to human evaluation. We further show that +TADPoLe performs competitively when applied to robotic manipulation tasks in +the Meta-World environment, without having access to any in-domain +demonstrations. + +
+
+
+
+
+ + ♻ ☆ Adversarial Score identity Distillation: Rapidly Surpassing the Teacher + in One Step + + +
+ Score identity Distillation (SiD) is a data-free method that has achieved state-of-the-art performance in image generation by leveraging only a pretrained diffusion model, without requiring any training data. However, the ultimate performance of SiD is constrained by the accuracy with which the pretrained model captures the true data scores at different stages of the diffusion process. In this paper, we introduce SiDA (SiD with Adversarial Loss), which not only enhances generation quality but also improves distillation efficiency by incorporating real images and an adversarial loss. SiDA utilizes the encoder from the generator's score network as a discriminator, boosting its ability to distinguish between real images and those generated by SiD. The adversarial loss is batch-normalized within each GPU and then combined with the original SiD loss. This integration effectively incorporates the average "fakeness" per GPU batch into the pixel-based SiD loss, enabling SiDA to distill a single-step generator either from scratch or by fine-tuning an existing one. SiDA converges significantly faster than its predecessor when trained from scratch, and swiftly improves upon the original model's performance after an initial warmup period during fine-tuning from a pre-distilled SiD generator. This one-step adversarial distillation method establishes new benchmarks in generation performance when distilling EDM diffusion models pretrained on CIFAR-10 (32x32) and ImageNet (64x64), achieving an FID score of 1.110 on ImageNet 64x64. It sets record-low FID scores when distilling EDM2 models trained on ImageNet (512x512), surpassing even the largest teacher model, EDM2-XXL. SiDA records FID scores of 2.156 for EDM2-XS, 1.669 for EDM2-S, 1.488 for EDM2-M, and 1.465 for EDM2-L, demonstrating significant improvements across all model sizes. Our open-source code will be integrated into the SiD codebase.
+
+
+
+
+ + ♻ ☆ EraW-Net: Enhance-Refine-Align W-Net for Scene-Associated Driver + Attention Estimation + + +
+ Associating driver attention with the driving scene across two fields of view (FOVs) is a hard cross-domain perception problem, which requires comprehensive consideration of cross-view mapping, dynamic driving scene analysis, and driver status tracking. Previous methods typically focus on a single view or map attention to the scene via estimated gaze, failing to exploit the implicit connection between them. Moreover, simple fusion modules are insufficient for modeling the complex relationships between the two views, making information integration challenging. To address these issues, we propose a novel method for end-to-end scene-associated driver attention estimation, called EraW-Net. This method enhances the most discriminative dynamic cues, refines feature representations, and facilitates semantically aligned cross-domain integration through a W-shaped architecture, termed W-Net. Specifically, a Dynamic Adaptive Filter Module (DAF-Module) is proposed to address the challenges of frequently changing driving environments by extracting vital regions. It suppresses the indiscriminately recorded dynamics and highlights crucial ones through innovative joint frequency-spatial analysis, enhancing the model's ability to parse complex dynamics. Additionally, to track driver states under non-fixed facial poses, we propose a Global Context Sharing Module (GCS-Module) to construct refined feature representations by capturing hierarchical features that adapt to various scales of head and eye movements. Finally, W-Net achieves systematic cross-view information integration through its "Encoding-Independent Partial Decoding-Fusion Decoding" structure, addressing semantic misalignment in heterogeneous data integration. Experiments demonstrate that the proposed method robustly and accurately estimates the mapping of driver attention in the scene on large public datasets.
+
+ comment: 13 pages, 9 figures
+
+
+
+
+ + ♻ ☆ FairSkin: Fair Diffusion for Skin Disease Image Generation + + +
+ Image generation is a prevailing technique for clinical data augmentation, advancing diagnostic accuracy and reducing healthcare disparities. The Diffusion Model (DM) has become a leading method for generating synthetic medical images, but it suffers from a critical twofold bias: (1) the quality of images generated for Caucasian individuals is significantly higher, as measured by the Frechet Inception Distance (FID); (2) the ability of the downstream-task learner to learn critical features from disease images varies across different skin tones. These biases pose significant risks, particularly in skin disease detection, where underrepresentation of certain skin tones can lead to misdiagnosis or neglect of specific conditions. To address these challenges, we propose FairSkin, a novel DM framework that mitigates these biases through a three-level resampling mechanism, ensuring fairer representation across racial and disease categories. Our approach significantly improves the diversity and quality of generated images, contributing to more equitable skin disease detection in clinical settings.
+
+
+
+
+ + ♻ ☆ Low-light Pedestrian Detection in Visible and Infrared Image Feeds: + Issues and Challenges + + +
+ Pedestrian detection has become a cornerstone for several high-level tasks, including autonomous driving, intelligent transportation, and traffic surveillance. Several works focus on pedestrian detection using visible images, mainly in the daytime; the task becomes far more challenging when environmental conditions change to poor lighting or nighttime. Recently, new ideas have been spurred to use alternative sources, such as Far InfraRed (FIR) temperature sensor feeds, for detecting pedestrians in low-light conditions. This study reviews recent developments in low-light pedestrian detection approaches. It systematically categorizes and analyses various algorithms, from region-based to non-region-based and graph-based learning methodologies, highlighting their approaches, implementation issues, and challenges. It also outlines the key benchmark datasets that can be used for the research and development of advanced pedestrian detection algorithms, particularly in low-light situations.
+
+
+
+
+ + ♻ ☆ UDHF2-Net: Uncertainty-diffusion-model-based High-Frequency TransFormer + Network for Remotely Sensed Imagery Interpretation + + +
+ Remotely sensed imagery interpretation (RSII) faces three major problems: (1) objective representation of spatial distribution patterns; (2) edge uncertainty caused by the downsampling encoder and intrinsic edge noises (e.g., mixed pixels and edge occlusion); and (3) false detections caused by geometric registration error in change detection. To solve these problems, we are the first to propose the uncertainty-diffusion-model-based high-Frequency TransFormer network (UDHF2-Net), whose advantages are as follows: (1) a spatially-stationary-and-non-stationary high-frequency connection paradigm (SHCP) is proposed to enhance the interaction of spatially frequency-wise stationary and non-stationary features and yield high-fidelity edge extraction results. Inspired by HRFormer, SHCP replaces HRFormer's high-resolution-wise stream with a high-frequency-wise stream throughout the encoder-decoder process, with parallel frequency-wise high-to-low streams, improving edge extraction accuracy by continuously retaining high-frequency information; (2) a mask-and-geo-knowledge-based uncertainty diffusion module (MUDM), a self-supervised learning strategy, is proposed to improve the edge accuracy of extraction and change detection by gradually removing the simulated spectrum noises based on geo-knowledge and the generated diffused spectrum noises; (3) a frequency-wise semi-pseudo-Siamese UDHF2-Net is proposed, for the first time, to balance accuracy and complexity for change detection. Beyond the aforementioned spectrum noises in semantic segmentation, MUDM also serves as a self-supervised learning strategy that effectively reduces false edge change detections arising from geometric registration error in the generated imagery.
+
+
+
+
+ + ♻ ☆ SR-CACO-2: A Dataset for Confocal Fluorescence Microscopy Image + Super-Resolution NeurIPS 2024 + + +
+ Confocal fluorescence microscopy is one of the most accessible and widely used imaging techniques for the study of biological processes at the cellular and subcellular levels. Scanning confocal microscopy allows the capture of high-quality images from thick three-dimensional (3D) samples, yet suffers from well-known limitations such as photobleaching and phototoxicity of specimens caused by intense light exposure, limiting its applications. Cellular damage can be alleviated by changing imaging parameters to reduce light exposure, often at the expense of image quality. Machine/deep learning methods for single-image super-resolution (SISR) can be applied to restore image quality by upscaling lower-resolution (LR) images to yield high-resolution (HR) images. These SISR methods have been successfully applied to photo-realistic images due partly to the abundance of publicly available data. In contrast, the lack of publicly available data partly limits their application and success in scanning confocal microscopy. In this paper, we introduce a large scanning confocal microscopy dataset named SR-CACO-2 that comprises low- and high-resolution image pairs for three different fluorescent markers. It allows evaluating the performance of SISR methods at three different upscaling levels (X2, X4, X8). SR-CACO-2 contains the human epithelial cell line Caco-2 (ATCC HTB-37), and it is composed of 2,200 unique images, captured with four resolutions and three markers, forming 9,937 image patches for SISR methods. We provide benchmarking results for 16 state-of-the-art methods of the main SISR families. Results show that these methods have limited success in producing high-resolution textures. The dataset is freely accessible under a Creative Commons license (CC BY-NC-SA 4.0). Our dataset, code and pretrained weights for SISR methods are available: https://github.com/sbelharbi/sr-caco-2.
+
+ comment: 27 pages, 15 figures, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Eddeep: Fast eddy-current distortion correction for diffusion MRI with + deep learning MICCAI 2024 + + +
+ Modern diffusion MRI sequences commonly acquire a large number of volumes with diffusion sensitization gradients of differing strengths or directions. Such sequences rely on echo-planar imaging (EPI) to achieve reasonable scan duration. However, EPI is vulnerable to off-resonance effects, leading to tissue-susceptibility- and eddy-current-induced distortions. The latter are particularly problematic because they cause misalignment between volumes, disrupting downstream modelling and analysis. The essential correction of eddy distortions is typically done post-acquisition, with image registration. However, this is non-trivial because correspondence between volumes can be severely disrupted due to volume-specific signal attenuations induced by varying directions and strengths of the applied gradients. This challenge has been successfully addressed by the popular FSL Eddy tool, but at considerable computational cost. We propose an alternative approach, leveraging recent advances in image processing enabled by deep learning (DL). It consists of two convolutional neural networks: 1) an image translator to restore correspondence between images; 2) a registration model to align the translated images. Results demonstrate comparable distortion estimates to FSL Eddy, while requiring only modest training sample sizes. This work, to the best of our knowledge, is the first to tackle this problem with deep learning. Together with recently developed DL-based susceptibility correction techniques, it paves the way for real-time preprocessing of diffusion MRI, facilitating its wider uptake in the clinic.
+
+ comment: accepted in MICCAI 2024 conference +
+
+
+
+
+ + ♻ ☆ SuperFusion: Multilevel LiDAR-Camera Fusion for Long-Range HD Map + Generation ICRA 2024 + + +
+ High-definition (HD) semantic map generation of the environment is an essential component of autonomous driving. Existing methods have achieved good performance in this task by fusing different sensor modalities, such as LiDAR and camera. However, current works are based on raw data or network feature-level fusion and only consider short-range HD map generation, limiting their deployment in realistic autonomous driving applications. In this paper, we focus on the task of building the HD maps in both short ranges, i.e., within 30 m, and also predicting long-range HD maps up to 90 m, which is required by downstream path planning and control tasks to improve the smoothness and safety of autonomous driving. To this end, we propose a novel network named SuperFusion, exploiting the fusion of LiDAR and camera data at multiple levels. We use LiDAR depth to improve image depth estimation and use image features to guide long-range LiDAR feature prediction. We benchmark our SuperFusion on the nuScenes dataset and a self-recorded dataset and show that it outperforms the state-of-the-art baseline methods by large margins at all intervals. Additionally, we apply the generated HD map to a downstream path planning task, demonstrating that the long-range HD maps predicted by our method can lead to better path planning for autonomous vehicles. Our code has been released at https://github.com/haomo-ai/SuperFusion.
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ♻ ☆ CoMix: A Comprehensive Benchmark for Multi-Task Comic Understanding NeurIPS 2024 + + +
+ The comic domain is rapidly advancing with the development of single-page +analysis and synthesis models. However, evaluation metrics and datasets lag +behind, often limited to small-scale or single-style test sets. We introduce a +novel benchmark, CoMix, designed to evaluate the multi-task capabilities of +models in comic analysis. Unlike existing benchmarks that focus on isolated +tasks such as object detection or text recognition, CoMix addresses a broader +range of tasks including object detection, speaker identification, character +re-identification, reading order, and multi-modal reasoning tasks like +character naming and dialogue generation. Our benchmark comprises three +existing datasets with expanded annotations to support multi-task evaluation. +To mitigate the over-representation of manga-style data, we have incorporated a +new dataset of carefully selected American comic-style books, thereby enriching +the diversity of comic styles. CoMix is designed to assess pre-trained models +in zero-shot and limited fine-tuning settings, probing their transfer +capabilities across different comic styles and tasks. The validation split of +the benchmark is publicly available for research purposes, and an evaluation +server for the held-out test split is also provided. Comparative results +between human performance and state-of-the-art models reveal a significant +performance gap, highlighting substantial opportunities for advancements in +comic understanding. The dataset, baseline models, and code are accessible at +https://github.com/emanuelevivoli/CoMix-dataset. This initiative sets a new +standard for comprehensive comic analysis, providing the community with a +common benchmark for evaluation on a large and varied set. + +
+
+ comment: Accepted at NeurIPS 2024 (D&B) +
+
+
+
+
+ + ♻ ☆ SERF: Fine-Grained Interactive 3D Segmentation and Editing with Radiance + Fields + + +
+ Although significant progress has been made in the field of 2D-based +interactive editing, fine-grained 3D-based interactive editing remains +relatively unexplored. This limitation can be attributed to two main +challenges: the lack of an efficient 3D representation robust to different +modifications and the absence of an effective 3D interactive segmentation +method. In this paper, we introduce a novel fine-grained interactive 3D +segmentation and editing algorithm with radiance fields, which we refer to as +SERF. Our method entails creating a neural mesh representation by integrating +multi-view algorithms with pre-trained 2D models. Building upon this +representation, we introduce a novel surface rendering technique that preserves +local information and is robust to deformation. Moreover, this representation +forms the basis for achieving accurate and interactive 3D segmentation without +requiring 3D supervision. Harnessing this representation facilitates a range of +interactive 3D editing operations, encompassing tasks such as interactive +geometry editing and texture painting. Extensive experiments and visualization +examples of editing on both real and synthetic data demonstrate the superiority +of our method on representation quality and editing ability. + +
+
+
+
+
+ + ♻ ☆ CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable + Remote Sensing Semantic Segmentation + + +
+ The field of Remote Sensing Domain Generalization (RSDG) has emerged as a critical and valuable research frontier, focusing on developing models that generalize effectively across diverse scenarios. Despite the substantial domain gaps in RS images, characterized by variabilities such as location, wavelength, and sensor type, this area remains underexplored: (1) Current cross-domain methods primarily focus on Domain Adaptation (DA), which adapts models to predefined domains rather than to unseen ones; (2) Few studies target the RSDG issue, especially for semantic segmentation tasks; existing models are developed for specific unknown domains and struggle with underfitting on other unknown scenarios; (3) Existing RS foundation models tend to prioritize in-domain performance over cross-domain generalization. To this end, we introduce the first vision foundation model for RSDG semantic segmentation, CrossEarth. CrossEarth demonstrates strong cross-domain generalization through a specially designed data-level Earth-Style Injection pipeline and a model-level Multi-Task Training pipeline. In addition, for the semantic segmentation task, we have curated an RSDG benchmark comprising 28 cross-domain settings across various regions, spectral bands, platforms, and climates, providing a comprehensive framework for testing the generalizability of future RSDG models. Extensive experiments on this benchmark demonstrate the superiority of CrossEarth over existing state-of-the-art methods.
+
+ comment: The codes and models will be available at + https://github.com/Cuzyoung/CrossEarth +
+
+
+
+
+ + ♻ ☆ Embracing Events and Frames with Hierarchical Feature Refinement Network + for Object Detection ECCV 2024 + + +
+ In frame-based vision, object detection faces substantial performance degradation under challenging conditions due to the limited sensing capability of conventional cameras. Event cameras output sparse and asynchronous events, providing a potential solution to these problems. However, effectively fusing the two heterogeneous modalities remains an open issue. In this work, we propose a novel hierarchical feature refinement network for event-frame fusion. The core concept is the design of the coarse-to-fine fusion module, denoted as the cross-modality adaptive feature refinement (CAFR) module. In the initial phase, the bidirectional cross-modality interaction (BCI) part facilitates information bridging from two distinct sources. Subsequently, the features are further refined by aligning the channel-level mean and variance in the two-fold adaptive feature refinement (TAFR) part. We conducted extensive experiments on two benchmarks: the low-resolution PKU-DDD17-Car dataset and the high-resolution DSEC dataset. Experimental results show that our method surpasses the state-of-the-art by an impressive margin of 8.0% on the DSEC dataset. Besides, our method exhibits significantly better robustness (69.5% versus 38.7%) when introducing 15 different corruption types to the frame images. The code can be found at https://github.com/HuCaoFighting/FRN.
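+ A minimal sketch of channel-level mean/variance alignment in the spirit of the TAFR part described above; the AdaIN-style rule and the shapes here are assumptions, and the paper's exact formulation may differ.

```python
import torch

def align_channel_stats(x: torch.Tensor, y: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    """Align the per-channel mean and variance of feature map x to those of y.

    x, y: tensors of shape (N, C, H, W). AdaIN-style statistics alignment,
    used here as a stand-in for the paper's two-fold adaptive refinement.
    """
    mu_x = x.mean(dim=(2, 3), keepdim=True)
    std_x = x.std(dim=(2, 3), keepdim=True) + eps
    mu_y = y.mean(dim=(2, 3), keepdim=True)
    std_y = y.std(dim=(2, 3), keepdim=True) + eps
    return (x - mu_x) / std_x * std_y + mu_y

# Dummy event-branch and frame-branch feature maps.
event_feat = torch.randn(2, 64, 32, 32)
frame_feat = torch.randn(2, 64, 32, 32)
refined = align_channel_stats(event_feat, frame_feat)
print(refined.shape)  # torch.Size([2, 64, 32, 32])
```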
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Consistency Diffusion Bridge Models NeurIPS 2024 + + +
+ Diffusion models (DMs) have become the dominant paradigm of generative modeling in a variety of domains by learning stochastic processes from noise to data. Recently, diffusion denoising bridge models (DDBMs), a new formulation of generative modeling that builds stochastic processes between fixed data endpoints based on a reference diffusion process, have achieved empirical success across tasks with coupled data distributions, such as image-to-image translation. However, the DDBM sampling process typically requires hundreds of network evaluations to achieve decent performance, which may impede practical deployment due to high computational demands. In this work, inspired by the recent advance of consistency models in DMs, we tackle this problem by learning the consistency function of the probability-flow ordinary differential equation (PF-ODE) of DDBMs, which directly predicts the solution at a starting step given any point on the ODE trajectory. Based on a dedicated general-form ODE solver, we propose two paradigms: consistency bridge distillation and consistency bridge training, both of which are flexible to apply to DDBMs with broad design choices. Experimental results show that our proposed method can sample $4\times$ to $50\times$ faster than the base DDBM and produce better visual quality given the same step in various tasks with pixel resolution ranging from $64 \times 64$ to $256 \times 256$, as well as supporting downstream tasks such as semantic interpolation in the data space.
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Dessie: Disentanglement for Articulated 3D Horse Shape and Pose + Estimation from Images ACCV2024 + + +
+ In recent years, 3D parametric animal models have been developed to aid in estimating 3D shape and pose from images and video. While progress has been made for humans, it is more challenging for animals due to limited annotated data. To address this, we introduce the first method using synthetic data generation and disentanglement to learn to regress 3D shape and pose. Focusing on horses, we use text-based texture generation and a synthetic data pipeline to create varied shapes, poses, and appearances, learning disentangled spaces. Our method, Dessie, surpasses existing 3D horse reconstruction methods and generalizes to other large animals like zebras, cows, and deer. See the project website at https://celiali.github.io/Dessie/.
+
+ comment: ACCV2024 +
+
+
+
+
+ + ♻ ☆ M3LEO: A Multi-Modal, Multi-Label Earth Observation Dataset Integrating + Interferometric SAR and Multispectral Data + + +
+ Satellite-based remote sensing has revolutionised the way we address global +challenges. Huge quantities of Earth Observation (EO) data are generated by +satellite sensors daily, but processing these large datasets for use in ML +pipelines is technically and computationally challenging. While some +preprocessed Earth observation datasets exist, their content is often limited +to optical or near-optical wavelength data, which is ineffective at night or in +adverse weather conditions. Synthetic Aperture Radar (SAR), an active sensing +technique based on microwave length radiation, offers a viable alternative. +However, the application of machine learning to SAR has been limited due to a +lack of ML-ready data and pipelines, particularly for the full diversity of SAR +data, including polarimetry, coherence and interferometry. In this work, we +introduce M3LEO, a multi-modal, multi-label Earth observation dataset that +includes polarimetric, interferometric, and coherence SAR data derived from +Sentinel-1, alongside multispectral Sentinel-2 imagery and auxiliary data +describing terrain properties such as land use. M3LEO spans approximately 17M +4x4 km data chips from six diverse geographic regions. The dataset is +complemented by a flexible PyTorch Lightning framework configured using Hydra +to accommodate its use across diverse ML applications in Earth observation. We +provide tools to process any dataset available on popular platforms such as +Google Earth Engine for seamless integration with our framework. We show that +the distribution shift in self-supervised embeddings is substantial across +geographic regions, even when controlling for terrain properties. Data: +huggingface.co/M3LEO, Code: github.com/spaceml-org/M3LEO. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Blind Inpainting with Object-aware Discrimination for Artificial Marker + Removal + + +
+ Medical images often incorporate doctor-added markers that can hinder AI-based diagnosis. This issue highlights the need for inpainting techniques to restore the corrupted visual contents. However, existing methods require manual mask annotation as input, limiting their application scenarios. In this paper, we propose a novel blind inpainting method that automatically reconstructs visual contents within the corrupted regions without mask input as guidance. Our model includes a blind reconstruction network and an object-aware discriminator for adversarial training. The reconstruction network contains two branches that predict corrupted regions in images and simultaneously restore the missing visual contents. Leveraging the potent recognition capability of a dense object detector, the object-aware discriminator ensures that markers are undetectable after inpainting. Thus, the restored images closely resemble the clean ones. We evaluate our method on three datasets of various medical imaging modalities, confirming better performance over other state-of-the-art methods.
+
+
+
+
+ + ♻ ☆ Advancing Video Anomaly Detection: A Concise Review and a New Dataset NeurIPS 2024 + + +
+ Video Anomaly Detection (VAD) finds widespread applications in security +surveillance, traffic monitoring, industrial monitoring, and healthcare. +Despite extensive research efforts, there remains a lack of concise reviews +that provide insightful guidance for researchers. Such reviews would serve as +quick references to grasp current challenges, research trends, and future +directions. In this paper, we present such a review, examining models and +datasets from various perspectives. We emphasize the critical relationship +between model and dataset, where the quality and diversity of datasets +profoundly influence model performance, and dataset development adapts to the +evolving needs of emerging approaches. Our review identifies practical issues, +including the absence of comprehensive datasets with diverse scenarios. To +address this, we introduce a new dataset, Multi-Scenario Anomaly Detection +(MSAD), comprising 14 distinct scenarios captured from various camera views. +Our dataset has diverse motion patterns and challenging variations, such as +different lighting and weather conditions, providing a robust foundation for +training superior models. We conduct an in-depth analysis of recent +representative models using MSAD and highlight its potential in addressing the +challenges of detecting anomalies across diverse and evolving surveillance +scenarios. [Project website: https://msad-dataset.github.io/] + +
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ WildGaussians: 3D Gaussian Splatting in the Wild NeurIPS 2024 + + +
+ While the field of 3D scene reconstruction is dominated by NeRFs due to their photorealistic quality, 3D Gaussian Splatting (3DGS) has recently emerged, offering similar quality with real-time rendering speeds. However, both methods primarily excel with well-controlled 3D scenes, while in-the-wild data – characterized by occlusions, dynamic objects, and varying illumination – remains challenging. NeRFs can adapt to such conditions easily through per-image embedding vectors, but 3DGS struggles due to its explicit representation and lack of shared parameters. To address this, we introduce WildGaussians, a novel approach to handle occlusions and appearance changes with 3DGS. By leveraging robust DINO features and integrating an appearance modeling module within 3DGS, our method achieves state-of-the-art results. We demonstrate that WildGaussians matches the real-time rendering speed of 3DGS while surpassing both 3DGS and NeRF baselines in handling in-the-wild data, all within a simple architectural framework.
+
+ comment: NeurIPS 2024; Project page: https://wild-gaussians.github.io/ +
+
+
+
+
+ + ♻ ☆ FasterDiT: Towards Faster Diffusion Transformers Training without + Architecture Modification NeurIPS 2024 + + +
+ Diffusion Transformers (DiT) have attracted significant attention in research. However, they suffer from a slow convergence rate. In this paper, we aim to accelerate DiT training without any architectural modification. We identify the following issues in the training process: first, certain training strategies do not consistently perform well across different data; second, the effectiveness of supervision at specific timesteps is limited. In response, we propose the following contributions: (1) We introduce a new perspective for interpreting the failure of the strategies. Specifically, we slightly extend the definition of Signal-to-Noise Ratio (SNR) and suggest observing the Probability Density Function (PDF) of SNR to understand the essence of a strategy's data robustness. (2) We conduct numerous experiments and report over one hundred experimental results to empirically summarize a unified accelerating strategy from the perspective of PDF. (3) We develop a new supervision method that further accelerates the training process of DiT. Based on these findings, we propose FasterDiT, an exceedingly simple and practicable design strategy. With a few lines of code modification, it achieves 2.30 FID on ImageNet at 256 resolution after 1000k iterations, which is comparable to DiT (2.27 FID) but 7 times faster in training.
+
+ comment: NeurIPS 2024 (poster); update to camera-ready version +
+
+
+
+
+ + ♻ ☆ Rethinking Out-of-Distribution Detection on Imbalanced Data Distribution NeurIPS 2024 + + +
+ Detecting and rejecting unknown out-of-distribution (OOD) samples is critical for deployed neural networks to avoid unreliable predictions. In real-world scenarios, however, the efficacy of existing OOD detection methods is often impeded by the inherent imbalance of in-distribution (ID) data, which causes significant performance decline. Through statistical observations, we have identified two common challenges faced by different OOD detectors: misidentifying tail-class ID samples as OOD, while erroneously predicting OOD samples as head-class ID. To explain this phenomenon, we introduce a generalized statistical framework, termed ImOOD, to formulate the OOD detection problem on imbalanced data distributions. Consequently, the theoretical analysis reveals that there exists a class-aware bias term between balanced and imbalanced OOD detection, which contributes to the performance gap. Building upon this finding, we present a unified training-time regularization technique to mitigate the bias and boost imbalanced OOD detectors across architecture designs. Our theoretically grounded method translates into consistent improvements on the representative CIFAR10-LT, CIFAR100-LT, and ImageNet-LT benchmarks against several state-of-the-art OOD detection approaches. Code is available at https://github.com/alibaba/imood.
+
+ comment: This paper has been accepted by NeurIPS 2024. Code is available at + https://github.com/alibaba/imood +
+
+
+
+
+ + ♻ ☆ Subsurface Scattering for 3D Gaussian Splatting + + +
+ 3D reconstruction and relighting of objects made from scattering materials +present a significant challenge due to the complex light transport beneath the +surface. 3D Gaussian Splatting introduced high-quality novel view synthesis at +real-time speeds. While 3D Gaussians efficiently approximate an object's +surface, they fail to capture the volumetric properties of subsurface +scattering. We propose a framework for optimizing an object's shape together +with the radiance transfer field given multi-view OLAT (one light at a time) +data. Our method decomposes the scene into an explicit surface represented as +3D Gaussians, with a spatially varying BRDF, and an implicit volumetric +representation of the scattering component. A learned incident light field +accounts for shadowing. We optimize all parameters jointly via ray-traced +differentiable rendering. Our approach enables material editing, relighting and +novel view synthesis at interactive rates. We show successful application on +synthetic data and introduce a newly acquired multi-view multi-light dataset of +objects in a light-stage setup. Compared to previous work we achieve comparable +or better results at a fraction of optimization and rendering time while +enabling detailed control over material attributes. Project page +https://sss.jdihlmann.com/ + +
+
+ comment: Project page: https://sss.jdihlmann.com/ +
+
+
+
+
+ + ♻ ☆ UNION: Unsupervised 3D Object Detection using Object Appearance-based + Pseudo-Classes NeurIPS'24 + + +
+ Unsupervised 3D object detection methods have emerged to leverage vast amounts of data without requiring manual labels for training. Recent approaches rely on dynamic objects for learning to detect mobile objects but penalize the detections of static instances during training. Multiple rounds of (self) training are used to add detected static instances to the set of training targets; this procedure to improve performance is computationally expensive. To address this, we propose the method UNION. We use spatial clustering and self-supervised scene flow to obtain a set of static and dynamic object proposals from LiDAR. Subsequently, object proposals' visual appearances are encoded to distinguish static objects in the foreground and background by selecting static instances that are visually similar to dynamic objects. As a result, static and dynamic mobile objects are obtained together, and existing detectors can be trained in a single training round. In addition, we extend 3D object discovery to detection by using object appearance-based cluster labels as pseudo-class labels for training object classification. We conduct extensive experiments on the nuScenes dataset and increase the state-of-the-art performance for unsupervised 3D object discovery, i.e., UNION more than doubles the average precision to 38.4. The code is available at github.com/TedLentsch/UNION.
+
+ comment: NeurIPS'24 +
+
+
+
+
+ + ♻ ☆ NM-FlowGAN: Modeling sRGB Noise without Paired Images using a Hybrid + Approach of Normalizing Flows and GAN + + +
+ Modeling and synthesizing real sRGB noise is crucial for various low-level vision tasks, such as building datasets for training image denoising systems. The distribution of real sRGB noise is highly complex and affected by a multitude of factors, making its accurate modeling extremely challenging. Therefore, recent studies have proposed methods that employ data-driven generative models, such as Generative Adversarial Networks (GAN) and Normalizing Flows. These studies achieve more accurate modeling of sRGB noise compared to traditional noise modeling methods. However, there are performance limitations due to the inherent characteristics of each generative model. To address this issue, we propose NM-FlowGAN, a hybrid approach that exploits the strengths of both GAN and Normalizing Flows. We combine pixel-wise noise modeling networks based on Normalizing Flows and spatial correlation modeling networks based on GAN. Specifically, the pixel-wise noise modeling network leverages the high training stability of Normalizing Flows to capture noise characteristics that are affected by a multitude of factors, and the spatial correlation networks efficiently model pixel-to-pixel relationships. In particular, unlike recent methods that rely on paired noisy images, our method synthesizes noise using clean images and factors that affect noise characteristics, such as easily obtainable parameters like camera type and ISO settings, making it applicable to various fields where obtaining noisy-clean image pairs is not feasible. In our experiments, our NM-FlowGAN outperforms other baselines in the sRGB noise synthesis task. Moreover, the denoising neural network trained with synthesized image pairs from our model shows superior performance compared to other baselines. Our code is available at https://github.com/YoungJooHan/NM-FlowGAN.
+
+ comment: 13 pages, 10 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ PuLID: Pure and Lightning ID Customization via Contrastive Alignment NeurIPS 2024 + + +
+ We propose Pure and Lightning ID customization (PuLID), a novel tuning-free +ID customization method for text-to-image generation. By incorporating a +Lightning T2I branch with a standard diffusion one, PuLID introduces both +contrastive alignment loss and accurate ID loss, minimizing disruption to the +original model and ensuring high ID fidelity. Experiments show that PuLID +achieves superior performance in both ID fidelity and editability. Another +attractive property of PuLID is that the image elements (e.g., background, +lighting, composition, and style) before and after the ID insertion are kept as +consistent as possible. Codes and models are available at +https://github.com/ToTheBeginning/PuLID + +
+
+ comment: NeurIPS 2024. Codes and models are available at + https://github.com/ToTheBeginning/PuLID +
+
+
+
+
+ + ♻ ☆ MambaEviScrib: Mamba and Evidence-Guided Consistency Enhance CNN + Robustness for Scribble-Based Weakly Supervised Ultrasound Image Segmentation + + +
+ Segmenting anatomical structures and lesions from ultrasound images contributes to disease assessment. Weakly supervised learning (WSL) based on sparse annotation has achieved encouraging performance and demonstrated the potential to reduce annotation costs. This study attempts to introduce scribble-based WSL into ultrasound image segmentation tasks. However, ultrasound images often suffer from poor contrast and unclear edges, coupled with insufficient supervision signals for edges, posing challenges to edge prediction. Uncertainty modeling has been proven to facilitate models in dealing with these issues. Nevertheless, existing uncertainty estimation paradigms are not robust enough and often filter out predictions near decision boundaries, resulting in unstable edge predictions. Therefore, we propose leveraging predictions near decision boundaries effectively. Specifically, we introduce the Dempster-Shafer Theory (DST) of evidence to design an Evidence-Guided Consistency (EGC) strategy. This strategy utilizes high-evidence predictions, which are more likely to occur near high-density regions, to guide the optimization of low-evidence predictions that may appear near decision boundaries. Furthermore, the diverse sizes and locations of lesions in ultrasound images pose a challenge for CNNs with local receptive fields, as they struggle to model global information. Therefore, we introduce Visual Mamba, based on structured state space sequence models, which achieves long-range dependency with linear computational complexity, and we construct a novel hybrid CNN-Mamba framework. During training, the CNN branch and the Mamba branch in the proposed framework draw inspiration from each other based on the EGC strategy. Experiments demonstrate the competitiveness of the proposed method. Dataset and code will be available at https://github.com/GtLinyer/MambaEviScrib.
+
+
+
+
+ + ♻ ☆ Stabilize the Latent Space for Image Autoregressive Modeling: A Unified + Perspective NeurIPS 2024 + + +
+ Latent-based image generative models, such as Latent Diffusion Models (LDMs) and Mask Image Models (MIMs), have achieved notable success in image generation tasks. These models typically leverage reconstructive autoencoders like VQGAN or VAE to encode pixels into a more compact latent space and learn the data distribution in the latent space instead of directly from pixels. However, this practice raises a pertinent question: Is it truly the optimal choice? In response, we begin with an intriguing observation: despite sharing the same latent space, autoregressive models significantly lag behind LDMs and MIMs in image generation. This finding contrasts sharply with the field of NLP, where the autoregressive model GPT has established a commanding presence. To address this discrepancy, we introduce a unified perspective on the relationship between latent space and generative models, emphasizing the stability of latent space in image generative modeling. Furthermore, we propose a simple but effective discrete image tokenizer to stabilize the latent space for image generative modeling by applying K-Means on the latent features of self-supervised learning models. Experimental results show that image autoregressive modeling with our tokenizer (DiGIT) benefits both image understanding and image generation with the next token prediction principle, which is inherently straightforward for GPT models but challenging for other generative models. Remarkably, for the first time, a GPT-style autoregressive model for images outperforms LDMs, which also exhibits substantial improvement akin to GPT when scaling up model size. Our findings underscore the potential of an optimized latent space and the integration of discrete tokenization in advancing the capabilities of image generative models. The code is available at https://github.com/DAMO-NLP-SG/DiGIT.
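+ A minimal sketch of the tokenizer idea described above: cluster self-supervised latent features with K-Means and use cluster indices as discrete tokens. The feature arrays, dimensions, and codebook size below are placeholders, not the paper's configuration.

```python
import numpy as np
from sklearn.cluster import KMeans

# Placeholder latent features from a self-supervised encoder
# (e.g., patch embeddings); shape: (num_patches, feature_dim).
features = np.random.randn(5000, 128).astype(np.float32)

# Fit the codebook: cluster centers act as a discrete vocabulary.
kmeans = KMeans(n_clusters=256, n_init=10, random_state=0).fit(features)

# Tokenize new features: each patch maps to its nearest center's index,
# yielding a discrete token sequence for next-token-prediction training.
new_patches = np.random.randn(64, 128).astype(np.float32)
tokens = kmeans.predict(new_patches)
print(tokens[:10])
```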
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ GIC: Gaussian-Informed Continuum for Physical Property Identification + and Simulation NeurIPS 2024 + + +
+ This paper studies the problem of estimating physical properties (system identification) through visual observations. To facilitate geometry-aware guidance in physical property estimation, we introduce a novel hybrid framework that leverages 3D Gaussian representation to not only capture explicit shapes but also enable the simulated continuum to render object masks as 2D shape surrogates during training. We propose a new dynamic 3D Gaussian framework based on motion factorization to recover the object as 3D Gaussian point sets across different time states. Furthermore, we develop a coarse-to-fine filling strategy to generate the density fields of the object from the Gaussian reconstruction, allowing for the extraction of object continuums along with their surfaces and the integration of Gaussian attributes into these continuums. In addition to the extracted object surfaces, the Gaussian-informed continuum also enables the rendering of object masks during simulations, serving as 2D-shape guidance for physical property estimation. Extensive experimental evaluations demonstrate that our pipeline achieves state-of-the-art performance across multiple benchmarks and metrics. Additionally, we illustrate the effectiveness of the proposed method through real-world demonstrations, showcasing its practical utility. Our project page is at https://jukgei.github.io/project/gic.
+
+ comment: 21 pages, 8 figures, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ProbTalk3D: Non-Deterministic Emotion Controllable Speech-Driven 3D + Facial Animation Synthesis Using VQ-VAE SIGGRAPH + + +
+ Audio-driven 3D facial animation synthesis has been an active field of research with attention from both academia and industry. While there are promising results in this area, recent approaches largely focus on lip-sync and identity control, neglecting the role of emotions and emotion control in the generative process. That is mainly due to the lack of emotionally rich facial animation data and of algorithms that can synthesize speech animations with emotional expressions at the same time. In addition, the majority of models are deterministic, meaning that given the same audio input, they produce the same output motion. We argue that emotions and non-determinism are crucial to generate diverse and emotionally-rich facial animations. In this paper, we propose ProbTalk3D, a non-deterministic neural network approach for emotion controllable speech-driven 3D facial animation synthesis using a two-stage VQ-VAE model and an emotionally rich facial animation dataset, 3DMEAD. We provide an extensive comparative analysis of our model against recent 3D facial animation synthesis approaches, by evaluating the results objectively, qualitatively, and with a perceptual user study. We highlight several objective metrics that are more suitable for evaluating stochastic outputs and use both in-the-wild and ground truth data for subjective evaluation. To our knowledge, this is the first non-deterministic 3D facial animation synthesis method incorporating a rich emotion dataset and emotion control with emotion labels and intensity levels. Our evaluation demonstrates that the proposed model achieves superior performance compared to state-of-the-art emotion-controlled, deterministic and non-deterministic models. We recommend watching the supplementary video for quality judgement. The entire codebase is publicly available (https://github.com/uuembodiedsocialai/ProbTalk3D/).
+
+ comment: 14 pages, 9 figures, 3 tables. Includes code. Accepted at ACM + SIGGRAPH MIG 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 21 + +
+
+
+ + ☆ Length-Induced Embedding Collapse in Transformer-based Models + + +
+ Text embeddings enable various applications, but their performance deteriorates on longer texts. In this paper, we find that the performance degradation is due to a phenomenon called Length Collapse, where longer text embeddings collapse into a narrow space. This collapse results in a distributional inconsistency between embeddings of different text lengths, ultimately hurting the performance of downstream tasks. Theoretically, by considering that the self-attention mechanism inherently functions as a low-pass filter, we prove that long sequences increase the attenuation rate of the low-pass filter effect of the self-attention mechanism. With layers going deeper, excessive low-pass filtering causes the token signals to retain only their Direct-Current (DC) component, which means the input token feature maps will collapse into a narrow space, especially in long texts. Based on the above analysis, we propose to mitigate the undesirable length collapse limitation by introducing a temperature in softmax(), which achieves a higher low-pass filter attenuation rate. The tuning-free method, called TempScale, can be plugged into multiple transformer-based embedding models. Empirically, we demonstrate that TempScale can improve existing embedding models, especially on long text inputs, bringing up to 0.53% performance gains on 40 datasets from the Massive Text Embedding Benchmark (MTEB) and 0.82% performance gains on 4 datasets from LongEmbed, which specifically focuses on long context retrieval.
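+ A minimal sketch of the intervention described above: applying a temperature inside the attention softmax. The temperature value, its direction, and the tensor shapes here are illustrative assumptions; consult the paper for the prescribed setting.

```python
import torch
import torch.nn.functional as F

def temp_scaled_attention(q, k, v, temperature: float = 0.5):
    """Dot-product attention with an extra temperature inside softmax().

    q, k, v: tensors of shape (batch, seq_len, dim). The temperature
    rescales the attention logits; 0.5 is illustrative, not the paper's
    recommended value.
    """
    d = q.size(-1)
    logits = (q @ k.transpose(-2, -1)) / (d ** 0.5)
    attn = F.softmax(logits / temperature, dim=-1)
    return attn @ v

q = k = v = torch.randn(1, 128, 64)  # dummy token embeddings
out = temp_scaled_attention(q, k, v)
print(out.shape)  # torch.Size([1, 128, 64])
```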
+
+
+
+
+ + ☆ Investigating Bias in Political Search Query Suggestions by Relative + Comparison with LLMs + + +
+ Search query suggestions affect users' interactions with search engines, which then influences the information they encounter. Thus, bias in search query suggestions can lead to exposure to biased search results and can impact opinion formation. This is especially critical in the political domain. Detecting and quantifying bias in web search engines is difficult due to its topic dependency, complexity, and subjectivity. The lack of context and the phrasal nature of query suggestions exacerbate this problem. In a multi-step approach, we combine the benefits of large language models, pairwise comparison, and Elo-based scoring to identify and quantify bias in English search query suggestions. We apply our approach to the U.S. political news domain and compare bias in Google and Bing.
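+ A minimal sketch of the Elo-based aggregation step: the pairwise judge below is stubbed with a random choice standing in for the LLM comparison, and the K-factor, initial rating, and suggestion strings are assumptions.

```python
import itertools
import random

K = 32  # Elo K-factor (illustrative)

def expected(r_a: float, r_b: float) -> float:
    """Standard Elo expected score of A against B."""
    return 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400))

def llm_compare(a: str, b: str) -> int:
    """Stub for the LLM pairwise judgment: 1 if suggestion `a` is judged
    more biased toward a given leaning than `b`, else 0."""
    return random.randint(0, 1)

suggestions = ["suggestion A", "suggestion B", "suggestion C"]  # placeholders
ratings = {s: 1000.0 for s in suggestions}

for a, b in itertools.permutations(suggestions, 2):
    score_a = llm_compare(a, b)
    e_a, e_b = expected(ratings[a], ratings[b]), expected(ratings[b], ratings[a])
    ratings[a] += K * (score_a - e_a)
    ratings[b] += K * ((1 - score_a) - e_b)

# Higher rating = judged more biased more often across comparisons.
print(sorted(ratings.items(), key=lambda kv: kv[1], reverse=True))
```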
+
+
+
+
+ + ☆ Leveraging Large Language Models for Medical Information Extraction and + Query Generation + + +
+ This paper introduces a system that integrates large language models (LLMs) +into the clinical trial retrieval process, enhancing the effectiveness of +matching patients with eligible trials while maintaining information privacy +and allowing expert oversight. We evaluate six LLMs for query generation, +focusing on open-source and relatively small models that require minimal +computational resources. Our evaluation includes two closed-source and four +open-source models, with one specifically trained in the medical field and five +general-purpose models. We compare the retrieval effectiveness achieved by +LLM-generated queries against those created by medical experts and +state-of-the-art methods from the literature. Our findings indicate that the +evaluated models reach retrieval effectiveness on par with or greater than +expert-created queries. The LLMs consistently outperform standard baselines and +other approaches in the literature. The best performing LLMs exhibit fast +response times, ranging from 1.7 to 8 seconds, and generate a manageable number +of query terms (15-63 on average), making them suitable for practical +implementation. Our overall findings suggest that leveraging small, open-source +LLMs for clinical trials retrieval can balance performance, computational +efficiency, and real-world applicability in medical settings. + +
+
+ comment: Accepted in WI-IAT '24 +
+
+
+
+
+ + ☆ Auditing Google's Search Algorithm: Measuring News Diversity Across + Brazil, the UK, and the US + + +
+ This study examines the influence of Google's search algorithm on news diversity by analyzing search results in Brazil, the UK, and the US. It explores how Google's system favors a limited number of news outlets. Utilizing algorithm auditing techniques, the research measures source concentration with the Herfindahl-Hirschman Index (HHI) and Gini coefficient, revealing significant concentration trends. The study underscores the importance of conducting horizontal analyses across multiple search queries, as focusing solely on individual results pages may obscure these patterns. Factors such as popularity, political bias, and recency were evaluated for their impact on news rankings. Findings indicate a slight leftward bias in search outcomes and a preference for popular, often national outlets. This bias, combined with a tendency to prioritize recent content, suggests that Google's algorithm may reinforce existing media inequalities. By analyzing the largest dataset to date (221,863 search results), this research provides comprehensive, longitudinal insights into how algorithms shape public access to diverse news sources.
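+ Both concentration measures are straightforward to compute from per-outlet result counts; a minimal sketch with made-up counts follows.

```python
import numpy as np

def hhi(counts) -> float:
    """Herfindahl-Hirschman Index: sum of squared shares.
    Ranges from 1/n (perfectly even split) to 1.0 (a single outlet)."""
    shares = np.asarray(counts, dtype=float)
    shares = shares / shares.sum()
    return float(np.sum(shares ** 2))

def gini(counts) -> float:
    """Gini coefficient of result counts across outlets (0 = equal)."""
    x = np.sort(np.asarray(counts, dtype=float))  # ascending
    n = x.size
    total = x.sum()
    # G = (2 * sum_i i * x_(i)) / (n * total) - (n + 1) / n
    return float(2 * np.sum(np.arange(1, n + 1) * x) / (n * total) - (n + 1) / n)

counts = [120, 80, 40, 10, 5]  # hypothetical results per news outlet
print(f"HHI:  {hhi(counts):.3f}")   # ~0.346
print(f"Gini: {gini(counts):.3f}")  # ~0.471
```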
+
+ comment: 21 pages, 3 figures, 7 tables +
+
+
+
+
+ + ☆ Beyond Content Relevance: Evaluating Instruction Following in Retrieval + Models + + +
+ Instruction-following capabilities in large language models (LLMs) have significantly progressed, enabling more complex user interactions through detailed prompts. However, retrieval systems have not matched these advances; most still rely on traditional lexical and semantic matching techniques that fail to fully capture user intent. Recent efforts have introduced instruction-aware retrieval models, but these primarily focus on intrinsic content relevance, which neglects the importance of customized preferences for broader document-level attributes. This study evaluates the instruction-following capabilities of various retrieval models beyond content relevance, including LLM-based dense retrieval and reranking models. We develop InfoSearch, a novel retrieval evaluation benchmark spanning six document-level attributes: Audience, Keyword, Format, Language, Length, and Source, and introduce novel metrics – Strict Instruction Compliance Ratio (SICR) and Weighted Instruction Sensitivity Evaluation (WISE) – to accurately assess the models' responsiveness to instructions. Our findings reveal that while reranking models generally surpass retrieval models in instruction following, they still face challenges in handling certain attributes. Moreover, although instruction fine-tuning and increased model size lead to better performance, most models fall short of achieving comprehensive instruction compliance as assessed by our benchmark.
+
+
+
+
+ + ☆ Identify Then Recommend: Towards Unsupervised Group Recommendation + + +
+ Group Recommendation (GR), which aims to recommend items to groups of users, has become a promising and practical direction for recommendation systems. This paper points out two issues of the state-of-the-art GR models. (1) The pre-defined and fixed number of user groups is inadequate for real-time industrial recommendation systems, where the group distribution can shift dynamically. (2) The training schema of existing GR methods is supervised, necessitating expensive user-group and group-item labels, leading to significant annotation costs. To this end, we present a novel unsupervised group recommendation framework named Identify Then Recommend (ITR), which first identifies the user groups in an unsupervised manner, even without a pre-defined number of groups, and then designs two pretext tasks to conduct self-supervised group recommendation. Concretely, at the group identification stage, we first estimate the adaptive density of each user point, where areas with higher densities are more likely to be recognized as group centers. Then, a heuristic merge-and-split strategy is designed to discover the user groups and decision boundaries. Subsequently, at the self-supervised learning stage, a pull-and-repulsion pretext task is proposed to optimize the user-group distribution, and a pseudo group recommendation pretext task is designed to assist the recommendations. Extensive experiments demonstrate the superiority and effectiveness of ITR on both user recommendation (e.g., a 22.22% improvement in NDCG@5) and group recommendation (e.g., a 22.95% improvement in NDCG@5). Furthermore, we deploy ITR on an industrial recommender and achieve promising results.
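+ A rough sketch of the first stage's adaptive density idea using a k-nearest-neighbor density estimate; k, the user embeddings, and the top-decile center rule are illustrative assumptions, not the paper's exact estimator.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Placeholder user embeddings (e.g., from a recommender encoder).
users = np.random.randn(500, 32).astype(np.float32)

k = 10  # illustrative neighborhood size
nn = NearestNeighbors(n_neighbors=k + 1).fit(users)
dists, _ = nn.kneighbors(users)  # column 0 is each point itself (distance 0)

# Higher density = smaller mean distance to the k nearest neighbors.
density = 1.0 / (dists[:, 1:].mean(axis=1) + 1e-8)

# Treat the top decile by density as candidate group centers; a
# merge-and-split step would then refine groups and decision boundaries.
candidate_centers = np.argsort(density)[-len(users) // 10:]
print(len(candidate_centers), "candidate group centers")
```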
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ MoTaDual: Modality-Task Dual Alignment for Enhanced Zero-shot Composed + Image Retrieval + + +
+ Composed Image Retrieval (CIR) is a challenging vision-language task, utilizing bi-modal (image+text) queries to retrieve target images. Despite the impressive performance of supervised CIR, the dependence on costly, manually-labeled triplets limits its scalability and zero-shot capability. To address this issue, zero-shot composed image retrieval (ZS-CIR) is presented along with projection-based approaches. However, such methods face two major problems: a task discrepancy between pre-training (image $\leftrightarrow$ text) and inference (image+text $\rightarrow$ image), and a modality discrepancy. The latter arises in approaches trained with text-only projection, since features must still be extracted from the reference image at inference. In this paper, we propose a two-stage framework to tackle both discrepancies. First, to ensure efficiency and scalability, a textual inversion network is pre-trained on large-scale caption datasets. Subsequently, we put forward Modality-Task Dual Alignment (MoTaDual) as the second stage, where large language models (LLMs) generate triplet data for fine-tuning, and additionally, prompt learning is introduced in a multi-modal context to effectively alleviate both modality and task discrepancies. The experimental results show that our MoTaDual achieves state-of-the-art performance across four widely used ZS-CIR benchmarks, while maintaining low training time and computational cost. The code will be released soon.
+
+
+
+
+ + ☆ Towards Cross-Modal Text-Molecule Retrieval with Better Modality + Alignment + + +
+ Cross-modal text-molecule retrieval models aim to learn a shared feature space for the text and molecule modalities for accurate similarity calculation, which facilitates the rapid screening of molecules with specific properties and activities in drug design. However, previous works have two main defects. First, they are inadequate in capturing modality-shared features, considering the significant gap between text sequences and molecule graphs. Second, they mainly rely on contrastive learning and adversarial training for cross-modality alignment, both of which mainly focus on first-order similarity, ignoring the second-order similarity that can capture more structural information in the embedding space. To address these issues, we propose a novel cross-modal text-molecule retrieval model with two-fold improvements. Specifically, on top of the two modality-specific encoders, we stack a memory-bank-based feature projector that contains learnable memory vectors to better extract modality-shared features. More importantly, during model training, we calculate four kinds of similarity distributions (text-to-text, text-to-molecule, molecule-to-molecule, and molecule-to-text similarity distributions) for each instance, and then minimize the distance between these similarity distributions (namely second-order similarity losses) to enhance cross-modal alignment. Experimental results and analysis strongly demonstrate the effectiveness of our model. Particularly, our model achieves SOTA performance, outperforming the previously reported best result by 6.4%.
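+ A minimal sketch of one second-order similarity loss: matching a cross-modal similarity distribution to an intra-modal one via KL divergence. The temperature, embedding shapes, and the specific pairing are illustrative assumptions; the paper aligns four such distribution pairs.

```python
import torch
import torch.nn.functional as F

def similarity_distribution(a: torch.Tensor, b: torch.Tensor, tau: float = 0.1):
    """Row-wise softmax over cosine similarities between embedding sets."""
    a = F.normalize(a, dim=-1)
    b = F.normalize(b, dim=-1)
    return F.softmax(a @ b.t() / tau, dim=-1)

# Dummy batch of paired embeddings from the two modality encoders.
text_emb = torch.randn(16, 256)
mol_emb = torch.randn(16, 256)

# Second-order alignment: push the text-to-molecule similarity
# distribution toward the text-to-text one; the remaining pairings
# (molecule-to-molecule, molecule-to-text) are formed analogously.
p_tt = similarity_distribution(text_emb, text_emb)
p_tm = similarity_distribution(text_emb, mol_emb)
loss = F.kl_div(p_tm.log(), p_tt, reduction="batchmean")
print(loss.item())
```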
+
+ comment: BIBM 2024 regular paper +
+
+
+
+
+ + ♻ ☆ Reasoning and Tools for Human-Level Forecasting + + +
+ Language models (LMs) trained on web-scale datasets are largely successful +due to their ability to memorize large amounts of training data, even if only +present in a few examples. These capabilities are often desirable in evaluation +on tasks such as question answering but raise questions about whether these +models can exhibit genuine reasoning or succeed only at mimicking patterns from +the training data. This distinction is particularly salient in forecasting +tasks, where the answer is not present in the training data, and the model must +reason to make logical deductions. We present Reasoning and Tools for +Forecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can +dynamically retrieve updated information and run numerical simulation with +equipped tools. We evaluate our model with questions from competitive +forecasting platforms and demonstrate that our method is competitive with and +can outperform human predictions. This suggests that LMs, with the right tools, +can indeed think and adapt like humans, offering valuable insights for +real-world decision-making. + +
+
+
+
+
+ + ♻ ☆ User-Creator Feature Polarization in Recommender Systems with Dual + Influence NeurIPS 2024 + + +
+ Recommender systems serve the dual purpose of presenting relevant content to +users and helping content creators reach their target audience. The dual nature +of these systems naturally influences both users and creators: users' +preferences are affected by the items they are recommended, while creators may +be incentivized to alter their content to attract more users. We define a +model, called user-creator feature dynamics, to capture the dual influence of +recommender systems. We prove that a recommender system with dual influence is +guaranteed to polarize, causing diversity loss in the system. We then +investigate, both theoretically and empirically, approaches for mitigating +polarization and promoting diversity in recommender systems. Unexpectedly, we +find that common diversity-promoting approaches do not work in the presence of +dual influence, while relevancy-optimizing methods like top-$k$ truncation can +prevent polarization and improve diversity of the system. + +
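A toy simulation makes the polarization claim concrete. The update rules below (users drift toward recommended creators, creators drift toward the users they reach) are a simplified instance of user-creator feature dynamics under assumptions of our own, not the paper's exact specification:

```python
import numpy as np

rng = np.random.default_rng(0)
U = rng.normal(size=(100, 8)); U /= np.linalg.norm(U, axis=1, keepdims=True)
C = rng.normal(size=(20, 8));  C /= np.linalg.norm(C, axis=1, keepdims=True)

def step(U, C, k=3, eta=0.05):
    rel = U @ C.T                              # relevance scores
    topk = np.argsort(-rel, axis=1)[:, :k]     # top-k truncation
    # Users drift toward the creators they are shown.
    for i in range(len(U)):
        U[i] += eta * C[topk[i]].mean(axis=0)
    # Creators drift toward the users they reach.
    for j in range(len(C)):
        fans = np.where((topk == j).any(axis=1))[0]
        if len(fans):
            C[j] += eta * U[fans].mean(axis=0)
    U /= np.linalg.norm(U, axis=1, keepdims=True)
    C /= np.linalg.norm(C, axis=1, keepdims=True)
    return U, C

for _ in range(200):
    U, C = step(U, C)
# Diversity proxy: average pairwise creator distance (drops if polarized).
print(np.mean(np.linalg.norm(C[:, None] - C[None, :], axis=-1)))
```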
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Multi-Group Proportional Representation in Retrieval NeurIPS 2024 + + +
+ Image search and retrieval tasks can perpetuate harmful stereotypes, erase +cultural identities, and amplify social disparities. Current approaches to +mitigate these representational harms balance the number of retrieved items +across population groups defined by a small number of (often binary) +attributes. However, most existing methods overlook intersectional groups +determined by combinations of group attributes, such as gender, race, and +ethnicity. We introduce Multi-Group Proportional Representation (MPR), a novel +metric that measures representation across intersectional groups. We develop +practical methods for estimating MPR, provide theoretical guarantees, and +propose optimization algorithms to ensure MPR in retrieval. We demonstrate that +existing methods optimizing for equal and proportional representation metrics +may fail to promote MPR. Crucially, our work shows that optimizing MPR yields +more proportional representation across multiple intersectional groups +specified by a rich function class, often with minimal compromise in retrieval +accuracy. + +
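A simplified instance of the metric can be computed directly when the "rich function class" is restricted to indicator functions of attribute combinations. The sketch below, with hypothetical variable names, measures the worst-case gap between retrieved and population proportions over all intersectional groups of two binary attributes:

```python
import itertools
import numpy as np

# Toy pool: each candidate item carries two binary attributes
# (standing in for, e.g., gender and race).
rng = np.random.default_rng(1)
attrs = rng.integers(0, 2, size=(1000, 2))
retrieved = rng.choice(1000, size=50, replace=False)

def mpr_gap(attrs, retrieved):
    # Worst-case gap between retrieved and population proportions,
    # taken over every intersectional group (all attribute combinations).
    gaps = []
    for combo in itertools.product([0, 1], repeat=attrs.shape[1]):
        in_group = (attrs == combo).all(axis=1)
        gaps.append(abs(in_group[retrieved].mean() - in_group.mean()))
    return max(gaps)

print("max intersectional representation gap:", mpr_gap(attrs, retrieved))
```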
+
+ comment: 48 pages, 33 figures. Accepted as poster at NeurIPS 2024. Code can be + found at + https://github.com/alex-oesterling/multigroup-proportional-representation +
+
+
+
+
+ + ♻ ☆ Generating Multi-Aspect Queries for Conversational Search + + +
+ Conversational information seeking (CIS) systems aim to model the user's
+information need within the conversational context and retrieve the relevant
+information. One major approach to modeling the conversational context aims
+to rewrite the user utterance in the conversation to represent the
+information need independently. Recent work has shown the benefit of
+expanding the rewritten utterance with relevant terms. In this work, we
+hypothesize that breaking down the information of an utterance into
+multi-aspect rewritten queries can lead to more effective retrieval
+performance. This is more evident in more complex utterances that require
+gathering evidence from various information sources, where a single query
+rewrite or query representation cannot capture the complexity of the
+utterance. To test this hypothesis, we conduct extensive experiments on five
+widely used CIS datasets, leveraging LLMs to generate multi-aspect queries
+that represent the information need of each utterance as multiple query
+rewrites. We show that, for 85% of the utterances, the same retrieval model
+performs better in terms of nDCG@3 when given more than one rewritten query.
+We further propose a multi-aspect query generation and retrieval framework,
+called MQ4CS. Our extensive experiments show that MQ4CS outperforms the
+state-of-the-art query rewriting methods. We make our code and our new
+dataset of generated multi-aspect queries publicly available.
+
+
+
+
+
+ + ♻ ☆ FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven + Question Answering Pipeline + + +
+ Financial decision-making hinges on the analysis of relevant information +embedded in the enormous volume of documents in the financial domain. To +address this challenge, we developed FinQAPT, an end-to-end pipeline that +streamlines the identification of relevant financial reports based on a query, +extracts pertinent context, and leverages Large Language Models (LLMs) to +perform downstream tasks. To evaluate the pipeline, we experimented with +various techniques to optimize the performance of each module using the FinQA +dataset. We introduced a novel clustering-based negative sampling technique to +enhance context extraction and a novel prompting method called Dynamic N-shot +Prompting to boost the numerical question-answering capabilities of LLMs. At +the module level, we achieved state-of-the-art accuracy on FinQA, attaining an +accuracy of 80.6%. However, at the pipeline level, we observed decreased +performance due to challenges in extracting relevant context from financial +reports. We conducted a detailed error analysis of each module and the +end-to-end pipeline, pinpointing specific challenges that must be addressed to +develop a robust solution for handling complex financial tasks. + +
+
+ comment: Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ An engine not a camera: Measuring performative power of online search NeurIPS 2024 + + +
+ The power of digital platforms is at the center of major ongoing policy and +regulatory efforts. To advance existing debates, we designed and executed an +experiment to measure the performative power of online search providers. +Instantiated in our setting, performative power quantifies the ability of a +search engine to steer web traffic by rearranging results. To operationalize +this definition we developed a browser extension that performs unassuming +randomized experiments in the background. These randomized experiments emulate +updates to the search algorithm and identify the causal effect of different +content arrangements on clicks. Analyzing tens of thousands of clicks, we +discuss what our robust quantitative findings say about the power of online +search engines, using the Google Shopping antitrust investigation as a case +study. More broadly, we envision our work to serve as a blueprint for how the +recent definition of performative power can help integrate quantitative +insights from online experiments with future investigations into the economic +power of digital platforms. + +
+
+ comment: to appear at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Breaking the Hourglass Phenomenon of Residual Quantization: Enhancing + the Upper Bound of Generative Retrieval + + +
+ Generative retrieval (GR) has emerged as a transformative paradigm in
+search and recommender systems, leveraging numeric-based identifier
+representations to enhance efficiency and generalization. Notably, methods
+like TIGER, employing Residual Quantization-based Semantic Identifiers
+(RQ-SID), have shown significant promise in e-commerce scenarios by
+effectively managing item IDs. However, a critical issue, termed the
+"\textbf{Hourglass}" phenomenon, occurs in RQ-SID, where intermediate
+codebook tokens become overly concentrated, hindering the full utilization of
+generative retrieval methods. This paper analyzes and addresses this problem
+by identifying data sparsity and long-tailed distributions as the primary
+causes. Through comprehensive experiments and detailed ablation studies, we
+analyze the impact of these factors on codebook utilization and data
+distribution. Our findings reveal that the "Hourglass" phenomenon
+substantially impacts the performance of RQ-SID in generative retrieval. We
+propose effective solutions to mitigate this issue, thereby significantly
+enhancing the effectiveness of generative retrieval in real-world E-commerce
+applications.
+
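One way to see the phenomenon in practice is to measure per-level token-usage entropy after residual quantization; a sharply lower entropy at an intermediate level signals the concentration the paper describes. A minimal diagnostic sketch (random codebooks for illustration; real RQ-SID codebooks are learned):

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 16))                 # toy item embeddings
codebooks = [rng.normal(size=(128, 16)) for _ in range(3)]

def rq_encode(X, codebooks):
    # Residual quantization: each level encodes the residual of the previous one.
    residual, codes = X.copy(), []
    for C in codebooks:
        idx = np.argmin(
            ((residual[:, None, :] - C[None, :, :]) ** 2).sum(-1), axis=1)
        codes.append(idx)
        residual = residual - C[idx]
    return np.stack(codes, axis=1)

codes = rq_encode(X, codebooks)
for level in range(codes.shape[1]):
    counts = np.bincount(codes[:, level], minlength=128)
    p = counts / counts.sum()
    entropy = -(p[p > 0] * np.log(p[p > 0])).sum()
    # Low entropy at an intermediate level indicates token concentration.
    print(f"level {level}: usage entropy = {entropy:.2f} (max {np.log(128):.2f})")
```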
+
+
+
+
+ + ♻ ☆ UDA: A Benchmark Suite for Retrieval Augmented Generation in Real-world + Document Analysis + + +
+ The use of Retrieval-Augmented Generation (RAG) has improved Large
+Language Models (LLMs) in collaborating with external data, yet significant
+challenges exist in real-world scenarios. In areas such as academic
+literature and finance question answering, data are often found in raw text
+and tables in HTML or PDF formats, which can be lengthy and highly
+unstructured. In this paper, we introduce a benchmark suite, namely
+Unstructured Document Analysis (UDA), that involves 2,965 real-world
+documents and 29,590 expert-annotated Q&A pairs. We revisit popular LLM- and
+RAG-based solutions for document analysis and evaluate the design choices and
+answer qualities across multiple document domains and diverse query types.
+Our evaluation yields interesting findings and highlights the importance of
+data parsing and retrieval. We hope our benchmark can shed light on and
+better serve real-world document analysis applications. The benchmark suite
+and code can be found at https://github.com/qinchuanhui/UDA-Benchmark.
+
+
+
+
+
+ + ♻ ☆ End-to-end Learnable Clustering for Intent Learning in Recommendation + + +
+ Intent learning, which aims to learn users' intents for user understanding
+and item recommendation, has become an active research topic in recent years.
+However, existing methods suffer from complex and cumbersome alternating
+optimization, limiting performance and scalability. To this end, we propose a
+novel intent learning method termed \underline{ELCRec}, by unifying behavior
+representation learning into an \underline{E}nd-to-end \underline{L}earnable
+\underline{C}lustering framework, for effective and efficient
+\underline{Rec}ommendation. Concretely, we encode user behavior sequences and
+initialize the cluster centers (latent intents) as learnable neurons. Then,
+we design a novel learnable clustering module to separate different cluster
+centers, thus decoupling users' complex intents. Meanwhile, it guides the
+network to learn intents from behaviors by pulling behavior embeddings toward
+cluster centers. This allows simultaneous optimization of recommendation and
+clustering via mini-batch data. Moreover, we propose intent-assisted
+contrastive learning that uses cluster centers as self-supervision signals,
+further enhancing the mutual promotion of the two objectives. Both
+experimental results and theoretical analyses demonstrate the superiority of
+ELCRec from six perspectives. Compared to the runner-up, ELCRec improves
+NDCG@5 by 8.9\% and reduces computational costs by 22.5\% on the Beauty
+dataset. Furthermore, owing to its scalability and broad applicability, we
+deploy this method on an industrial recommendation system with 130 million
+page views and achieve promising results. The code is available on GitHub
+(https://github.com/yueliu1999/ELCRec). A collection (papers, codes,
+datasets) of deep group recommendation/intent learning methods is available
+on GitHub (https://github.com/yueliu1999/Awesome-Deep-Group-Recommendation).
+
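The core of the learnable clustering module can be sketched as follows. This is a hedged PyTorch illustration with our own loss choices (a nearest-center pull term plus a center-separation term), not the released ELCRec code:

```python
import torch
import torch.nn.functional as F

class LearnableClustering(torch.nn.Module):
    # Cluster centers (latent intents) as learnable neurons, optimized
    # jointly with the encoder via mini-batch gradients.
    def __init__(self, n_clusters=8, dim=64):
        super().__init__()
        self.centers = torch.nn.Parameter(torch.randn(n_clusters, dim))

    def forward(self, z):
        z = F.normalize(z, dim=-1)
        c = F.normalize(self.centers, dim=-1)
        # Pull: move each behavior embedding toward its nearest center.
        sim = z @ c.t()
        pull = (1 - sim.max(dim=1).values).mean()
        # Separation: push different centers apart (penalize the most
        # similar pair of centers per row).
        off_diag = c @ c.t() - torch.eye(len(c))
        sep = off_diag.max(dim=1).values.mean()
        return pull + sep

module = LearnableClustering()
z = torch.randn(128, 64)            # toy batch of behavior embeddings
loss = module(z)
loss.backward()                     # centers receive gradients like any neuron
print(loss.item())
```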
+
+ comment: 37 pages +
+
+
+
+
+ + ♻ ☆ Pistis-RAG: Enhancing Retrieval-Augmented Generation with Human Feedback + + +
+ RAG systems face limitations when semantic relevance alone does not guarantee +improved generation quality. This issue becomes particularly evident due to the +sensitivity of large language models (LLMs) to the ordering of few-shot +prompts, which can affect model performance. To address this challenge, +aligning LLM outputs with human preferences using structured feedback, such as +options to copy, regenerate, or dislike, offers a promising method for +improvement. This feedback is applied to the entire list of inputs rather than +giving specific ratings for individual documents, making it a Listwide Labels +Learning-to-Rank task. + To address this task, we propose Pistis-RAG, a new RAG framework designed +with a content-centric approach to better align LLMs with human preferences. +Pistis-RAG effectively utilizes human feedback, enhancing content ranking and +generation quality. To validate our framework, we use public datasets to +simulate human feedback, allowing us to evaluate and refine our method +effectively. Experimental results indicate that Pistis-RAG improves alignment +with human preferences relative to the baseline RAG system, showing a 6.06% +increase in MMLU (English) and a 7.08% increase in C-EVAL (Chinese) accuracy +metrics. These results highlight Pistis-RAG's effectiveness in overcoming the +limitations associated with traditional RAG approaches. + +
+
+
+
+
+ + ♻ ☆ DTN: Deep Multiple Task-specific Feature Interactions Network for + Multi-Task Recommendation + + +
+ Neural-based multi-task learning (MTL) has been successfully applied to
+many recommendation applications. However, these MTL models (e.g., MMoE, PLE)
+do not consider feature interactions during optimization, which are crucial
+for capturing complex high-order features and are widely used in ranking
+models for real-world recommender systems. Moreover, through feature
+importance analysis across various tasks in MTL, we have observed an
+interesting divergence phenomenon: the same feature can have significantly
+different importance across different tasks in MTL. To address these issues,
+we propose Deep Multiple Task-specific Feature Interactions Network (DTN)
+with a novel model structure design. DTN introduces multiple diversified
+task-specific feature interaction methods and a task-sensitive network into
+the MTL framework, enabling the model to learn task-specific diversified
+feature interaction representations, which improves the efficiency of joint
+representation learning in a general setup. We applied DTN to our company's
+real-world E-commerce recommendation dataset, which consists of over 6.3
+billion samples; the results demonstrated that DTN significantly outperformed
+state-of-the-art MTL models. Moreover, during online evaluation of DTN in a
+large-scale E-commerce recommender system, we observed a 3.28% increase in
+clicks, a 3.10% increase in orders and a 2.70% increase in GMV (Gross
+Merchandise Value) compared to the state-of-the-art MTL models. Finally,
+extensive offline experiments conducted on public benchmark datasets
+demonstrate that DTN can be applied to various scenarios beyond
+recommendations, enhancing the performance of ranking models.
+
+
+
+
+
+ + ♻ ☆ Microstructures and Accuracy of Graph Recall by Large Language Models NeurIPS 2024 + + +
+ Graph data is crucial for many applications, and much of it exists in the
+relations described in textual format. As a result, being able to accurately
+recall and encode a graph described in earlier text is a basic yet pivotal
+ability that LLMs need to demonstrate if they are to perform reasoning tasks
+that involve graph-structured information. Human performance at graph recall
+has been studied by cognitive scientists for decades, and has been found to
+often exhibit certain structural patterns of bias that align with human
+handling of social relationships. To date, however, we know little about how
+LLMs behave in analogous graph recall tasks: do their recalled graphs also
+exhibit certain biased patterns, and if so, how do they compare with humans
+and affect other graph reasoning tasks? In this work, we perform the first
+systematic study of graph recall by LLMs, investigating the accuracy and
+biased microstructures (local structural patterns) in their recall. We find
+that LLMs not only often underperform in graph recall, but also tend to favor
+more triangles and alternating 2-paths. Moreover, we find that more advanced
+LLMs have a striking dependence on the domain that a real-world graph comes
+from -- by yielding the best recall accuracy when the graph is narrated in a
+language style consistent with its original domain.
+
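A recalled graph's microstructure bias can be quantified by comparing simple local-pattern counts against the original graph. A small sketch follows; plain open 2-paths (wedges) stand in for the paper's alternating 2-paths, which is a simplifying assumption:

```python
import numpy as np

def microstructure_counts(A):
    # Count triangles and open 2-paths in an undirected graph given its
    # adjacency matrix: a simple diagnostic for comparing a recalled graph
    # against the original.
    A = (A + A.T > 0).astype(int)
    np.fill_diagonal(A, 0)
    triangles = np.trace(A @ A @ A) // 6          # each triangle counted 6x
    deg = A.sum(axis=1)
    wedges = int((deg * (deg - 1) // 2).sum()) - 3 * triangles
    return {"triangles": int(triangles), "open_2_paths": wedges}

rng = np.random.default_rng(0)
original = (rng.random((12, 12)) < 0.3).astype(int)
recalled = original.copy()
recalled[2, 5] = recalled[5, 2] = 1               # a hallucinated extra edge
print(microstructure_counts(original))
print(microstructure_counts(recalled))            # bias toward extra triangles?
```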
+
+ comment: Accepted at NeurIPS 2024; Code available at: + https://github.com/Abel0828/llm-graph-recall +
+
+
+
+
+ + ♻ ☆ Recommendation Unlearning via Influence Function + + +
+ Recommendation unlearning is an emerging task to serve users by erasing
+unusable data (e.g., some historical behaviors) from a well-trained
+recommender model. Existing methods process unlearning requests by fully or
+partially retraining the model after removing the unusable data. However,
+these methods are impractical due to the high computation cost of full
+retraining and the likely performance damage of partial retraining. In this
+light, a desired recommendation unlearning method should obtain a model
+similar to the one from full retraining in a more efficient manner, i.e.,
+achieving complete, efficient and harmless unlearning.
+ In this work, we propose a new Influence Function-based Recommendation
+Unlearning (IFRU) framework, which efficiently updates the model without
+retraining by estimating the influence of the unusable data on the model via
+the influence function. Given that recent recommender models use historical
+data for both the construction of the optimization loss and the computational
+graph (e.g., neighborhood aggregation), IFRU jointly estimates the direct
+influence of unusable data on the optimization loss and the spillover
+influence on the computational graph to pursue complete unlearning.
+Furthermore, we propose an importance-based pruning algorithm to reduce the
+cost of the influence function. IFRU is harmless and applicable to mainstream
+differentiable models. Extensive experiments demonstrate that IFRU achieves
+more than 250 times acceleration compared to retraining-based methods with
+recommendation performance comparable to full retraining. Code is available
+at https://github.com/baiyimeng/IFRU.
+
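The influence-function update can be illustrated on a small model. The sketch below applies the standard leave-set-out approximation $\hat{w}_{-S} \approx \hat{w} + \frac{1}{n} H^{-1} \sum_{i \in S} \nabla \ell_i(\hat{w})$ to logistic regression; it shows the mechanism IFRU builds on, not the IFRU framework itself (which additionally handles spillover influence through the computational graph and uses importance-based pruning):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def grad_hess(w, X, y, reg=1e-2):
    # Mean gradient and Hessian of L2-regularized logistic loss.
    p = sigmoid(X @ w)
    g = X.T @ (p - y) / len(y) + reg * w
    H = (X.T * (p * (1 - p))) @ X / len(y) + reg * np.eye(len(w))
    return g, H

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
w_true = rng.normal(size=5)
y = (sigmoid(X @ w_true) > rng.random(500)).astype(float)

# Fit with Newton steps.
w = np.zeros(5)
for _ in range(20):
    g, H = grad_hess(w, X, y)
    w -= np.linalg.solve(H, g)

# "Unlearn" the first 10 points with one influence update, no retraining.
forget = slice(0, 10)
g_f, _ = grad_hess(w, X[forget], y[forget], reg=0.0)  # removed points' gradient
_, H = grad_hess(w, X, y)
w_unlearned = w + np.linalg.solve(H, g_f) * (10 / 500)
print(w_unlearned)
```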
+
+ comment: Accepted by ACM TORS +
+
+
+
+
+
+
+
+ + Machine Learning 65 + +
+
+
+ + ☆ Robust Gaussian Processes via Relevance Pursuit NeurIPS 2024 + + +
+ Gaussian processes (GPs) are non-parametric probabilistic regression
+models that are popular due to their flexibility, data efficiency, and
+well-calibrated uncertainty estimates. However, standard GP models assume
+homoskedastic Gaussian noise, while many real-world applications are subject
+to non-Gaussian corruptions. Variants of GPs that are more robust to
+alternative noise models have been proposed, but they entail significant
+trade-offs between accuracy and robustness, and between computational
+requirements and theoretical guarantees. In this work, we propose and study a
+GP model that achieves robustness against sparse outliers by inferring
+data-point-specific noise levels with a sequential selection procedure,
+termed relevance pursuit, that maximizes the log marginal likelihood. We
+show, surprisingly, that the model can be parameterized such that the
+associated log marginal likelihood is strongly concave in the
+data-point-specific noise variances, a property rarely found in either robust
+regression objectives or GP marginal likelihoods. This in turn implies the
+weak submodularity of the corresponding subset selection problem, and thereby
+proves approximation guarantees for the proposed algorithm. We compare the
+model's performance relative to other approaches on diverse regression and
+Bayesian optimization tasks, including the challenging but common setting of
+sparse corruptions of the labels within or close to the function range.
+
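A greedy sketch of the sequential selection idea: repeatedly grant extra noise variance to the single data point whose inclusion most increases the log marginal likelihood. The kernel, step size, and stopping rule below are illustrative assumptions, not the paper's parameterization:

```python
import numpy as np

def log_marginal_likelihood(K, y, noise):
    # GP log marginal likelihood with per-point noise variances.
    L = np.linalg.cholesky(K + np.diag(noise))
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y))
    return (-0.5 * y @ alpha - np.log(np.diag(L)).sum()
            - 0.5 * len(y) * np.log(2 * np.pi))

rng = np.random.default_rng(0)
X = np.linspace(0, 1, 60)[:, None]
y = np.sin(6 * X[:, 0]) + 0.1 * rng.normal(size=60)
y[[5, 30, 47]] += 3.0                          # sparse label corruptions

K = np.exp(-0.5 * ((X - X.T) / 0.1) ** 2)      # RBF kernel
noise = np.full(60, 0.01)

# Relevance pursuit (greedy sketch): at each step, raise the noise
# variance of the point that most improves the log marginal likelihood.
outliers = []
for _ in range(3):
    best, best_lml = None, -np.inf
    for i in range(60):
        if i in outliers:
            continue
        trial = noise.copy(); trial[i] += 9.0
        lml = log_marginal_likelihood(K, y, trial)
        if lml > best_lml:
            best, best_lml = i, lml
    outliers.append(best)
    noise[best] += 9.0

print("flagged outliers:", sorted(outliers))   # should typically recover 5, 30, 47
```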
+
+ comment: NeurIPS 2024 Article +
+
+
+
+
+ + ☆ Bridging Geometric States via Geometric Diffusion Bridge NeurIPS 2024 + + +
+ The accurate prediction of geometric state evolution in complex systems is +critical for advancing scientific domains such as quantum chemistry and +material modeling. Traditional experimental and computational methods face +challenges in terms of environmental constraints and computational demands, +while current deep learning approaches still fall short in terms of precision +and generality. In this work, we introduce the Geometric Diffusion Bridge +(GDB), a novel generative modeling framework that accurately bridges initial +and target geometric states. GDB leverages a probabilistic approach to evolve +geometric state distributions, employing an equivariant diffusion bridge +derived by a modified version of Doob's $h$-transform for connecting geometric +states. This tailored diffusion process is anchored by initial and target +geometric states as fixed endpoints and governed by equivariant transition +kernels. Moreover, trajectory data can be seamlessly leveraged in our GDB +framework by using a chain of equivariant diffusion bridges, providing a more +detailed and accurate characterization of evolution dynamics. Theoretically, we +conduct a thorough examination to confirm our framework's ability to preserve +joint distributions of geometric states and capability to completely model the +underlying dynamics inducing trajectory distributions with negligible error. +Experimental evaluations across various real-world scenarios show that GDB +surpasses existing state-of-the-art approaches, opening up a new pathway for +accurately bridging geometric states and tackling crucial scientific challenges +with improved accuracy and applicability. + +
+
+ comment: 33 pages, 5 tables; NeurIPS 2024 Camera Ready version +
+
+
+
+
+ + ☆ Teaching Embodied Reinforcement Learning Agents: Informativeness and + Diversity of Language Use EMNLP 2024 + + +
+ In real-world scenarios, it is desirable for embodied agents to have the +ability to leverage human language to gain explicit or implicit knowledge for +learning tasks. Despite recent progress, most previous approaches adopt simple +low-level instructions as language inputs, which may not reflect natural human +communication. It's not clear how to incorporate rich language use to +facilitate task learning. To address this question, this paper studies +different types of language inputs in facilitating reinforcement learning (RL) +embodied agents. More specifically, we examine how different levels of language +informativeness (i.e., feedback on past behaviors and future guidance) and +diversity (i.e., variation of language expressions) impact agent learning and +inference. Our empirical results based on four RL benchmarks demonstrate that +agents trained with diverse and informative language feedback can achieve +enhanced generalization and fast adaptation to new tasks. These findings +highlight the pivotal role of language use in teaching embodied agents new +tasks in an open world. Project website: +https://github.com/sled-group/Teachable_RL + +
+
+ comment: EMNLP 2024 Main. Project website: + https://github.com/sled-group/Teachable_RL +
+
+
+
+
+ + ☆ CaAdam: Improving Adam optimizer using connection aware methods + + +
+ We introduce a new method inspired by Adam that enhances convergence speed
+and achieves better loss function minima. Traditional optimizers, including
+Adam, apply uniform or globally adjusted learning rates across neural
+networks without considering their architectural specifics. This
+architecture-agnostic approach is deeply embedded in most deep learning
+frameworks, where optimizers are implemented as standalone modules without
+direct access to the network's structural information. For instance, in
+popular frameworks like Keras or PyTorch, optimizers operate solely on
+gradients and parameters, without knowledge of layer connectivity or network
+topology. Our algorithm, CaAdam, explores this overlooked area by introducing
+connection-aware optimization through carefully designed proxies of
+architectural information. We propose multiple scaling methodologies that
+dynamically adjust learning rates based on easily accessible structural
+properties such as layer depth, connection counts, and gradient
+distributions. This approach enables more granular optimization while working
+within the constraints of current deep learning frameworks. Empirical
+evaluations on standard datasets (e.g., CIFAR-10, Fashion MNIST) show that
+our method consistently achieves faster convergence and higher accuracy
+compared to the standard Adam optimizer, demonstrating the potential benefits
+of incorporating architectural awareness in optimization strategies.
+
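Because optimizers in frameworks like PyTorch see only gradients and parameters, architectural proxies have to be passed in explicitly, and parameter groups are one natural channel. The sketch below scales each layer's learning rate by a hypothetical depth-based factor, as one plausible instance of the connection-aware idea (the exact CaAdam scaling rules may differ):

```python
import torch
import torch.nn as nn

# Connection-aware sketch: expose architectural information to the
# optimizer via parameter groups, scaling each layer's learning rate
# by its depth (one of several proxies the abstract mentions).
model = nn.Sequential(
    nn.Linear(32, 64), nn.ReLU(),
    nn.Linear(64, 64), nn.ReLU(),
    nn.Linear(64, 10),
)

layers = [m for m in model if isinstance(m, nn.Linear)]
groups = []
for depth, layer in enumerate(layers, start=1):
    scale = 1.0 / depth ** 0.5        # hypothetical depth-based scaling rule
    groups.append({"params": layer.parameters(), "lr": 1e-3 * scale})

opt = torch.optim.Adam(groups)
x, target = torch.randn(16, 32), torch.randint(0, 10, (16,))
loss = nn.functional.cross_entropy(model(x), target)
opt.zero_grad(); loss.backward(); opt.step()
print([g["lr"] for g in opt.param_groups])    # per-layer learning rates
```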
+
+
+
+
+ + ☆ ARQ: A Mixed-Precision Quantization Framework for Accurate and + Certifiably Robust DNNs + + +
+ Mixed precision quantization has become an important technique for
+enabling the execution of deep neural networks (DNNs) on limited resource
+computing platforms. Traditional quantization methods have primarily
+concentrated on maintaining neural network accuracy, either ignoring the
+impact of quantization on the robustness of the network, or using only
+empirical techniques for improving robustness. In contrast, techniques for
+robustness certification, which can provide strong guarantees about the
+robustness of DNNs, have not been used during quantization due to their high
+computation cost.
+ This paper introduces ARQ, an innovative mixed-precision quantization
+method that not only preserves the clean accuracy of the smoothed classifiers
+but also maintains their certified robustness. ARQ uses reinforcement
+learning to find accurate and robust DNN quantization, while efficiently
+leveraging randomized smoothing, a popular class of statistical DNN
+verification algorithms, to guide the search process.
+ We compare ARQ with multiple state-of-the-art quantization techniques on
+several DNN architectures commonly used in quantization studies: ResNet-20 on
+CIFAR-10, ResNet-50 on ImageNet, and MobileNetV2 on ImageNet. We demonstrate
+that ARQ consistently performs better than these baselines across all the
+benchmarks and input perturbation levels. In many cases, the performance of
+ARQ-quantized networks can reach that of the original DNN with floating-point
+weights, but with only 1.5% of the instructions.
+
+
+
+
+
+ + ☆ TabM: Advancing Tabular Deep Learning with Parameter-Efficient + Ensembling + + +
+ Deep learning architectures for supervised learning on tabular data range +from simple multilayer perceptrons (MLP) to sophisticated Transformers and +retrieval-augmented methods. This study highlights a major, yet so far +overlooked opportunity for substantially improving tabular MLPs: namely, +parameter-efficient ensembling -- a paradigm for implementing an ensemble of +models as one model producing multiple predictions. We start by developing TabM +-- a simple model based on MLP and our variations of BatchEnsemble (an existing +technique). Then, we perform a large-scale evaluation of tabular DL +architectures on public benchmarks in terms of both task performance and +efficiency, which renders the landscape of tabular DL in a new light. +Generally, we show that MLPs, including TabM, form a line of stronger and more +practical models compared to attention- and retrieval-based architectures. In +particular, we find that TabM demonstrates the best performance among tabular +DL models. Lastly, we conduct an empirical analysis on the ensemble-like nature +of TabM. For example, we observe that the multiple predictions of TabM are weak +individually, but powerful collectively. Overall, our work brings an impactful +technique to tabular DL, analyses its behaviour, and advances the +performance-efficiency trade-off with TabM -- a simple and powerful baseline +for researchers and practitioners. + +
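Parameter-efficient ensembling in the BatchEnsemble style, which the abstract says TabM builds on, can be sketched as a short PyTorch module: one shared weight matrix plus rank-1 per-member "fast weights", so k ensemble members cost barely more than one model. A minimal illustration (not the TabM code):

```python
import torch
import torch.nn as nn

class BatchEnsembleLinear(nn.Module):
    # One shared weight matrix plus rank-1 "fast weights" (r_i, s_i) per
    # ensemble member: member i computes ((x * r_i) @ W) * s_i + b_i.
    def __init__(self, d_in, d_out, k=4):
        super().__init__()
        self.W = nn.Parameter(torch.randn(d_in, d_out) * d_in ** -0.5)
        self.r = nn.Parameter(torch.ones(k, d_in))
        self.s = nn.Parameter(torch.ones(k, d_out))
        self.b = nn.Parameter(torch.zeros(k, d_out))

    def forward(self, x):                     # x: (batch, k, d_in)
        return (x * self.r) @ self.W * self.s + self.b

layer = BatchEnsembleLinear(16, 8, k=4)
x = torch.randn(32, 16)
x = x.unsqueeze(1).expand(-1, 4, -1)          # share the input across members
out = layer(x)                                # (32, 4, 8): one prediction each
print(out.mean(dim=1).shape)                  # average members at inference
```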
+
+ comment: Code: https://github.com/yandex-research/tabm +
+
+
+
+
+ + ☆ Understanding Optimization in Deep Learning with Central Flows + + +
+ Optimization in deep learning remains poorly understood, even in the simple +setting of deterministic (i.e. full-batch) training. A key difficulty is that +much of an optimizer's behavior is implicitly determined by complex oscillatory +dynamics, referred to as the "edge of stability." The main contribution of this +paper is to show that an optimizer's implicit behavior can be explicitly +captured by a "central flow:" a differential equation which models the +time-averaged optimization trajectory. We show that these flows can empirically +predict long-term optimization trajectories of generic neural networks with a +high degree of numerical accuracy. By interpreting these flows, we reveal for +the first time 1) the precise sense in which RMSProp adapts to the local loss +landscape, and 2) an "acceleration via regularization" mechanism, wherein +adaptive optimizers implicitly navigate towards low-curvature regions in which +they can take larger steps. This mechanism is key to the efficacy of these +adaptive optimizers. Overall, we believe that central flows constitute a +promising tool for reasoning about optimization in deep learning. + +
+
+ comment: first two authors contributed equally; author order determined by + coin flip +
+
+
+
+
+ + ☆ DexMimicGen: Automated Data Generation for Bimanual Dexterous + Manipulation via Imitation Learning + + +
+ Imitation learning from human demonstrations is an effective means to
+teach robots manipulation skills. But data acquisition is a major bottleneck
+in applying this paradigm more broadly, due to the amount of cost and human
+effort involved. There has been significant interest in imitation learning
+for bimanual dexterous robots, like humanoids. Unfortunately, data collection
+is even more challenging here due to the difficulty of simultaneously
+controlling multiple arms and multi-fingered hands. Automated data generation
+in simulation is a compelling, scalable alternative to fuel this need for
+data. To this end, we introduce DexMimicGen, a large-scale automated data
+generation system that synthesizes trajectories from a handful of human
+demonstrations for humanoid robots with dexterous hands. We present a
+collection of simulation environments in the setting of bimanual dexterous
+manipulation, spanning a range of manipulation behaviors and different
+requirements for coordination among the two arms. We generate 21K demos
+across these tasks from just 60 source human demos and study the effect of
+several data generation and policy learning decisions on agent performance.
+Finally, we present a real-to-sim-to-real pipeline and deploy it on a
+real-world humanoid can sorting task. Videos and more are at
+https://dexmimicgen.github.io/
+
+
+ comment: Project website: https://dexmimicgen.github.io/ +
+
+
+
+
+ + ☆ AR-Pro: Counterfactual Explanations for Anomaly Repair with Formal + Properties + + +
+ Anomaly detection is widely used for identifying critical errors and +suspicious behaviors, but current methods lack interpretability. We leverage +common properties of existing methods and recent advances in generative models +to introduce counterfactual explanations for anomaly detection. Given an input, +we generate its counterfactual as a diffusion-based repair that shows what a +non-anomalous version should have looked like. A key advantage of this approach +is that it enables a domain-independent formal specification of explainability +desiderata, offering a unified framework for generating and evaluating +explanations. We demonstrate the effectiveness of our anomaly explainability +framework, AR-Pro, on vision (MVTec, VisA) and time-series (SWaT, WADI, HAI) +anomaly datasets. The code used for the experiments is accessible at: +https://github.com/xjiae/arpro. + +
+
+
+
+
+ + ☆ DC-Spin: A Speaker-invariant Speech Tokenizer for Spoken Language Models + + +
+ Spoken language models (SLMs) have gained increasing attention with
+advancements in text-based, decoder-only language models. SLMs process text
+and speech, enabling simultaneous speech understanding and generation. This
+paper presents Double-Codebook Speaker-invariant Clustering (DC-Spin), which
+aims to improve speech tokenization by bridging audio signals and SLM tokens.
+DC-Spin extracts speaker-invariant tokens rich in phonetic information and
+resilient to input variations, enhancing zero-shot SLM tasks and speech
+resynthesis. We propose a chunk-wise approach to enable streamable DC-Spin
+without retraining or degradation. Comparisons of tokenization methods
+(self-supervised and neural audio codecs), model scalability, and downstream
+task proxies show that tokens easily modeled by an n-gram LM or aligned with
+phonemes offer strong performance, providing insights for designing speech
+tokenizers for SLMs.
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ The Importance of Being Scalable: Improving the Speed and Accuracy of + Neural Network Interatomic Potentials Across Chemical Domains NeurIPS 2024 + + +
+ Scaling has been critical in improving model performance and generalization +in machine learning. It involves how a model's performance changes with +increases in model size or input data, as well as how efficiently computational +resources are utilized to support this growth. Despite successes in other +areas, the study of scaling in Neural Network Interatomic Potentials (NNIPs) +remains limited. NNIPs act as surrogate models for ab initio quantum mechanical +calculations. The dominant paradigm here is to incorporate many physical domain +constraints into the model, such as rotational equivariance. We contend that +these complex constraints inhibit the scaling ability of NNIPs, and are likely +to lead to performance plateaus in the long run. In this work, we take an +alternative approach and start by systematically studying NNIP scaling +strategies. Our findings indicate that scaling the model through attention +mechanisms is efficient and improves model expressivity. These insights +motivate us to develop an NNIP architecture designed for scalability: the +Efficiently Scaled Attention Interatomic Potential (EScAIP). EScAIP leverages a +multi-head self-attention formulation within graph neural networks, applying +attention at the neighbor-level representations. Implemented with +highly-optimized attention GPU kernels, EScAIP achieves substantial gains in +efficiency--at least 10x faster inference, 5x less memory usage--compared to +existing NNIPs. EScAIP also achieves state-of-the-art performance on a wide +range of datasets including catalysts (OC20 and OC22), molecules (SPICE), and +materials (MPTrj). We emphasize that our approach should be thought of as a +philosophy rather than a specific model, representing a proof-of-concept for +developing general-purpose NNIPs that achieve better expressivity through +scaling, and continue to scale efficiently with increased computational +resources and training data. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Approaches to human activity recognition via passive radar + + +
+ The thesis explores novel methods for Human Activity Recognition (HAR)
+using passive radar with a focus on non-intrusive Wi-Fi Channel State
+Information (CSI) data. Traditional HAR approaches often use intrusive
+sensors like cameras or wearables, raising privacy issues. This study
+leverages the non-intrusive nature of CSI, using Spiking Neural Networks
+(SNNs) to interpret signal variations caused by human movements. These
+networks, integrated with symbolic reasoning frameworks such as DeepProbLog,
+enhance the adaptability and interpretability of HAR systems. SNNs offer
+reduced power consumption, ideal for privacy-sensitive applications.
+Experimental results demonstrate that SNN-based neurosymbolic models achieve
+high accuracy, making them a promising alternative for HAR across various
+domains.
+
+
+
+
+
+ + ☆ $π_0$: A Vision-Language-Action Flow Model for General Robot Control + + +
+ Robot learning holds tremendous promise to unlock the full potential of +flexible, general, and dexterous robot systems, as well as to address some of +the deepest questions in artificial intelligence. However, bringing robot +learning to the level of generality required for effective real-world systems +faces major obstacles in terms of data, generalization, and robustness. In this +paper, we discuss how generalist robot policies (i.e., robot foundation models) +can address these challenges, and how we can design effective generalist robot +policies for complex and highly dexterous tasks. We propose a novel flow +matching architecture built on top of a pre-trained vision-language model (VLM) +to inherit Internet-scale semantic knowledge. We then discuss how this model +can be trained on a large and diverse dataset from multiple dexterous robot +platforms, including single-arm robots, dual-arm robots, and mobile +manipulators. We evaluate our model in terms of its ability to perform tasks in +zero shot after pre-training, follow language instructions from people and from +a high-level VLM policy, and its ability to acquire new skills via fine-tuning. +Our results cover a wide variety of tasks, such as laundry folding, table +cleaning, and assembling boxes. + +
+
+ comment: See project website for videos: + https://physicalintelligence.company/blog/pi0 +
+
+
+
+
+ + ☆ Conformalized Prediction of Post-Fault Voltage Trajectories Using + Pre-trained and Finetuned Attention-Driven Neural Operators + + +
+ This paper proposes a new data-driven methodology for predicting intervals
+of post-fault voltage trajectories in power systems. We begin by introducing
+the Quantile Attention-Fourier Deep Operator Network (QAF-DeepONet), designed
+to capture the complex dynamics of voltage trajectories and reliably estimate
+quantiles of the target trajectory without any distributional assumptions.
+The proposed operator regression model maps the observed portion of the
+voltage trajectory to its unobserved post-fault trajectory. Our methodology
+employs a pre-training and fine-tuning process to address the challenge of
+limited data availability. To ensure data privacy while learning the
+pre-trained model, we use federated learning across neighboring buses,
+enabling the model to learn the underlying voltage dynamics of such buses
+without directly sharing their data. After pre-training, we fine-tune the
+model with data from the target bus, allowing it to adapt to unique dynamics
+and operating conditions. Finally, we integrate conformal prediction into the
+fine-tuned model to ensure coverage guarantees for the predicted intervals.
+We evaluated the performance of the proposed methodology using the New
+England 39-bus test system, considering detailed models of voltage and
+frequency controllers. Two metrics, Prediction Interval Coverage Probability
+(PICP) and Prediction Interval Normalized Average Width (PINAW), are used to
+numerically assess the model's performance in predicting intervals. The
+results show that the proposed approach offers practical and reliable
+uncertainty quantification in predicting the interval of post-fault voltage
+trajectories.
+
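The conformal step can be illustrated with split conformal calibration of quantile predictions (conformalized quantile regression). The sketch below is a generic instance of that recipe, not the paper's exact procedure; the toy calibration data stand in for the model's predicted quantiles:

```python
import numpy as np

def conformalize(lo_cal, hi_cal, y_cal, alpha=0.1):
    # CQR-style conformity score: how far y falls outside [lo, hi].
    scores = np.maximum(lo_cal - y_cal, y_cal - hi_cal)
    n = len(y_cal)
    level = np.ceil((n + 1) * (1 - alpha)) / n
    return np.quantile(scores, level, method="higher")

rng = np.random.default_rng(0)
f = rng.normal(size=500)                       # model point predictions
y_cal = f + rng.normal(scale=1.0, size=500)    # held-out calibration targets
lo_cal, hi_cal = f - 0.5, f + 0.5              # under-covering raw quantiles

q = conformalize(lo_cal, hi_cal, y_cal)        # widen both ends by q
lo_test, hi_test = -1.0, 1.0                   # raw interval for a new input
print("calibrated 90% interval:", (lo_test - q, hi_test + q))
```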
+
+
+
+
+ + ☆ Dense Associative Memory Through the Lens of Random Features NeurIPS 2024 + + +
+ Dense Associative Memories are high storage capacity variants of the
+Hopfield networks that are capable of storing a large number of memory
+patterns in the weights of a network of a given size. Their common
+formulations typically require storing each pattern in a separate set of
+synaptic weights, which increases the number of synaptic weights as new
+patterns are introduced. In this work we propose an alternative formulation
+of this class of models using random features, commonly used in kernel
+methods. In this formulation, the number of the network's parameters remains
+fixed. At the same time, new memories can be added to the network by
+modifying existing weights. We show that this novel network closely
+approximates the energy function and dynamics of conventional Dense
+Associative Memories and shares their desirable computational properties.
+
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Conformal prediction of circular data + + +
+ Split conformal prediction techniques are applied to regression problems with +circular responses by introducing a suitable conformity score, leading to +prediction sets with adaptive arc length and finite-sample coverage guarantees +for any circular predictive model under exchangeable data. Leveraging the high +performance of existing predictive models designed for linear responses, we +analyze a general projection procedure that converts any linear response +regression model into one suitable for circular responses. When random forests +serve as basis models in this projection procedure, we harness the out-of-bag +dynamics to eliminate the necessity for a separate calibration sample in the +construction of prediction sets. For synthetic and real datasets the resulting +projected random forests model produces more efficient out-of-bag conformal +prediction sets, with shorter median arc length, when compared to the split +conformal prediction sets generated by two existing alternative models. + +
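A minimal version of this recipe: score each calibration point by the angular distance between the predicted and true angle, take the appropriate empirical quantile, and report an arc of that half-width. Note this simple score gives fixed-width arcs, whereas the paper's conformity score yields adaptive arc lengths; the sketch below is illustrative only:

```python
import numpy as np

def angular_distance(a, b):
    # Shortest arc length between two angles, in [0, pi].
    d = np.abs(a - b) % (2 * np.pi)
    return np.minimum(d, 2 * np.pi - d)

def circular_conformal_radius(pred_cal, y_cal, alpha=0.1):
    # Conformity score: angular distance between prediction and truth.
    scores = angular_distance(pred_cal, y_cal)
    n = len(scores)
    level = np.ceil((n + 1) * (1 - alpha)) / n
    return np.quantile(scores, level, method="higher")

rng = np.random.default_rng(0)
y_cal = rng.uniform(0, 2 * np.pi, 300)                       # true angles
pred_cal = (y_cal + rng.vonmises(0.0, 8.0, 300)) % (2 * np.pi)

r = circular_conformal_radius(pred_cal, y_cal)
theta_new = 0.3                                              # new prediction
arc = ((theta_new - r) % (2 * np.pi), (theta_new + r) % (2 * np.pi))
print(f"prediction set: arc of half-width {r:.2f} around {theta_new}: {arc}")
```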
+
+ comment: 7 pages; 4 figures +
+
+
+
+
+ + ♻ ☆ From Linear to Linearizable Optimization: A Novel Framework with + Applications to Stationary and Non-stationary DR-submodular Optimization NeurIPS + 2024 + + +
+ This paper introduces the notion of upper-linearizable/quadratizable +functions, a class that extends concavity and DR-submodularity in various +settings, including monotone and non-monotone cases over different convex sets. +A general meta-algorithm is devised to convert algorithms for linear/quadratic +maximization into ones that optimize upper-linearizable/quadratizable +functions, offering a unified approach to tackling concave and DR-submodular +optimization problems. The paper extends these results to multiple feedback +settings, facilitating conversions between semi-bandit/first-order feedback and +bandit/zeroth-order feedback, as well as between first/zeroth-order feedback +and semi-bandit/bandit feedback. Leveraging this framework, new algorithms are +derived using existing results as base algorithms for convex optimization, +improving upon state-of-the-art results in various cases. Dynamic and adaptive +regret guarantees are obtained for DR-submodular maximization, marking the +first algorithms to achieve such guarantees in these settings. Notably, the +paper achieves these advancements with fewer assumptions compared to existing +state-of-the-art results, underscoring its broad applicability and theoretical +contributions to non-convex optimization. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ♻ ☆ Still More Shades of Null: An Evaluation Suite for Responsible Missing + Value Imputation + + +
+ Data missingness is a practical challenge of sustained interest to the
+scientific community. In this paper, we present Shades-of-Null, an evaluation
+suite for responsible missing value imputation. Our work is novel in two
+ways: (i) we model realistic and socially-salient missingness scenarios that
+go beyond Rubin's classic Missing Completely at Random (MCAR), Missing At
+Random (MAR) and Missing Not At Random (MNAR) settings, to include
+multi-mechanism missingness (when different missingness patterns co-exist in
+the data) and missingness shift (when the missingness mechanism changes
+between training and test); and (ii) we evaluate imputers holistically, based
+on imputation quality, as well as on the predictive performance, fairness and
+stability of the models that are trained and tested on the data
+post-imputation.
+ We use Shades-of-Null to conduct a large-scale empirical study involving
+23,940 experimental pipelines, and find that while there is no single
+best-performing imputation approach for all missingness types, interesting
+trade-offs arise between predictive performance, fairness and stability,
+based on the combination of missingness scenario, imputer choice, and the
+architecture of the predictive model. We make Shades-of-Null publicly
+available, to enable researchers to rigorously evaluate missing value
+imputation methods on a wide range of metrics in plausible and socially
+meaningful scenarios.
+
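The three classic mechanisms, and the multi-mechanism setting the suite adds, are easy to make concrete with masking functions. A hedged sketch (column names and missingness rates are invented for illustration):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({"income": rng.lognormal(10, 1, 1000),
                   "age": rng.integers(18, 80, 1000)})

def mask_mcar(col, p=0.2):
    # Missing Completely At Random: uniform missingness.
    return rng.random(len(col)) < p

def mask_mar(col, driver, p_hi=0.4, p_lo=0.05):
    # Missing At Random: missingness depends on another observed column.
    return rng.random(len(col)) < np.where(driver > driver.median(), p_hi, p_lo)

def mask_mnar(col, p_hi=0.4, p_lo=0.05):
    # Missing Not At Random: missingness depends on the value itself.
    return rng.random(len(col)) < np.where(col > col.median(), p_hi, p_lo)

# Multi-mechanism missingness: different mechanisms co-exist in one table.
df.loc[mask_mcar(df["age"]), "age"] = np.nan
df.loc[mask_mnar(df["income"]), "income"] = np.nan
print(df.isna().mean())
```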
+
+
+
+
+ + ♻ ☆ OPERA: Automatic Offline Policy Evaluation with Re-weighted Aggregates + of Multiple Estimators + + +
+ Offline policy evaluation (OPE) allows us to evaluate and estimate a new +sequential decision-making policy's performance by leveraging historical +interaction data collected from other policies. Evaluating a new policy online +without a confident estimate of its performance can lead to costly, unsafe, or +hazardous outcomes, especially in education and healthcare. Several OPE +estimators have been proposed in the last decade, many of which have +hyperparameters and require training. Unfortunately, choosing the best OPE +algorithm for each task and domain is still unclear. In this paper, we propose +a new algorithm that adaptively blends a set of OPE estimators given a dataset +without relying on an explicit selection using a statistical procedure. We +prove that our estimator is consistent and satisfies several desirable +properties for policy evaluation. Additionally, we demonstrate that when +compared to alternative approaches, our estimator can be used to select +higher-performing policies in healthcare and robotics. Our work contributes to +improving ease of use for a general-purpose, estimator-agnostic, off-policy +evaluation framework for offline RL. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Diverse Explanations From Data-Driven and Domain-Driven Perspectives in + the Physical Sciences + + +
+ Machine learning methods have been remarkably successful in material science, +providing novel scientific insights, guiding future laboratory experiments, and +accelerating materials discovery. Despite the promising performance of these +models, understanding the decisions they make is also essential to ensure the +scientific value of their outcomes. However, there is a recent and ongoing +debate about the diversity of explanations, which potentially leads to +scientific inconsistency. This Perspective explores the sources and +implications of these diverse explanations in ML applications for physical +sciences. Through three case studies in materials science and molecular +property prediction, we examine how different models, explanation methods, +levels of feature attribution, and stakeholder needs can result in varying +interpretations of ML outputs. Our analysis underscores the importance of +considering multiple perspectives when interpreting ML models in scientific +contexts and highlights the critical need for scientists to maintain control +over the interpretation process, balancing data-driven insights with domain +expertise to meet specific scientific needs. By fostering a comprehensive +understanding of these inconsistencies, we aim to contribute to the responsible +integration of eXplainable Artificial Intelligence (XAI) into physical sciences +and improve the trustworthiness of ML applications in scientific discovery. + +
+
+
+
+
+ + ♻ ☆ Deep Submodular Peripteral Networks NeurIPS 2024 + + +
+ Submodular functions, crucial for various applications, often lack
+practical learning methods for their acquisition. Seemingly unrelated,
+learning a scaling from oracles offering graded pairwise comparisons (GPC) is
+underexplored, despite a rich history in psychometrics. In this paper, we
+introduce deep submodular peripteral networks (DSPNs), a novel parametric
+family of submodular functions, and methods for their training using a
+GPC-based strategy to connect and then tackle both of the above challenges.
+We introduce a newly devised GPC-style ``peripteral'' loss which leverages
+numerically graded relationships between pairs of objects (sets in our case).
+Unlike traditional contrastive learning, or RLHF preference ranking, our
+method utilizes graded comparisons, extracting more nuanced information than
+just binary-outcome comparisons, and contrasts sets of any size (not just
+two). We also define a novel suite of automatic sampling strategies for
+training, including active-learning inspired submodular feedback. We
+demonstrate DSPNs' efficacy in learning submodularity from a costly target
+submodular function and demonstrate its superiority both for experimental
+design and online streaming applications.
+
+
+ comment: Accepted at NeurIPS 2024 as spotlight presentation +
+
+
+
+
+ + ♻ ☆ Convergence for Natural Policy Gradient on Infinite-State Queueing MDPs + + +
+ A wide variety of queueing systems can be naturally modeled as
+infinite-state Markov Decision Processes (MDPs). In the reinforcement
+learning (RL) context, a variety of algorithms have been developed to learn
+and optimize these MDPs. At the heart of many popular policy-gradient based
+learning algorithms, such as natural actor-critic, TRPO, and PPO, lies the
+Natural Policy Gradient (NPG) policy optimization algorithm. Convergence
+results for these RL algorithms rest on convergence results for the NPG
+algorithm. However, all existing results on the convergence of the NPG
+algorithm are limited to finite-state settings.
+ We study a general class of queueing MDPs, and prove a $O(1/\sqrt{T})$
+convergence rate for the NPG algorithm, if the NPG algorithm is initialized
+with the MaxWeight policy. This is the first convergence rate bound for the
+NPG algorithm for a general class of infinite-state average-reward MDPs.
+Moreover, our result applies beyond the queueing setting, to any
+countably-infinite MDP satisfying certain mild structural assumptions, given
+a sufficiently good initial policy. Key to our result are state-dependent
+bounds on the relative value function achieved by the iterate policies of the
+NPG algorithm.
+
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ GRANOLA: Adaptive Normalization for Graph Neural Networks NeurIPS 2024 + + +
+ In recent years, significant efforts have been made to refine the design of +Graph Neural Network (GNN) layers, aiming to overcome diverse challenges, such +as limited expressive power and oversmoothing. Despite their widespread +adoption, the incorporation of off-the-shelf normalization layers like +BatchNorm or InstanceNorm within a GNN architecture may not effectively capture +the unique characteristics of graph-structured data, potentially reducing the +expressive power of the overall architecture. Moreover, existing graph-specific +normalization layers often struggle to offer substantial and consistent +benefits. In this paper, we propose GRANOLA, a novel graph-adaptive +normalization layer. Unlike existing normalization layers, GRANOLA normalizes +node features by adapting to the specific characteristics of the graph, +particularly by generating expressive representations of its neighborhood +structure, obtained by leveraging the propagation of Random Node Features (RNF) +in the graph. We present theoretical results that support our design choices. +Our extensive empirical evaluation of various graph benchmarks underscores the +superior performance of GRANOLA over existing normalization techniques. +Furthermore, GRANOLA emerges as the top-performing method among all baselines +within the same time complexity of Message Passing Neural Networks (MPNNs). + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Reasoning and Tools for Human-Level Forecasting + + +
+ Language models (LMs) trained on web-scale datasets are largely successful +due to their ability to memorize large amounts of training data, even if only +present in a few examples. These capabilities are often desirable in evaluation +on tasks such as question answering but raise questions about whether these +models can exhibit genuine reasoning or succeed only at mimicking patterns from +the training data. This distinction is particularly salient in forecasting +tasks, where the answer is not present in the training data, and the model must +reason to make logical deductions. We present Reasoning and Tools for +Forecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can +dynamically retrieve updated information and run numerical simulation with +equipped tools. We evaluate our model with questions from competitive +forecasting platforms and demonstrate that our method is competitive with and +can outperform human predictions. This suggests that LMs, with the right tools, +can indeed think and adapt like humans, offering valuable insights for +real-world decision-making. + +
+
+
+
+
+ + ♻ ☆ Monomial Matrix Group Equivariant Neural Functional Networks NeurIPS 2024 + + +
+ Neural functional networks (NFNs) have recently gained significant
+attention due to their diverse applications, ranging from predicting network
+generalization and network editing to classifying implicit neural
+representation. Previous NFN designs often depend on permutation symmetries
+in neural networks' weights, which traditionally arise from the unordered
+arrangement of neurons in hidden layers. However, these designs do not take
+into account the weight scaling symmetries of ReLU networks, and the weight
+sign flipping symmetries of $\sin$ or $\tanh$ networks. In this paper, we
+extend the study of the group action on the network weights from the group of
+permutation matrices to the group of monomial matrices by incorporating
+scaling/sign-flipping symmetries. Particularly, we encode these
+scaling/sign-flipping symmetries by designing our corresponding equivariant
+and invariant layers. We name our new family of NFNs the Monomial Matrix
+Group Equivariant Neural Functional Networks (Monomial-NFN). Because of the
+expansion of the symmetries, Monomial-NFN has much fewer independent
+trainable parameters compared to the baseline NFNs in the literature, thus
+enhancing the model's efficiency. Moreover, for fully connected and
+convolutional neural networks, we theoretically prove that all groups that
+leave these networks invariant while acting on their weight spaces are some
+subgroups of the monomial matrix group. We provide empirical evidence to
+demonstrate the advantages of our model over existing baselines, achieving
+competitive performance and efficiency.
+
+
+ comment: 10 pages in the main text. Published at NeurIPS 2024. The code is + available at https://github.com/MathematicalAI-NUS/Monomial-NFN +
+
+
+
+
+ + ♻ ☆ Prompt Tuning Strikes Back: Customizing Foundation Models with Low-Rank + Prompt Adaptation + + +
+ Parameter-Efficient Fine-Tuning (PEFT) has become the standard for +customising Foundation Models (FMs) to user-specific downstream tasks. However, +typical PEFT methods require storing multiple task-specific adapters, creating +scalability issues as these adapters must be housed and run at the FM server. +Traditional prompt tuning offers a potential solution by customising them +through task-specific input prefixes, but it under-performs compared to other +PEFT methods like LoRA. To address this gap, we propose Low-Rank Prompt +Adaptation (LoPA), a prompt-tuning-based approach that performs on par with +state-of-the-art PEFT methods and full fine-tuning while being more +parameter-efficient and not requiring a server-based adapter. LoPA generates +soft prompts by balancing between sharing task-specific information across +instances and customization for each instance. It uses a low-rank decomposition +of the soft-prompt component encoded for each instance to achieve parameter +efficiency. We provide a comprehensive evaluation on multiple natural language +understanding and code generation and understanding tasks across a wide range +of foundation models with varying sizes. + +
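The abstract's description suggests a soft prompt assembled from a shared task-level component and a low-rank, instance-specific component. The module below is a hypothetical reconstruction of that idea (the combination used in the actual paper may differ); `to_u` and `to_v` are invented names:

```python
import torch
import torch.nn as nn

class LoPAPrompt(nn.Module):
    # Hypothetical sketch: soft prompt = shared task component plus a
    # low-rank, instance-specific correction u @ v^T, in the spirit of the
    # low-rank decomposition the abstract describes.
    def __init__(self, d_model=768, prompt_len=10, rank=4, d_enc=128):
        super().__init__()
        self.shared = nn.Parameter(torch.randn(prompt_len, d_model) * 0.02)
        self.to_u = nn.Linear(d_enc, prompt_len * rank)   # instance -> u
        self.to_v = nn.Linear(d_enc, d_model * rank)      # instance -> v
        self.prompt_len, self.rank, self.d_model = prompt_len, rank, d_model

    def forward(self, inst_enc):                 # inst_enc: (batch, d_enc)
        b = inst_enc.size(0)
        u = self.to_u(inst_enc).view(b, self.prompt_len, self.rank)
        v = self.to_v(inst_enc).view(b, self.d_model, self.rank)
        delta = u @ v.transpose(1, 2)            # (b, prompt_len, d_model)
        return self.shared.unsqueeze(0) + delta  # per-instance soft prompt

module = LoPAPrompt()
prompts = module(torch.randn(2, 128))            # prepend to the FM's input
print(prompts.shape)                             # torch.Size([2, 10, 768])
```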
+
+ comment: 14 pages, 8 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large + Language Models NeurIPS + 2024 + + +
+ Jailbreak attacks cause large language models (LLMs) to generate harmful, +unethical, or otherwise objectionable content. Evaluating these attacks +presents a number of challenges, which the current collection of benchmarks and +evaluation techniques do not adequately address. First, there is no clear +standard of practice regarding jailbreaking evaluation. Second, existing works +compute costs and success rates in incomparable ways. And third, numerous works +are not reproducible, as they withhold adversarial prompts, involve +closed-source code, or rely on evolving proprietary APIs. To address these +challenges, we introduce JailbreakBench, an open-sourced benchmark with the +following components: (1) an evolving repository of state-of-the-art +adversarial prompts, which we refer to as jailbreak artifacts; (2) a +jailbreaking dataset comprising 100 behaviors -- both original and sourced from +prior work (Zou et al., 2023; Mazeika et al., 2023, 2024) -- which align with +OpenAI's usage policies; (3) a standardized evaluation framework at +https://github.com/JailbreakBench/jailbreakbench that includes a clearly +defined threat model, system prompts, chat templates, and scoring functions; +and (4) a leaderboard at https://jailbreakbench.github.io/ that tracks the +performance of attacks and defenses for various LLMs. We have carefully +considered the potential ethical implications of releasing this benchmark, and +believe that it will be a net positive for the community. + +
+
+ comment: The camera-ready version of JailbreakBench v1.0 (accepted at NeurIPS + 2024 Datasets and Benchmarks Track): more attack artifacts, more test-time + defenses, a more accurate jailbreak judge (Llama-3-70B with a custom prompt), + a larger dataset of human preferences for selecting a jailbreak judge (300 + examples), an over-refusal evaluation dataset, a semantic refusal judge based + on Llama-3-8B +
+
+
+
+
+ + ♻ ☆ Med-Real2Sim: Non-Invasive Medical Digital Twins using Physics-Informed + Self-Supervised Learning + + +
+ A digital twin is a virtual replica of a real-world physical phenomenon that +uses mathematical modeling to characterize and simulate its defining features. +By constructing digital twins for disease processes, we can perform in-silico +simulations that mimic patients' health conditions and counterfactual outcomes +under hypothetical interventions in a virtual setting. This eliminates the need +for invasive procedures or uncertain treatment decisions. In this paper, we +propose a method to identify digital twin model parameters using only +noninvasive patient health data. We approach digital twin modeling as a +composite inverse problem and observe that its structure resembles pretraining +and finetuning in self-supervised learning (SSL). Leveraging this, we introduce +a physics-informed SSL algorithm that initially pretrains a neural network on +the pretext task of learning a differentiable simulator of a physiological +process. Subsequently, the model is trained to reconstruct physiological +measurements from noninvasive modalities while being constrained by the +physical equations learned in pretraining. We apply our method to identify +digital twins of cardiac hemodynamics using noninvasive echocardiogram videos, +and demonstrate its utility in unsupervised disease detection and in-silico +clinical trials. + +
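A schematic two-stage loop mirroring the pretrain/finetune structure described above, with a toy two-parameter "physiology" standing in for the cardiac hemodynamics model. Note the paper fits an encoder from noninvasive modalities, whereas this sketch optimizes the parameters directly.

```python
# Stage 1: learn a differentiable surrogate of a physics simulator (pretext task).
# Stage 2: identify parameters from a measurement through the frozen surrogate.
import torch
import torch.nn as nn

def physics_simulator(params: torch.Tensor) -> torch.Tensor:
    """Toy 'physiology': amplitude and frequency parameters -> observed waveform."""
    t = torch.linspace(0, 1, 50)
    return params[:, :1] * torch.sin(2 * torch.pi * params[:, 1:2] * t)

surrogate = nn.Sequential(nn.Linear(2, 64), nn.Tanh(), nn.Linear(64, 50))
opt = torch.optim.Adam(surrogate.parameters(), lr=1e-2)

for _ in range(500):                          # pretext stage
    p = torch.rand(32, 2)
    loss = ((surrogate(p) - physics_simulator(p)) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()

for prm in surrogate.parameters():            # freeze the learned physics
    prm.requires_grad_(False)
measurement = physics_simulator(torch.tensor([[0.8, 0.5]]))
theta = nn.Parameter(torch.rand(1, 2))
opt2 = torch.optim.Adam([theta], lr=5e-2)
for _ in range(300):                          # identification stage
    loss = ((surrogate(theta) - measurement) ** 2).mean()
    opt2.zero_grad(); loss.backward(); opt2.step()
print(theta.detach())  # should move toward the true parameters [0.8, 0.5]
```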
+
+
+
+
+ + ♻ ☆ Generalized Principal-Agent Problem with a Learning Agent + + +
+ Classic principal-agent problems such as Stackelberg games, contract design, +and Bayesian persuasion, often assume that the agent is able to best respond to +the principal's committed strategy. We study repeated generalized +principal-agent problems under the assumption that the principal does not have +commitment power and the agent uses algorithms to learn to respond to the +principal. We reduce this problem to a one-shot generalized principal-agent +problem where the agent approximately best responds. Using this reduction, we +show that: (1) If the agent uses contextual no-regret learning algorithms with +regret $\mathrm{Reg}(T)$, then the principal can guarantee utility at least +$U^* - \Theta\big(\sqrt{\tfrac{\mathrm{Reg}(T)}{T}}\big)$, where $U^*$ is the +principal's optimal utility in the classic model with a best-responding agent. +(2) If the agent uses contextual no-swap-regret learning algorithms with +swap-regret $\mathrm{SReg}(T)$, then the principal cannot obtain utility more +than $U^* + O\big(\frac{\mathrm{SReg}(T)}{T}\big)$. But (3) if the agent uses +mean-based learning algorithms (which can be no-regret but not no-swap-regret), +then the principal can sometimes do significantly better than $U^*$. These +results not only refine previous results in Stackelberg games and contract +design, but also lead to new results for Bayesian persuasion with a learning +agent and all generalized principal-agent problems where the agent does not +have private information. + +
+
+
+
+
+ + ♻ ☆ LLMs are Highly-Constrained Biophysical Sequence Optimizers + + +
+ Large language models (LLMs) have recently shown significant potential in +various biological tasks such as protein engineering and molecule design. These +tasks typically involve black-box discrete sequence optimization, where the +challenge lies in generating sequences that are not only biologically feasible +but also adhere to hard fine-grained constraints. However, LLMs often struggle +with such constraints, especially in biological contexts where verifying +candidate solutions is costly and time-consuming. In this study, we explore the +possibility of employing LLMs as highly-constrained bilevel optimizers through +a methodology we refer to as Language Model Optimization with Margin +Expectation (LLOME). This approach combines both offline and online +optimization, utilizing limited oracle evaluations to iteratively enhance the +sequences generated by the LLM. We additionally propose a novel training +objective -- Margin-Aligned Expectation (MargE) -- that trains the LLM to +smoothly interpolate between the reward and reference distributions. Lastly, we +introduce a synthetic test suite that bears strong geometric similarity to real +biophysical problems and enables rapid evaluation of LLM optimizers without +time-consuming lab validation. Our findings reveal that, in comparison to +genetic algorithm baselines, LLMs achieve significantly lower regret solutions +while requiring fewer test function evaluations. However, we also observe that +LLMs exhibit moderate miscalibration, are susceptible to generator collapse, +and have difficulty finding the optimal solution when no explicit ground truth +rewards are available. + +
+
+ comment: Supersedes arXiv:2407.00236v1 +
+
+
+
+
+ + ♻ ☆ Exploring Behavior-Relevant and Disentangled Neural Dynamics with + Generative Diffusion Models + + +
+ Understanding the neural basis of behavior is a fundamental goal in +neuroscience. Current research in large-scale neuro-behavioral data analysis +often relies on decoding models, which quantify behavioral information in +neural data but lack details on behavior encoding. This raises an intriguing +scientific question: ``how can we enable in-depth exploration of neural +representations in behavioral tasks, revealing interpretable neural dynamics +associated with behaviors''. However, addressing this issue is challenging due +to the varied behavioral encoding across different brain regions and mixed +selectivity at the population level. To tackle this limitation, our approach, +named ``BeNeDiff'', first identifies a fine-grained and disentangled neural +subspace using a behavior-informed latent variable model. It then employs +state-of-the-art generative diffusion models to synthesize behavior videos that +interpret the neural dynamics of each latent factor. We validate the method on +multi-session datasets containing widefield calcium imaging recordings across +the dorsal cortex. Through guiding the diffusion model to activate individual +latent factors, we verify that the neural dynamics of latent factors in the +disentangled neural subspace provide interpretable quantifications of the +behaviors of interest. At the same time, the neural subspace in BeNeDiff +demonstrates high disentanglement and neural reconstruction quality. + +
+
+
+
+
+ + ♻ ☆ Large language models can be zero-shot anomaly detectors for time + series? + + +
+ Recent studies have shown the ability of large language models to perform a +variety of tasks, including time series forecasting. The flexible nature of +these models allows them to be used for many applications. In this paper, we +present a novel study of large language models used for the challenging task of +time series anomaly detection. This problem entails two aspects novel for LLMs: +the need for the model to identify part of the input sequence (or multiple +parts) as anomalous; and the need for it to work with time series data rather +than the traditional text input. We introduce sigllm, a framework for time +series anomaly detection using large language models. Our framework includes a +time-series-to-text conversion module, as well as end-to-end pipelines that +prompt language models to perform time series anomaly detection. We investigate +two paradigms for testing the abilities of large language models to perform the +detection task. First, we present a prompt-based detection method that directly +asks a language model to indicate which elements of the input are anomalies. +Second, we leverage the forecasting capability of a large language model to +guide the anomaly detection process. We evaluated our framework with 10 +pipelines on 11 datasets spanning various sources. We show that the forecasting +method significantly outperformed the prompting method on all 11 datasets with +respect to the F1 score. Moreover, while large language models are capable of +finding anomalies, state-of-the-art deep learning models are still superior in +performance, achieving results 30% better than large language models. + +
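The forecasting paradigm reduces to a simple recipe: predict each point from its history and flag large residuals. The sketch below pins down that logic with a rolling-mean stand-in for the LLM forecaster; sigllm's time-series-to-text conversion and prompting pipeline are more involved.

```python
# Forecasting-based anomaly detection: flag points whose forecast residual is large.
import numpy as np

def detect_anomalies(series: np.ndarray, window: int = 10, k: float = 4.0) -> np.ndarray:
    # Rolling-mean forecaster; in sigllm this role is played by an LLM.
    preds = np.array([series[max(0, i - window):i].mean() if i else series[0]
                      for i in range(len(series))])
    residuals = np.abs(series - preds)
    threshold = residuals.mean() + k * residuals.std()
    return np.flatnonzero(residuals > threshold)

t = np.linspace(0, 8 * np.pi, 400)
signal = np.sin(t) + 0.05 * np.random.default_rng(0).normal(size=400)
signal[250] += 3.0                      # inject a point anomaly
print(detect_anomalies(signal))         # should include index 250
```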
+
+ comment: This work is accepted by IEEE International Conference on Data + Science and Advanced Analytics (DSAA 2024) +
+
+
+
+
+ + ♻ ☆ Extended Reality for Enhanced Human-Robot Collaboration: a + Human-in-the-Loop Approach + + +
+ The rise of automation has provided an opportunity to achieve higher +efficiency in manufacturing processes, yet it often compromises the flexibility +required to promptly respond to evolving market needs and meet the demand for +customization. Human-robot collaboration attempts to tackle these challenges by +combining the strength and precision of machines with human ingenuity and +perceptual understanding. In this paper, we conceptualize and propose an +implementation framework for an autonomous, machine learning-based manipulator +that incorporates human-in-the-loop principles and leverages Extended Reality +(XR) to facilitate intuitive communication and programming between humans and +robots. Furthermore, the conceptual framework foresees human involvement +directly in the robot learning process, resulting in higher adaptability and +task generalization. The paper highlights key technologies enabling the +proposed framework, emphasizing the importance of developing the digital +ecosystem as a whole. Additionally, we review existing implementation +approaches of XR in human-robot collaboration, showcasing diverse perspectives +and methodologies. The challenges and future outlooks are discussed, delving +into the major obstacles and potential research avenues of XR for more natural +human-robot interaction and integration in the industrial landscape. + +
+
+ comment: Published in IEEE International Conference on Robot and Human + Interactive Communication (RO-MAN) 2024 +
+
+
+
+
+ + ♻ ☆ User-Creator Feature Polarization in Recommender Systems with Dual + Influence NeurIPS 2024 + + +
+ Recommender systems serve the dual purpose of presenting relevant content to +users and helping content creators reach their target audience. The dual nature +of these systems naturally influences both users and creators: users' +preferences are affected by the items they are recommended, while creators may +be incentivized to alter their content to attract more users. We define a +model, called user-creator feature dynamics, to capture the dual influence of +recommender systems. We prove that a recommender system with dual influence is +guaranteed to polarize, causing diversity loss in the system. We then +investigate, both theoretically and empirically, approaches for mitigating +polarization and promoting diversity in recommender systems. Unexpectedly, we +find that common diversity-promoting approaches do not work in the presence of +dual influence, while relevancy-optimizing methods like top-$k$ truncation can +prevent polarization and improve diversity of the system. + +
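The qualitative claim is easy to reproduce in a toy simulation of user-creator feature dynamics (not the paper's exact model): users drift toward recommended creators, creators drift toward their audience, and creator diversity collapses.

```python
# Toy dual-influence dynamics: mutual drift between users and creators.
import numpy as np

rng = np.random.default_rng(0)
normalize = lambda m: m / np.linalg.norm(m, axis=1, keepdims=True)
users = normalize(rng.normal(size=(100, 8)))
creators = normalize(rng.normal(size=(20, 8)))

def diversity(c: np.ndarray) -> float:
    """Mean pairwise distance among creator feature vectors."""
    return float(np.mean([np.linalg.norm(a - b) for a in c for b in c]))

print("initial diversity:", round(diversity(creators), 3))
eta = 0.05
for _ in range(200):
    match = (users @ creators.T).argmax(axis=1)         # recommend closest creator
    users = normalize(users + eta * creators[match])    # users drift toward content
    for j in range(len(creators)):                      # creators chase their audience
        fans = users[match == j]
        if len(fans):
            creators[j] += eta * fans.mean(axis=0)
    creators = normalize(creators)
print("final diversity:", round(diversity(creators), 3))  # typically much smaller
```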
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Elliptical Attention NeurIPS 2024 + + +
+ Pairwise dot-product self-attention is key to the success of transformers +that achieve state-of-the-art performance across a variety of applications in +language and vision. This dot-product self-attention computes attention weights +among the input tokens using Euclidean distance, which makes the model prone to +representation collapse and vulnerable to contaminated samples. In this paper, +we propose using a Mahalanobis distance metric for computing the attention +weights to stretch the underlying feature space in directions of high +contextual relevance. In particular, we define a hyper-ellipsoidal neighborhood +around each query to increase the attention weights of the tokens lying in the +contextually important directions. We term this novel class of attention +Elliptical Attention. Our Elliptical Attention provides two benefits: 1) +reducing representation collapse and 2) enhancing the model's robustness as +Elliptical Attention pays more attention to contextually relevant information +rather than focusing on some small subset of informative features. We +empirically demonstrate the advantages of Elliptical Attention over the +baseline dot-product attention and state-of-the-art attention methods on +various practical tasks, including object classification, image segmentation, +and language modeling across different data modalities. + +
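A minimal sketch of distance-based attention under a Mahalanobis metric: a diagonal matrix M stretches contextually relevant directions, so tokens lying along them receive larger weights. The fixed diagonal used here is a placeholder for the paper's estimator of M.

```python
# Attention with scores -(q - k)^T M (q - k); M = I recovers the Euclidean case.
import numpy as np

def elliptical_attention(Q, K, V, m_diag):
    diff = Q[:, None, :] - K[None, :, :]                       # (T, T, d) pairwise q - k
    scores = -np.einsum("ijd,d,ijd->ij", diff, m_diag, diff)   # negative Mahalanobis^2
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)             # row-wise softmax
    return weights @ V

rng = np.random.default_rng(0)
Q = K = V = rng.normal(size=(6, 4))
euclidean = elliptical_attention(Q, K, V, np.ones(4))
stretched = elliptical_attention(Q, K, V, np.array([4.0, 1.0, 1.0, 0.25]))
print(np.abs(euclidean - stretched).max())  # the metric reshapes the attention
```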
+
+ comment: 10 pages in the main text. Published at NeurIPS 2024. The code is + available at https://github.com/stefvk/Elliptical-Attention +
+
+
+
+
+ + ♻ ☆ WindsorML: High-Fidelity Computational Fluid Dynamics Dataset For + Automotive Aerodynamics + + +
+ This paper presents a new open-source high-fidelity dataset for Machine +Learning (ML) containing 355 geometric variants of the Windsor body, to help +the development and testing of ML surrogate models for external automotive +aerodynamics. Each Computational Fluid Dynamics (CFD) simulation was run with a +GPU-native high-fidelity Wall-Modeled Large-Eddy Simulation (WMLES) solver +using a Cartesian immersed-boundary method with more than 280M cells to ensure +the greatest possible accuracy. The dataset contains geometry variants that +exhibit a wide range of flow characteristics that are representative of those +observed on road-cars. The dataset itself contains the 3D time-averaged volume +& boundary data as well as the geometry and force & moment coefficients. This +paper discusses the validation of the underlying CFD methods as well as the +contents and structure of the dataset. To the authors' knowledge, this +represents the first, large-scale high-fidelity CFD dataset for the Windsor +body with a permissive open-source license (CC-BY-SA). + +
+
+
+
+
+ + ♻ ☆ Federated Learning under Periodic Client Participation and Heterogeneous + Data: A New Communication-Efficient Algorithm and Analysis + + +
+ In federated learning, it is common to assume that clients are always +available to participate in training, which may not be feasible with user +devices in practice. Recent works analyze federated learning under more +realistic participation patterns, such as cyclic client availability or +arbitrary participation. However, all such works either require strong +assumptions (e.g., all clients participate almost surely within a bounded +window), do not achieve linear speedup and reduced communication rounds, or are +not applicable in the general non-convex setting. In this work, we focus on +nonconvex optimization and consider participation patterns in which the chance +of participation over a fixed window of rounds is equal among all clients, +which includes cyclic client availability as a special case. Under this +setting, we propose a new algorithm, named Amplified SCAFFOLD, and prove that +it achieves linear speedup, reduced communication, and resilience to data +heterogeneity simultaneously. In particular, for cyclic participation, our +algorithm is proved to enjoy $\mathcal{O}(\epsilon^{-2})$ communication rounds +to find an $\epsilon$-stationary point in the non-convex stochastic setting. In +contrast, the prior work under the same setting requires $\mathcal{O}(\kappa^2 +\epsilon^{-4})$ communication rounds, where $\kappa$ denotes the data +heterogeneity. Therefore, our algorithm significantly reduces communication +rounds due to better dependency in terms of $\epsilon$ and $\kappa$. Our +analysis relies on a fine-grained treatment of the nested dependence between +client participation and errors in the control variates, which results in +tighter guarantees than previous work. We also provide experimental results +with (1) synthetic data and (2) real-world data with a large number of clients +$(N = 250)$, demonstrating the effectiveness of our algorithm under periodic +client participation. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ End-to-end streaming model for low-latency speech anonymization + + +
+ Speaker anonymization aims to conceal cues to speaker identity while +preserving linguistic content. Current machine-learning-based approaches +require substantial computational resources, hindering real-time streaming +applications. To address these concerns, we propose a streaming model that +achieves speaker anonymization with low latency. The system is trained in an +end-to-end autoencoder fashion using a lightweight content encoder that +extracts HuBERT-like information, a pretrained speaker encoder that extracts +speaker identity, and a variance encoder that injects pitch and energy +information. These three disentangled representations are fed to a decoder that +re-synthesizes the speech signal. We present evaluation results from two +implementations of our system, a full model that achieves a latency of 230ms, +and a lite version (0.1x in size) that further reduces latency to 66ms while +maintaining state-of-the-art performance in naturalness, intelligibility, and +privacy preservation. + +
+
+
+
+
+ + ♻ ☆ Multi-Group Proportional Representation in Retrieval NeurIPS 2024 + + +
+ Image search and retrieval tasks can perpetuate harmful stereotypes, erase +cultural identities, and amplify social disparities. Current approaches to +mitigate these representational harms balance the number of retrieved items +across population groups defined by a small number of (often binary) +attributes. However, most existing methods overlook intersectional groups +determined by combinations of group attributes, such as gender, race, and +ethnicity. We introduce Multi-Group Proportional Representation (MPR), a novel +metric that measures representation across intersectional groups. We develop +practical methods for estimating MPR, provide theoretical guarantees, and +propose optimization algorithms to ensure MPR in retrieval. We demonstrate that +existing methods optimizing for equal and proportional representation metrics +may fail to promote MPR. Crucially, our work shows that optimizing MPR yields +more proportional representation across multiple intersectional groups +specified by a rich function class, often with minimal compromise in retrieval +accuracy. + +
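A simplified illustration of the measurement problem: compare each intersectional group's share of retrieved items with its share of the reference population. MPR itself ranges over a richer function class; the max deviation below corresponds to indicator functions of attribute intersections.

```python
# Max deviation between retrieved and population shares across intersections.
from itertools import product
import random

random.seed(0)
population = [{"gender": random.choice("MF"), "race": random.choice("ABC")}
              for _ in range(10_000)]
retrieved = random.sample(population, 50)   # stand-in for a retrieval result

def share(items, g, r):
    return sum(x["gender"] == g and x["race"] == r for x in items) / len(items)

deviation = max(abs(share(retrieved, g, r) - share(population, g, r))
                for g, r in product("MF", "ABC"))
print(f"max intersectional deviation: {deviation:.3f}")
```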
+
+ comment: 48 pages, 33 figures. Accepted as poster at NeurIPS 2024. Code can be + found at + https://github.com/alex-oesterling/multigroup-proportional-representation +
+
+
+
+
+ + ♻ ☆ MTLSO: A Multi-Task Learning Approach for Logic Synthesis Optimization + + +
+ Electronic Design Automation (EDA) is essential for IC design and has +recently benefited from AI-based techniques to improve efficiency. Logic +synthesis, a key EDA stage, transforms high-level hardware descriptions into +optimized netlists. Recent research has employed machine learning to predict +Quality of Results (QoR) for pairs of And-Inverter Graphs (AIGs) and synthesis +recipes. However, the severe scarcity of data due to a very limited number of +available AIGs results in overfitting, significantly hindering performance. +Additionally, the complexity and large number of nodes in AIGs make plain GNNs +less effective for learning expressive graph-level representations. To tackle +these challenges, we propose MTLSO - a Multi-Task Learning approach for Logic +Synthesis Optimization. On one hand, it maximizes the use of limited data by +training the model across different tasks. This includes introducing an +auxiliary task of binary multi-label graph classification alongside the primary +regression task, allowing the model to benefit from diverse supervision +sources. On the other hand, we employ a hierarchical graph representation +learning strategy to improve the model's capacity for learning expressive +graph-level representations of large AIGs, surpassing traditional plain GNNs. +Extensive experiments across multiple datasets and against state-of-the-art +baselines demonstrate the superiority of our method, achieving an average +performance gain of 8.22\% for delay and 5.95\% for area. + +
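The multi-task objective is straightforward to sketch: a shared encoder feeds a primary QoR regression head and an auxiliary binary multi-label head. The plain MLP encoder and loss weight below are purely illustrative; the paper uses hierarchical graph representation learning over AIGs.

```python
# Multi-task loss: QoR regression + auxiliary multi-label classification.
import torch
import torch.nn as nn

encoder = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 64))
regression_head = nn.Linear(64, 1)          # primary task: predict QoR
classification_head = nn.Linear(64, 10)     # auxiliary: binary multi-label

x = torch.randn(32, 16)                     # toy pooled AIG+recipe features
qor = torch.randn(32, 1)
labels = torch.randint(0, 2, (32, 10)).float()

h = encoder(x)
loss = nn.functional.mse_loss(regression_head(h), qor) \
     + 0.5 * nn.functional.binary_cross_entropy_with_logits(classification_head(h), labels)
loss.backward()                             # both tasks supervise the shared encoder
print(float(loss))
```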
+
+
+
+
+ + ♻ ☆ E(3)-invariant diffusion model for pocket-aware peptide generation + + +
+ Biologists frequently desire protein inhibitors for a variety of reasons, +including use as research tools for understanding biological processes and +application to societal problems in agriculture, healthcare, etc. +Immunotherapy, for instance, relies on immune checkpoint inhibitors to block +checkpoint proteins, preventing their binding with partner proteins and +boosting immune cell function against abnormal cells. Inhibitor discovery has +long been a tedious process, which in recent years has been accelerated by +computational approaches. Advances in artificial intelligence now provide an +opportunity to make inhibitor discovery smarter than ever before. While +extensive research has been conducted on computer-aided inhibitor discovery, it +has mainly focused on either sequence-to-structure mapping, reverse mapping, or +bio-activity prediction, making it unrealistic for biologists to utilize such +tools. Instead, our work proposes a new method of computer-assisted inhibitor +discovery: de novo pocket-aware peptide structure and sequence generation +network. Our approach consists of two sequential diffusion models for +end-to-end structure generation and sequence prediction. By leveraging angle +and dihedral relationships between backbone atoms, we ensure an E(3)-invariant +representation of peptide structures. Our results demonstrate that our method +achieves comparable performance to state-of-the-art models, highlighting its +potential in pocket-aware peptide design. This work offers a new approach for +precise drug discovery using receptor-specific peptide generation. + +
+
+
+
+
+ + ♻ ☆ Domain-Adaptive Pre-training of Self-Supervised Foundation Models for + Medical Image Classification in Gastrointestinal Endoscopy + + +
+ Video capsule endoscopy has transformed gastrointestinal endoscopy (GIE) +diagnostics by offering a non-invasive method for capturing detailed images of +the gastrointestinal tract, enabling early disease detection. However, its +potential is limited by the sheer volume of images generated during the imaging +procedure, which can take 6 to 8 hours and often produces up to 1 +million images, necessitating automated analysis. Additionally, the variability +of these images, combined with the need for expert annotations and the scarcity +of large, high-quality labeled datasets, constrains the effectiveness of +current medical image analysis models. To address this, we introduce a novel +large gastrointestinal endoscopy dataset, called EndoExtend24, created by +merging and re-stratifying the train/test splits of ten existing public and +private datasets, ensuring no overlap of patient data across splits. +EndoExtend24 includes over 226,000 labeled images, as well as dynamic class +mappings, which allow unified training across datasets with differing labeling +granularity, supporting up to 123 distinct pathological findings. Further, we +propose to leverage domain-adaptive pre-training of computer vision foundation +models trained with self-supervision on generic image data, adapting them to +the task of GIE medical diagnosis. Specifically, the EVA-02 model, +which is based on the vision transformer architecture and was trained on +ImageNet-22k with masked image modeling (using EVA-CLIP as a MIM teacher), is +pre-trained on the novel EndoExtend24 dataset to achieve domain adaptation, and +finally trained on the Capsule Endoscopy 2024 Challenge dataset. Experimental +results demonstrate strong performance with an F1 score of 0.88, an absolute +improvement of 0.39 over the baseline model's F1 score of 0.49. Additionally, +the model achieved a macro AUC score of 0.993 and a balanced accuracy of 89.3%. + +
+
+
+
+
+ + ♻ ☆ Parallelizing Linear Transformers with the Delta Rule over Sequence + Length NeurIPS 2024 + + +
+ Transformers with linear attention (i.e., linear transformers) and +state-space models have recently been suggested as a viable linear-time +alternative to transformers with softmax attention. However, these models still +underperform transformers especially on tasks that require in-context +retrieval. While more expressive variants of linear transformers which replace +the additive update in linear transformers with the delta rule (DeltaNet) have +been found to be more effective at associative recall, existing algorithms for +training such models do not parallelize over sequence length and are thus +inefficient to train on modern hardware. This work describes a +hardware-efficient algorithm for training linear transformers with the delta +rule, which exploits a memory-efficient representation for computing products +of Householder matrices. This algorithm allows us to scale up DeltaNet to +standard language modeling settings. We train a 1.3B model for 100B tokens and +find that it outperforms recent linear-time baselines such as Mamba and GLA in +terms of perplexity and zero-shot performance on downstream tasks. We also +experiment with two hybrid models which combine DeltaNet layers with (1) +sliding-window attention layers every other layer or (2) two global attention +layers, and find that these hybrids outperform strong transformer baselines. + +
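A sequential reference implementation of the delta-rule recurrence, S_t = S_{t-1} - beta_t (S_{t-1} k_t - v_t) k_t^T with outputs o_t = S_t q_t. The paper's contribution is a chunked algorithm that parallelizes this over sequence length via products of Householder-like matrices; the sketch below only fixes the semantics that algorithm must reproduce.

```python
# Sequential (non-parallel) delta-rule linear attention, for reference only.
import numpy as np

def deltanet_sequential(Q, K, V, beta):
    T, d_k = K.shape
    d_v = V.shape[1]
    S = np.zeros((d_v, d_k))                       # fast-weight memory matrix
    outputs = np.zeros((T, d_v))
    for t in range(T):
        k, v, q = K[t], V[t], Q[t]
        S = S - beta[t] * np.outer(S @ k - v, k)   # delta-rule memory update
        outputs[t] = S @ q                         # read out with the query
    return outputs

rng = np.random.default_rng(0)
T, d = 16, 8
out = deltanet_sequential(rng.normal(size=(T, d)), rng.normal(size=(T, d)),
                          rng.normal(size=(T, d)), rng.uniform(0, 1, size=T))
print(out.shape)  # (16, 8)
```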
+
+ comment: NeurIPS 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Probabilistic Conceptual Explainers: Trustworthy Conceptual Explanations + for Vision Foundation Models ICML 2024 + + +
+ Vision transformers (ViTs) have emerged as a significant area of focus, +particularly for their capacity to be jointly trained with large language +models and to serve as robust vision foundation models. Yet, the development of +trustworthy explanation methods for ViTs has lagged, particularly in the +context of post-hoc interpretations of ViT predictions. Existing sub-image +selection approaches, such as feature-attribution and conceptual models, fall +short in this regard. This paper proposes five desiderata for explaining ViTs +-- faithfulness, stability, sparsity, multi-level structure, and parsimony -- +and demonstrates the inadequacy of current methods in meeting these criteria +comprehensively. We introduce a variational Bayesian explanation framework, +dubbed ProbAbilistic Concept Explainers (PACE), which models the distributions +of patch embeddings to provide trustworthy post-hoc conceptual explanations. +Our qualitative analysis reveals the distributions of patch-level concepts, +elucidating the effectiveness of ViTs by modeling the joint distribution of +patch embeddings and ViT's predictions. Moreover, these patch-level +explanations bridge the gap between image-level and dataset-level explanations, +thus completing the multi-level structure of PACE. Through extensive +experiments on both synthetic and real-world datasets, we demonstrate that PACE +surpasses state-of-the-art methods in terms of the defined desiderata. + +
+
+ comment: Proceedings of the 41st International Conference on Machine Learning + (ICML 2024) +
+
+
+
+
+ + ♻ ☆ A Simple Baseline for Predicting Events with Auto-Regressive Tabular + Transformers + + +
+ Many real-world applications of tabular data involve using historic events to +predict properties of new ones, for example whether a credit card transaction +is fraudulent or what rating a customer will assign a product on a retail +platform. Existing approaches to event prediction include costly, brittle, and +application-dependent techniques such as time-aware positional embeddings, +learned row and field encodings, and oversampling methods for addressing class +imbalance. Moreover, these approaches often assume specific use-cases, for +example that we know the labels of all historic events or that we only predict +a pre-specified label and not the data's features themselves. In this work, we +propose a simple but flexible baseline using standard autoregressive LLM-style +transformers with elementary positional embeddings and a causal language +modeling objective. Our baseline outperforms existing approaches across popular +datasets and can be employed for various use-cases. We demonstrate that the +same model can predict labels, impute missing values, or model event sequences. + +
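The serialization idea is simple enough to show directly: each event's fields become tokens in one long causal sequence, so label prediction, imputation, and sequence modeling all reduce to next-token prediction. The field names and ordering below are illustrative assumptions, not the paper's schema.

```python
# Serialize tabular events into a token stream for a causal language model.
events = [
    {"amount": "120", "merchant": "grocer", "label": "ok"},
    {"amount": "9800", "merchant": "casino", "label": "fraud"},
]

def serialize(events):
    tokens = []
    for e in events:
        for field in ("amount", "merchant", "label"):  # fixed field order
            tokens += [f"<{field}>", e[field]]
        tokens.append("<eot>")                          # end-of-event marker
    return tokens

print(serialize(events))
# A causal LM trained on such sequences predicts the token after "<label>",
# so label prediction is literally next-token prediction; masking a field
# and decoding it turns the same model into an imputer.
```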
+
+ comment: 10 pages, 6 pages of references+appendix +
+
+
+
+
+ + ♻ ☆ Inductive biases of multi-task learning and finetuning: multiple regimes + of feature reuse NeurIPS 2024 + + +
+ Neural networks are often trained on multiple tasks, either simultaneously +(multi-task learning, MTL) or sequentially (pretraining and subsequent +finetuning, PT+FT). In particular, it is common practice to pretrain neural +networks on a large auxiliary task before finetuning on a downstream task with +fewer samples. Despite the prevalence of this approach, the inductive biases +that arise from learning multiple tasks are poorly characterized. In this work, +we address this gap. We describe novel implicit regularization penalties +associated with MTL and PT+FT in diagonal linear networks and +single-hidden-layer ReLU networks. These penalties indicate that MTL and PT+FT +induce the network to reuse features in different ways. 1) Both MTL and PT+FT +exhibit biases towards feature reuse between tasks, and towards sparsity in the +set of learned features. We show a "conservation law" that implies a direct +tradeoff between these two biases. 2) PT+FT exhibits a novel "nested feature +selection" regime, not described by either the "lazy" or "rich" regimes +identified in prior work, which biases it to rely on a sparse subset of the +features learned during pretraining. This regime is much narrower for MTL. 3) +PT+FT (but not MTL) in ReLU networks benefits from features that are correlated +between the auxiliary and main task. We confirm these findings empirically with +teacher-student models, and introduce a technique -- weight rescaling following +pretraining -- that can elicit the nested feature selection regime. Finally, we +validate our theory in deep neural networks trained on image classification. We +find that weight rescaling improves performance when it causes models to +display signatures of nested feature selection. Our results suggest that nested +feature selection may be an important inductive bias for finetuning neural +networks. + +
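The weight-rescaling intervention is a one-liner: multiply the pretrained weights by a scalar alpha before finetuning. Which layers to rescale and the value of alpha below are illustrative choices, not the paper's prescription.

```python
# Rescale pretrained weights by alpha before finetuning (schematic).
import torch
import torch.nn as nn

def rescale_weights(model: nn.Module, alpha: float) -> nn.Module:
    with torch.no_grad():
        for p in model.parameters():
            p.mul_(alpha)          # uniform rescaling of all pretrained weights
    return model

pretrained = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 1))
# ... pretraining on the auxiliary task would happen here ...
finetune_model = rescale_weights(pretrained, alpha=0.5)   # alpha is illustrative
# ... finetune on the downstream task as usual ...
print(next(finetune_model.parameters()).abs().mean())
```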
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Estimating the Hallucination Rate of Generative AI + + +
+ This paper presents a method for estimating the hallucination rate for +in-context learning (ICL) with generative AI. In ICL, a conditional generative +model (CGM) is prompted with a dataset and a prediction question and asked to +generate a response. One interpretation of ICL assumes that the CGM computes +the posterior predictive of an unknown Bayesian model, which implicitly defines +a joint distribution over observable datasets and latent mechanisms. This joint +distribution factorizes into two components: the model prior over mechanisms +and the model likelihood of datasets given a mechanism. With this perspective, +we define a hallucination as a generated response to the prediction question +with low model likelihood given the mechanism. We develop a new method that +takes an ICL problem and estimates the probability that a CGM will generate a +hallucination. Our method only requires generating prediction questions and +responses from the CGM and evaluating its response log probability. We +empirically evaluate our method using large language models for synthetic +regression and natural language ICL tasks. + +
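A schematic Monte Carlo estimator with the same shape as the method described above: sample responses from the model, score their log-likelihood, and estimate how often it falls below a cutoff. The `sample_response` and `response_logprob` stubs are hypothetical placeholders for CGM calls, and the paper's estimator is derived from the posterior-predictive view of ICL rather than a fixed threshold.

```python
# Monte Carlo estimate of how often generated responses are "low likelihood".
import random

random.seed(0)

def sample_response(prompt: str) -> str:
    return random.choice(["plausible answer", "unlikely answer"])  # toy CGM

def response_logprob(prompt: str, response: str) -> float:
    return -1.0 if response == "plausible answer" else -9.0        # toy scores

def estimate_hallucination_rate(prompt: str, n: int = 1000,
                                threshold: float = -5.0) -> float:
    low = sum(response_logprob(prompt, sample_response(prompt)) < threshold
              for _ in range(n))
    return low / n

print(estimate_hallucination_rate("Q: ... A:"))  # ~0.5 under this toy CGM
```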
+
+
+
+
+ + ♻ ☆ Symbolic Regression with a Learned Concept Library NeurIPS + + +
+ We present a novel method for symbolic regression (SR), the task of searching +for compact programmatic hypotheses that best explain a dataset. The problem is +commonly solved using genetic algorithms; we show that we can enhance such +methods by inducing a library of abstract textual concepts. Our algorithm, +called LaSR, uses zero-shot queries to a large language model (LLM) to discover +and evolve concepts occurring in known high-performing hypotheses. We discover +new hypotheses using a mix of standard evolutionary steps and LLM-guided steps +(obtained through zero-shot LLM queries) conditioned on discovered concepts. +Once discovered, hypotheses are used in a new round of concept abstraction and +evolution. We validate LaSR on the Feynman equations, a popular SR benchmark, +as well as a set of synthetic tasks. On these benchmarks, LaSR substantially +outperforms a variety of state-of-the-art SR approaches based on deep learning +and evolutionary algorithms. Moreover, we show that LaSR can be used to +discover a novel and powerful scaling law for LLMs. + +
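A high-level sketch of the loop: evolve a population with standard mutations plus occasional LLM-guided edits conditioned on a concept library. Every function below is a toy placeholder; LaSR's actual fitness, mutation, and concept-abstraction steps operate on symbolic expressions and zero-shot LLM queries.

```python
# Concept-guided evolutionary search (toy skeleton of the LaSR loop).
import random

random.seed(0)

def fitness(expr: str) -> float:
    return -abs(len(expr) - 7) + random.random()   # stand-in for data fit

def mutate(expr: str) -> str:
    return expr + random.choice("+-*")             # stand-in symbolic mutation

def llm_propose(expr: str, concepts: list[str]) -> str:
    return expr[:-1] if len(expr) > 1 else expr    # placeholder for an LLM edit

concepts: list[str] = ["symmetry"]                 # abstracted textual concepts
population = ["x", "x+x", "x*x"]
for generation in range(20):
    population.sort(key=fitness, reverse=True)
    parent = population[0]
    child = (llm_propose(parent, concepts) if random.random() < 0.3
             else mutate(parent))                  # mix standard and LLM-guided steps
    population[-1] = child                         # replace the weakest member
    # Periodically, top hypotheses would be summarized by the LLM into new
    # concepts appended to `concepts` (omitted here).
print(max(population, key=fitness))
```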
+
+ comment: NeurIPS version; 10 pages; no checklist +
+
+
+
+
+ + ♻ ☆ FIARSE: Model-Heterogeneous Federated Learning via Importance-Aware + Submodel Extraction NeurIPS 2024 + + +
+ In federated learning (FL), accommodating clients' varied computational +capacities poses a challenge, often limiting the participation of those with +constrained resources in global model training. To address this issue, the +concept of model heterogeneity through submodel extraction has emerged, +offering a tailored solution that aligns the model's complexity with each +client's computational capacity. In this work, we propose Federated +Importance-Aware Submodel Extraction (FIARSE), a novel approach that +dynamically adjusts submodels based on the importance of model parameters, +thereby overcoming the limitations of previous static and dynamic submodel +extraction methods. Compared to existing works, the proposed method offers a +theoretical foundation for the submodel extraction and eliminates the need for +additional information beyond the model parameters themselves to determine +parameter importance, significantly reducing the overhead on clients. Extensive +experiments are conducted on various datasets to showcase the superior +performance of the proposed FIARSE. + +
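An illustrative reading of importance-aware extraction: each client keeps the fraction of parameters with largest magnitude that fits its budget, using the parameters themselves as the importance signal (consistent with the abstract's claim that no side information is needed). Global rather than layer-wise ranking is a simplification.

```python
# Magnitude-based submodel masks for clients with heterogeneous budgets.
import numpy as np

def extract_submodel(weights: np.ndarray, capacity: float) -> np.ndarray:
    """Return a 0/1 mask keeping the `capacity` fraction of largest-|w| entries."""
    k = int(capacity * weights.size)
    threshold = np.partition(np.abs(weights).ravel(), -k)[-k]
    return (np.abs(weights) >= threshold).astype(weights.dtype)

rng = np.random.default_rng(0)
W = rng.normal(size=(64, 64))
for capacity in (0.25, 0.5, 1.0):        # three client capacity levels
    mask = extract_submodel(W, capacity)
    print(capacity, mask.mean())          # kept fraction tracks the budget
```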
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Theoretical and Empirical Insights into the Origins of Degree Bias in + Graph Neural Networks NeurIPS 2024 + + +
+ Graph Neural Networks (GNNs) often perform better for high-degree nodes than +low-degree nodes on node classification tasks. This degree bias can reinforce +social marginalization by, e.g., privileging celebrities and other high-degree +actors in social networks during social and content recommendation. While +researchers have proposed numerous hypotheses for why GNN degree bias occurs, +we find via a survey of 38 degree bias papers that these hypotheses are often +not rigorously validated, and can even be contradictory. Thus, we provide an +analysis of the origins of degree bias in message-passing GNNs with different +graph filters. We prove that high-degree test nodes tend to have a lower +probability of misclassification regardless of how GNNs are trained. Moreover, +we show that degree bias arises from a variety of factors that are associated +with a node's degree (e.g., homophily of neighbors, diversity of neighbors). +Furthermore, we show that during training, some GNNs may adjust their loss on +low-degree nodes more slowly than on high-degree nodes; however, with +sufficiently many epochs of training, message-passing GNNs can achieve their +maximum possible training accuracy, which is not significantly limited by their +expressive power. Throughout our analysis, we connect our findings to +previously-proposed hypotheses for the origins of degree bias, supporting and +unifying some while drawing doubt to others. We validate our theoretical +findings on 8 common real-world networks, and based on our theoretical and +empirical insights, describe a roadmap to alleviate degree bias. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ A Taxonomy of Challenges to Curating Fair Datasets NeurIPS + + +
+ Despite extensive efforts to create fairer machine learning (ML) datasets, +there remains a limited understanding of the practical aspects of dataset +curation. Drawing from interviews with 30 ML dataset curators, we present a +comprehensive taxonomy of the challenges and trade-offs encountered throughout +the dataset curation lifecycle. Our findings underscore overarching issues +within the broader fairness landscape that impact data curation. We conclude +with recommendations aimed at fostering systemic changes to better facilitate +fair dataset curation practices. + +
+
+ comment: NeurIPS Datasets & Benchmarks 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven + Question Answering Pipeline + + +
+ Financial decision-making hinges on the analysis of relevant information +embedded in the enormous volume of documents in the financial domain. To +address this challenge, we developed FinQAPT, an end-to-end pipeline that +streamlines the identification of relevant financial reports based on a query, +extracts pertinent context, and leverages Large Language Models (LLMs) to +perform downstream tasks. To evaluate the pipeline, we experimented with +various techniques to optimize the performance of each module using the FinQA +dataset. We introduced a novel clustering-based negative sampling technique to +enhance context extraction and a novel prompting method called Dynamic N-shot +Prompting to boost the numerical question-answering capabilities of LLMs. At +the module level, we achieved state-of-the-art accuracy on FinQA, attaining an +accuracy of 80.6%. However, at the pipeline level, we observed decreased +performance due to challenges in extracting relevant context from financial +reports. We conducted a detailed error analysis of each module and the +end-to-end pipeline, pinpointing specific challenges that must be addressed to +develop a robust solution for handling complex financial tasks. + +
+
+ comment: Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ PowerGraph: A power grid benchmark dataset for graph neural networks + + +
+ Power grids are critical infrastructures of paramount importance to modern +society and, therefore, engineered to operate under diverse conditions and +failures. The ongoing energy transition poses new challenges for the +decision-makers and system operators. Therefore, developing grid analysis +algorithms is important for supporting reliable operations. These key tools +include power flow analysis and system security analysis, both needed for +effective operational and strategic planning. The literature review shows a +growing trend of machine learning (ML) models that perform these analyses +effectively. In particular, Graph Neural Networks (GNNs) stand out in such +applications because of the graph-based structure of power grids. However, +there is a lack of publicly available graph datasets for training and +benchmarking ML models in electrical power grid applications. First, we present +PowerGraph, which comprises GNN-tailored datasets for i) power flows, ii) +optimal power flows, and iii) cascading failure analyses of power grids. +Second, we provide ground-truth explanations for the cascading failure +analysis. Finally, we perform a complete benchmarking of GNN methods for +node-level and graph-level tasks and explainability. Overall, PowerGraph is a +multifaceted GNN dataset for diverse tasks that includes power flow and fault +scenarios with real-world explanations, providing a valuable resource for +developing improved GNN models for node-level, graph-level tasks and +explainability methods in power system modeling. The dataset is available at +https://figshare.com/articles/dataset/PowerGraph/22820534 and the code at +https://github.com/PowerGraph-Datasets. + +
+
+ comment: 21 pages, 8 figures, conference paper +
+
+
+
+
+ + ♻ ☆ Diffusion Policies creating a Trust Region for Offline Reinforcement + Learning NeurIPS 2024 + + +
+ Offline reinforcement learning (RL) leverages pre-collected datasets to train +optimal policies. Diffusion Q-Learning (DQL), introducing diffusion models as a +powerful and expressive policy class, significantly boosts the performance of +offline RL. However, its reliance on iterative denoising sampling to generate +actions slows down both training and inference. While several recent attempts +have tried to accelerate diffusion-QL, the improvement in training and/or +inference speed often results in degraded performance. In this paper, we +introduce a dual policy approach, Diffusion Trusted Q-Learning (DTQL), which +comprises a diffusion policy for pure behavior cloning and a practical one-step +policy. We bridge the two policies with a newly introduced diffusion trust +region loss. The diffusion policy maintains expressiveness, while the trust +region loss directs the one-step policy to explore freely and seek modes within +the region defined by the diffusion policy. DTQL eliminates the need for +iterative denoising sampling during both training and inference, making it +remarkably computationally efficient. We evaluate its effectiveness and +algorithmic characteristics against popular Kullback--Leibler divergence-based +distillation methods in 2D bandit scenarios and gym tasks. We then show that +DTQL not only outperforms other methods on the majority of the D4RL benchmark +tasks but also demonstrates efficiency in training and inference speeds. The +PyTorch implementation is available at +https://github.com/TianyuCodings/Diffusion_Trusted_Q_Learning. + +
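A toy sketch of the dual-policy objective: the one-step policy maximizes Q while a diffusion trust-region term scores its actions by their denoising error under a frozen behavior-cloned diffusion policy. All networks, the noising schedule, and the penalty weight below are stand-ins, not the paper's formulation.

```python
# One-step policy loss = -Q + lambda * denoising error under the diffusion policy.
import torch
import torch.nn as nn

state_dim, act_dim = 8, 2
q_net = nn.Sequential(nn.Linear(state_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, 1))
denoiser = nn.Sequential(nn.Linear(state_dim + act_dim + 1, 64), nn.ReLU(),
                         nn.Linear(64, act_dim))   # stands in for the diffusion BC policy
one_step = nn.Sequential(nn.Linear(state_dim, 64), nn.ReLU(), nn.Linear(64, act_dim))
for p in list(q_net.parameters()) + list(denoiser.parameters()):
    p.requires_grad_(False)                        # trained separately, held fixed here

def diffusion_trust_region(states, actions, n_noise=4):
    """Average denoising error of `actions` under the frozen diffusion policy;
    low error means the action lies where the behavior distribution has mass."""
    loss = 0.0
    for _ in range(n_noise):
        t = torch.rand(states.shape[0], 1)
        eps = torch.randn_like(actions)
        noisy = (1 - t) * actions + t * eps        # toy linear noising schedule
        pred = denoiser(torch.cat([states, noisy, t], dim=-1))
        loss = loss + ((pred - eps) ** 2).mean()
    return loss / n_noise

states = torch.randn(32, state_dim)
actions = one_step(states)
q_term = q_net(torch.cat([states, actions], dim=-1)).mean()
loss = -q_term + 1.0 * diffusion_trust_region(states, actions)
loss.backward()                                    # gradients reach only the one-step policy
print(float(loss))
```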
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Categorical Flow Matching on Statistical Manifolds NeurIPS 2024 + + +
+ We introduce Statistical Flow Matching (SFM), a novel and mathematically +rigorous flow-matching framework on the manifold of parameterized probability +measures inspired by the results from information geometry. We demonstrate the +effectiveness of our method on the discrete generation problem by instantiating +SFM on the manifold of categorical distributions whose geometric properties +remain unexplored in previous discrete generative models. Utilizing the Fisher +information metric, we equip the manifold with a Riemannian structure whose +intrinsic geometries are effectively leveraged by following the shortest paths +of geodesics. We develop an efficient training and sampling algorithm that +overcomes numerical stability issues with a diffeomorphism between manifolds. +Our distinctive geometric perspective of statistical manifolds allows us to +apply optimal transport during training and interpret SFM as following the +steepest direction of the natural gradient. Unlike previous models that rely on +variational bounds for likelihood estimation, SFM enjoys the exact likelihood +calculation for arbitrary probability measures. We show that SFM can learn +more complex patterns on the statistical manifold where existing models often +fail due to strong prior assumptions. Comprehensive experiments on real-world +generative tasks ranging from image, text to biological domains further +demonstrate that SFM achieves higher sampling quality and likelihood than other +discrete diffusion or flow-based models. + +
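The geometry being exploited is classical: under the Fisher information metric, the simplex of categorical distributions maps isometrically (up to a constant scale) onto an orthant of the sphere via p -> sqrt(p), so Fisher-Rao geodesics are great-circle arcs. A standalone computation of the geodesic interpolant that flow matching would follow on this manifold; it is illustrative, not the paper's training code.

```python
# Fisher-Rao geodesic between two categorical distributions via the sqrt map.
import numpy as np

def fisher_rao_geodesic(p: np.ndarray, q: np.ndarray, t: float) -> np.ndarray:
    a, b = np.sqrt(p), np.sqrt(q)                     # unit vectors on the sphere
    omega = np.arccos(np.clip(a @ b, -1.0, 1.0))      # angle between sqrt points
    if omega < 1e-9:
        return p
    s = (np.sin((1 - t) * omega) * a + np.sin(t * omega) * b) / np.sin(omega)
    return s ** 2                                     # map back to the simplex

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.1, 0.8])
mid = fisher_rao_geodesic(p, q, 0.5)
print(mid, mid.sum())  # a valid distribution halfway along the geodesic
```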
+
+ comment: Accepted to NeurIPS 2024 as a conference paper +
+
+
+
+
+ + ♻ ☆ SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video + Generation + + +
+ Human beings are endowed with a complementary learning system, which bridges +the slow learning of general world dynamics with fast storage of episodic +memory from a new experience. Previous video generation models, however, +primarily focus on slow learning by pre-training on vast amounts of data, +overlooking the fast learning phase crucial for episodic memory storage. This +oversight leads to inconsistencies across temporally distant frames when +generating longer videos, as these frames fall beyond the model's context +window. To this end, we introduce SlowFast-VGen, a novel dual-speed learning +system for action-driven long video generation. Our approach incorporates a +masked conditional video diffusion model for the slow learning of world +dynamics, alongside an inference-time fast learning strategy based on a +temporal LoRA module. Specifically, the fast learning process updates its +temporal LoRA parameters based on local inputs and outputs, thereby efficiently +storing episodic memory in its parameters. We further propose a slow-fast +learning loop algorithm that seamlessly integrates the inner fast learning loop +into the outer slow learning loop, enabling the recall of prior multi-episode +experiences for context-aware skill learning. To facilitate the slow learning +of an approximate world model, we collect a large-scale dataset of 200k videos +with language action annotations, covering a wide range of scenarios. Extensive +experiments show that SlowFast-VGen outperforms baselines across various +metrics for action-driven video generation, achieving an FVD score of 514 +compared to 782, and maintaining consistency in longer videos, with an average +of 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm +significantly enhances performances on long-horizon planning tasks as well. +Project Website: https://slowfast-vgen.github.io + +
+
+
+
+
+ + ♻ ☆ NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and + Benchmarking NeurIPS 2024 + + +
+ Benchmarking vision-based driving policies is challenging. On one hand, +open-loop evaluation with real data is easy, but these results do not reflect +closed-loop performance. On the other, closed-loop evaluation is possible in +simulation, but is hard to scale due to its significant computational demands. +Further, the simulators available today exhibit a large domain gap to real +data. This has resulted in an inability to draw clear conclusions from the +rapidly growing body of research on end-to-end autonomous driving. In this +paper, we present NAVSIM, a middle ground between these evaluation paradigms, +where we use large datasets in combination with a non-reactive simulator to +enable large-scale real-world benchmarking. Specifically, we gather +simulation-based metrics, such as progress and time to collision, by unrolling +bird's eye view abstractions of the test scenes for a short simulation horizon. +Our simulation is non-reactive, i.e., the evaluated policy and environment do +not influence each other. As we demonstrate empirically, this decoupling allows +open-loop metric computation while being better aligned with closed-loop +evaluations than traditional displacement errors. NAVSIM enabled a new +competition held at CVPR 2024, where 143 teams submitted 463 entries, resulting +in several new insights. On a large set of challenging scenarios, we observe +that simple methods with moderate compute requirements such as TransFuser can +match recent large-scale end-to-end driving architectures such as UniAD. Our +modular framework can potentially be extended with new datasets, data curation +strategies, and metrics, and will be continually maintained to host future +challenges. Our code is available at +https://github.com/autonomousvision/navsim. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self + Attention at the Threadblock Level NeurIPS 2024 + + +
+ Neighborhood attention reduces the cost of self attention by restricting each +token's attention span to its nearest neighbors. This restriction, +parameterized by a window size and dilation factor, draws a spectrum of +possible attention patterns between linear projection and self attention. +Neighborhood attention, and more generally sliding window attention patterns, +have long been bounded by infrastructure, particularly in higher-rank spaces +(2-D and 3-D), calling for the development of custom kernels, which have been +limited in either functionality, or performance, if not both. In this work, we +aim to massively improve upon existing infrastructure by providing two new +methods for implementing neighborhood attention. We first show that +neighborhood attention can be represented as a batched GEMM problem, similar to +standard attention, and implement it for 1-D and 2-D neighborhood attention. +These kernels on average provide 895% and 272% improvement in full precision +runtime compared to existing naive CUDA kernels for 1-D and 2-D neighborhood +attention respectively. We find that aside from being heavily bound by memory +bandwidth, certain inherent inefficiencies exist in all unfused implementations +of neighborhood attention, which in most cases undo their theoretical +efficiency gain. Motivated by the progress made into fused dot-product +attention kernels, we developed fused neighborhood attention; an adaptation of +fused dot-product attention kernels that allow fine-grained control over +attention across different spatial axes. Known for reducing the quadratic time +complexity of self attention to a linear complexity, neighborhood attention can +now enjoy a reduced and constant memory footprint, and record-breaking half +precision runtime. We observe that our fused implementation successfully +circumvents some of the unavoidable inefficiencies in unfused +implementations... + +
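A naive NumPy reference for the 1-D semantics: each token attends to a window of 2*radius + 1 neighbors spaced by the dilation factor. This is the kind of unfused baseline the paper's batched-GEMM and fused kernels accelerate. Note the border handling here simply truncates the window, whereas practical kernels typically shift it; the sketch pins down semantics, not performance.

```python
# Naive 1-D neighborhood attention: per-token windowed softmax attention.
import numpy as np

def neighborhood_attention_1d(Q, K, V, radius=2, dilation=1):
    T, d = Q.shape
    out = np.zeros_like(V)
    for i in range(T):
        idx = [i + dilation * o for o in range(-radius, radius + 1)
               if 0 <= i + dilation * o < T]          # truncated at the borders
        scores = Q[i] @ K[idx].T / np.sqrt(d)
        w = np.exp(scores - scores.max())
        out[i] = (w / w.sum()) @ V[idx]               # softmax over the window only
    return out

rng = np.random.default_rng(0)
Q, K, V = (rng.normal(size=(12, 4)) for _ in range(3))
print(neighborhood_attention_1d(Q, K, V, radius=2, dilation=2).shape)  # (12, 4)
```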
+
+ comment: To appear in 38th Conference on Neural Information Processing Systems + (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Decision-focused predictions via pessimistic bilevel optimization: a + computational study + + +
+ Dealing with uncertainty in optimization parameters is an important and +longstanding challenge. Typically, uncertain parameters are predicted +accurately, and then a deterministic optimization problem is solved. However, +the decisions produced by this so-called \emph{predict-then-optimize} procedure +can be highly sensitive to uncertain parameters. In this work, we contribute to +recent efforts in producing \emph{decision-focused} predictions, i.e., to build +predictive models that are constructed with the goal of minimizing a +\emph{regret} measure on the decisions taken with them. We begin by formulating +the exact expected regret minimization as a pessimistic bilevel optimization +model. Then, we establish NP-completeness of this problem, even in a heavily +restricted case. Using duality arguments, we reformulate it as a non-convex +quadratic optimization problem. Finally, we show various computational +techniques to achieve tractability. We report extensive computational results +on shortest-path instances with uncertain cost vectors. Our results indicate +that our approach can improve training performance over the approach of +Elmachtoub and Grigas (2022), a state-of-the-art method for decision-focused +learning. + +
+
+ comment: We state in this version that: "To the best of our knowledge, no + hardness result for computing a regret-minimizing linear regression in this + context is known". However, Elmachtoub and Grigas (2022) show that this is + clearly a generalization of the 0-1 loss, which is NP-hard +
+
+
+
+
+ + ♻ ☆ Ab Initio Structure Solutions from Nanocrystalline Powder Diffraction + Data + + +
+ A major challenge in materials science is the determination of the structure +of nanometer sized objects. Here we present a novel approach that uses a +generative machine learning model based on diffusion processes that is trained +on 45,229 known structures. The model factors in both the measured diffraction +pattern and relevant statistical priors on the unit cell of atomic +cluster structures. Conditioned only on the chemical formula and the +information-scarce finite-size broadened powder diffraction pattern, we find +that our model, PXRDnet, can successfully solve simulated nanocrystals as small +as 10 angstroms across 200 materials of varying symmetry and complexity, +including structures from all seven crystal systems. We show that our model can +successfully and verifiably determine structural candidates four out of five +times, with average error among these candidates being only 7% (as measured by +post-Rietveld refinement R-factor). Furthermore, PXRDnet is capable of solving +structures from noisy diffraction patterns gathered in real-world experiments. +We suggest that data-driven approaches, bootstrapped from theoretical +simulation, will ultimately provide a path towards determining the structure of +previously unsolved nano-materials. + +
+
+
+
+
+ + ♻ ☆ TSI-Bench: Benchmarking Time Series Imputation + + +
+ Effective imputation is a crucial preprocessing step for time series +analysis. Despite the development of numerous deep learning algorithms for time +series imputation, the community lacks standardized and comprehensive benchmark +platforms to effectively evaluate imputation performance across different +settings. Moreover, although many deep learning forecasting algorithms have +demonstrated excellent performance, whether their modelling achievements can be +transferred to time series imputation tasks remains unexplored. To bridge these +gaps, we develop TSI-Bench, the first (to our knowledge) comprehensive +benchmark suite for time series imputation utilizing deep learning techniques. +The TSI-Bench pipeline standardizes experimental settings to enable fair +evaluation of imputation algorithms and identification of meaningful insights +into the influence of domain-appropriate missing rates and patterns on model +performance. Furthermore, TSI-Bench innovatively provides a systematic paradigm +to tailor time series forecasting algorithms for imputation purposes. Our +extensive study across 34,804 experiments, 28 algorithms, and 8 datasets with +diverse missingness scenarios demonstrates TSI-Bench's effectiveness in diverse +downstream tasks and potential to unlock future directions in time series +imputation research and analysis. All source code and experiment logs are +released at https://github.com/WenjieDu/AwesomeImputation. + +
+
+
+
+
+ + ♻ ☆ Debiasing Alternative Data for Credit Underwriting Using Causal + Inference + + +
+ Alternative data provides valuable insights for lenders to evaluate a +borrower's creditworthiness, which could help expand credit access to +underserved groups and lower costs for borrowers. But some forms of alternative +data have historically been excluded from credit underwriting because it could +act as an illegal proxy for a protected class like race or gender, causing +redlining. We propose a method for applying causal inference to a supervised +machine learning model to debias alternative data so that it might be used for +credit underwriting. We demonstrate how our algorithm can be used against a +public credit dataset to improve model accuracy across different racial groups, +while providing theoretically robust nondiscrimination guarantees. + +
+
+
+
+
+ + ♻ ☆ SPO: Sequential Monte Carlo Policy Optimisation NeurIPS 2024 + + +
+ Leveraging planning during learning and decision-making is central to the +long-term development of intelligent agents. Recent works have successfully +combined tree-based search methods and self-play learning mechanisms to this +end. However, these methods typically face scaling challenges due to the +sequential nature of their search. While practical engineering solutions can +partly overcome this, they often result in a negative impact on performance. In +this paper, we introduce SPO: Sequential Monte Carlo Policy Optimisation, a +model-based reinforcement learning algorithm grounded within the Expectation +Maximisation (EM) framework. We show that SPO provides robust policy +improvement and efficient scaling properties. The sample-based search makes it +directly applicable to both discrete and continuous action spaces without +modifications. We demonstrate statistically significant improvements in +performance relative to model-free and model-based baselines across both +continuous and discrete environments. Furthermore, the parallel nature of SPO's +search enables effective utilisation of hardware accelerators, yielding +favourable scaling laws. + +
+
+ comment: Accepted to NeurIPS 2024. 34 pages, 3 main figures +
+
+
+
+
+ + ♻ ☆ Implicit Optimization Bias of Next-Token Prediction in Linear Models + + +
+ We initiate an investigation into the optimization properties of next-token
+prediction (NTP), the dominant training paradigm for modern language models.
+Specifically, we study the structural properties of the solutions selected by
+gradient-based optimizers among the many possible minimizers of the NTP
+objective. By framing NTP as cross-entropy minimization across distinct
+contexts, each tied to a sparse conditional probability distribution over a
+finite vocabulary of tokens, we introduce "NTP-separability conditions" that
+enable reaching the data-entropy lower bound. With this setup, and focusing on
+linear models with fixed context embeddings, we characterize the optimization
+bias of gradient descent (GD): Within the data subspace defined by the sparsity
+patterns of distinct contexts, GD selects parameters that equate the logits'
+differences of in-support tokens to their log-odds. In the orthogonal subspace,
+the GD parameters diverge in norm and select the direction that maximizes a
+margin specific to NTP. These findings extend previous research on implicit
+bias in one-hot classification to the NTP setting, highlighting key differences
+and prompting further research into the optimization and generalization
+properties of NTP, irrespective of the specific architecture used to generate
+the context embeddings.
+
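+ In symbols (notation ours, following the abstract's setup): for a context
+with embedding $h_x$ and any two in-support tokens $y, z$ with decoder rows
+$w_y, w_z$, the characterized GD solution satisfies $(w_y - w_z)^\top h_x =
+\log\big(\hat{p}(y \mid x) / \hat{p}(z \mid x)\big)$ within the data subspace,
+where $\hat{p}$ denotes the empirical conditional probabilities; the parameter
+component in the orthogonal subspace grows without bound along the NTP
+max-margin direction.
+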
+
+ comment: v2: fixed typos and writing in various parts; updated figures and + future-work section +
+
+
+
+
+ + ♻ ☆ On Statistical Rates and Provably Efficient Criteria of Latent Diffusion + Transformers (DiTs) NeurIPS 2024 + + +
+ We investigate the statistical and computational limits of latent Diffusion +Transformers (DiTs) under the low-dimensional linear latent space assumption. +Statistically, we study the universal approximation and sample complexity of +the DiTs score function, as well as the distribution recovery property of the +initial data. Specifically, under mild data assumptions, we derive an +approximation error bound for the score network of latent DiTs, which is +sub-linear in the latent space dimension. Additionally, we derive the +corresponding sample complexity bound and show that the data distribution +generated from the estimated score function converges toward a proximate area +of the original one. Computationally, we characterize the hardness of both +forward inference and backward computation of latent DiTs, assuming the Strong +Exponential Time Hypothesis (SETH). For forward inference, we identify +efficient criteria for all possible latent DiTs inference algorithms and +showcase our theory by pushing the efficiency toward almost-linear time +inference. For backward computation, we leverage the low-rank structure within +the gradient computation of DiTs training for possible algorithmic speedup. +Specifically, we show that such speedup achieves almost-linear time latent DiTs +training by casting the DiTs gradient as a series of chained low-rank +approximations with bounded error. Under the low-dimensional assumption, we +show that the statistical rates and the computational efficiency are all +dominated by the dimension of the subspace, suggesting that latent DiTs have +the potential to bypass the challenges associated with the high dimensionality +of initial data. + +
+
+ comment: Accepted at NeurIPS 2024. v3 updated to camera-ready version with + many typos fixed; v2 fixed typos, added Fig. 1 and added clarifications +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ 'No' Matters: Out-of-Distribution Detection in Multimodality Long + Dialogue + + +
+ Out-of-distribution (OOD) detection in multimodal contexts is essential for
+identifying deviations in combined inputs from different modalities,
+particularly in applications like open-domain dialogue systems or real-life
+dialogue interactions. This paper aims to improve the user experience in
+multi-round long dialogues by efficiently detecting OOD dialogues and images.
+We introduce a novel scoring framework named Dialogue Image Aligning and
+Enhancing Framework (DIAEF) that integrates visual language models with newly
+proposed scores to detect OOD in two key scenarios: (1) mismatches between the
+dialogue and image input pair and (2) input pairs with previously unseen
+labels. Our experimental results, derived from various benchmarks, demonstrate
+that integrating image and multi-round dialogue OOD detection is more effective
+with previously unseen labels than using either modality independently. In the
+presence of mismatched pairs, our proposed score effectively identifies these
+mismatches and demonstrates strong robustness in long dialogues. This approach
+enhances domain-aware, adaptive conversational agents and establishes baselines
+for future studies.
+
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Audio Is the Achilles' Heel: Red Teaming Audio Large Multimodal Models + + +
+ Large Multimodal Models (LMMs) have demonstrated the ability to interact with
+humans under real-world conditions by combining Large Language Models (LLMs)
+and modality encoders to align multimodal information (visual and auditory)
+with text. However, such models raise a new safety challenge: whether models
+that are safety-aligned on text also exhibit consistent safeguards for
+multimodal inputs. Despite recent safety-alignment research on vision LMMs, the
+safety of audio LMMs remains under-explored. In this work, we comprehensively
+red team the safety of five advanced audio LMMs under three settings: (i)
+harmful questions in both audio and text formats, (ii) harmful questions in
+text format accompanied by distracting non-speech audio, and (iii)
+speech-specific jailbreaks. Our results under these settings demonstrate that
+open-source audio LMMs suffer an average attack success rate of 69.14% on
+harmful audio questions, and exhibit safety vulnerabilities when distracted
+with non-speech audio noise. Our speech-specific jailbreaks on Gemini-1.5-Pro
+achieve an attack success rate of 70.67% on the harmful query benchmark. We
+provide insights on what could cause these reported safety misalignments.
+Warning: this paper contains offensive examples.
+
+
+
+
+
+ + ☆ DIP: Diffusion Learning of Inconsistency Pattern for General DeepFake + Detection + + +
+ With the advancement of deepfake generation techniques, the importance of
+deepfake detection in protecting multimedia content integrity has become
+increasingly obvious. Recently, temporal inconsistency clues have been explored
+to improve the generalizability of deepfake video detection. According to our
+observation, the temporal artifacts of forged videos, in terms of motion
+information, usually exhibit quite distinct inconsistency patterns along
+horizontal and vertical directions, which could be leveraged to improve the
+generalizability of detectors. In this paper, a transformer-based framework for
+Diffusion Learning of Inconsistency Pattern (DIP) is proposed, which exploits
+directional inconsistencies for deepfake video detection. Specifically, DIP
+begins with a spatiotemporal encoder to represent spatiotemporal information. A
+directional inconsistency decoder is adopted accordingly, where direction-aware
+attention and inconsistency diffusion are incorporated to explore potential
+inconsistency patterns and jointly learn the inherent relationships. In
+addition, the SpatioTemporal Invariant Loss (STI Loss) is introduced to
+contrast spatiotemporally augmented sample pairs and prevent the model from
+overfitting nonessential forgery artifacts. Extensive experiments on several
+public datasets demonstrate that our method can effectively identify
+directional forgery clues and achieve state-of-the-art performance.
+
+
+ comment: 13 pages, accepted by IEEE Trans. on Multimedia
+
+
+
+
+ + ♻ ☆ Aligning Audio-Visual Joint Representations with an Agentic Workflow + + +
+ Visual content and accompanying audio signals naturally form a joint
+representation that can improve audio-visual (AV) applications. While many
+studies develop AV representation learning frameworks, the importance of AV
+data alignment for achieving high-quality representations is usually
+underestimated. We observe that an audio signal may contain background noise
+interference. Also, non-synchronization may appear between audio and video
+streams. Such loose data alignment limits representation quality and degrades
+application performance. In this paper, we propose to improve AV joint
+representations from a data-centric perspective by aligning audio signals to
+visual data. Our alignment is conducted in an agentic workflow controlled by an
+LLM-based assistant named AVAgent. For each input AV data pair, our AVAgent
+uses a multi-modal LLM to convert audio and visual data into language
+descriptions separately (i.e., tool use). Then, AVAgent reasons whether this
+paired data is aligned well and plans to edit the audio signal if needed (i.e.,
+planning). The audio editing is executed by predefined actions that filter
+noise or augment data. Moreover, we use a VLM to evaluate how well modified
+audio signals match the visual content and provide feedback to AVAgent (i.e.,
+reflection). The tool use, planning, and reflection steps operate cyclically,
+forming an agentic workflow in which audio signals are gradually aligned to
+visual content. As a result, existing methods can directly leverage the aligned
+AV data via our agentic workflow to improve AV joint representations. The
+experimental results comprehensively demonstrate the state-of-the-art
+performance of the proposed approach against previous baselines in diverse
+downstream tasks.
+
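+ A minimal sketch of the cyclic tool-use / planning / reflection control flow
+described above; every helper function is a hypothetical stand-in for the
+paper's multi-modal LLM, predefined audio-editing actions, and VLM judge,
+stubbed here only so the loop structure is runnable:
+
+    def describe(signal):                  # tool use: modality -> text
+        return f"description of {signal}"
+
+    def plan_edit(audio_desc, video_desc): # planning: pick an edit, if any
+        return "denoise" if "noisy" in audio_desc else None
+
+    def edit_audio(audio, action):         # predefined action, e.g. filtering
+        return audio.replace("noisy ", "")
+
+    def vlm_match_score(audio, video):     # reflection: audio/visual match
+        return 0.4 if "noisy" in audio else 0.9
+
+    def align(audio, video, max_rounds=3, accept=0.8):
+        for _ in range(max_rounds):
+            action = plan_edit(describe(audio), describe(video))
+            if action is None or vlm_match_score(audio, video) >= accept:
+                break                      # judged aligned; stop editing
+            audio = edit_audio(audio, action)
+        return audio
+
+    print(align("noisy street audio", "street footage"))  # -> "street audio"
+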
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 21 + +
+
+
+ + ☆ Mind the Gap: A Generalized Approach for Cross-Modal Embedding Alignment + + +
+ Retrieval-Augmented Generation (RAG) systems enhance text generation by +incorporating external knowledge but often struggle when retrieving context +across different text modalities due to semantic gaps. We introduce a +generalized projection-based method, inspired by adapter modules in transfer +learning, that efficiently bridges these gaps between various text types, such +as programming code and pseudocode, or English and French sentences. Our +approach emphasizes speed, accuracy, and data efficiency, requiring minimal +resources for training and inference. By aligning embeddings from heterogeneous +text modalities into a unified space through a lightweight projection network, +our model significantly outperforms traditional retrieval methods like the +Okapi BM25 algorithm and models like Dense Passage Retrieval (DPR), while +approaching the accuracy of Sentence Transformers. Extensive evaluations +demonstrate the effectiveness and generalizability of our method across +different tasks, highlighting its potential for real-time, resource-constrained +applications. + +
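+ A minimal sketch of the core idea, a lightweight projection network trained
+on paired embeddings to bridge two text modalities (the dimensions, depth, and
+cosine objective below are illustrative assumptions, not the paper's exact
+configuration):
+
+    import torch
+    import torch.nn as nn
+
+    class Projector(nn.Module):            # small adapter-style MLP
+        def __init__(self, dim=384, hidden=512):
+            super().__init__()
+            self.net = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(),
+                                     nn.Linear(hidden, dim))
+        def forward(self, x):
+            return self.net(x)
+
+    proj = Projector()
+    opt = torch.optim.Adam(proj.parameters(), lr=1e-3)
+    src = torch.randn(256, 384)            # e.g. pseudocode embeddings
+    tgt = torch.randn(256, 384)            # paired code embeddings
+
+    for _ in range(100):                   # pull projected source toward target
+        opt.zero_grad()
+        loss = 1 - nn.functional.cosine_similarity(proj(src), tgt).mean()
+        loss.backward()
+        opt.step()
+
+ At retrieval time, source-modality queries are projected once and then
+compared to target-modality embeddings with ordinary cosine search, which is
+what keeps inference cheap.
+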
+
+ comment: 18 pages, 3 figures +
+
+
+
+
+ + ☆ ReasoningRec: Bridging Personalized Recommendations and + Human-Interpretable Explanations through LLM Reasoning + + +
+ This paper presents ReasoningRec, a reasoning-based recommendation framework +that leverages Large Language Models (LLMs) to bridge the gap between +recommendations and human-interpretable explanations. In contrast to +conventional recommendation systems that rely on implicit user-item +interactions, ReasoningRec employs LLMs to model users and items, focusing on +preferences, aversions, and explanatory reasoning. The framework utilizes a +larger LLM to generate synthetic explanations for user preferences, +subsequently used to fine-tune a smaller LLM for enhanced recommendation +accuracy and human-interpretable explanation. Our experimental study +investigates the impact of reasoning and contextual information on personalized +recommendations, revealing that the quality of contextual and personalized data +significantly influences the LLM's capacity to generate plausible explanations. +Empirical evaluations demonstrate that ReasoningRec surpasses state-of-the-art +methods by up to 12.5\% in recommendation prediction while concurrently +providing human-intelligible explanations. The code is available here: +https://github.com/millenniumbismay/reasoningrec. + +
+
+ comment: Large Language Model, Recommendation, Human-Interpretable Reasoning, + Personalization +
+
+
+
+
+ + ☆ SciPIP: An LLM-based Scientific Paper Idea Proposer + + +
+ The exponential growth of knowledge and the increasing complexity of +interdisciplinary research pose significant challenges for researchers, +including information overload and difficulties in exploring novel ideas. The +advancements in large language models (LLMs), such as GPT-4, have shown great +potential in enhancing idea proposals, but how to effectively utilize large +models for reasonable idea proposal has not been thoroughly explored. This +paper proposes a scientific paper idea proposer (SciPIP). Based on a +user-provided research background, SciPIP retrieves helpful papers from a +literature database while leveraging the capabilities of LLMs to generate more +novel and feasible ideas. To this end, 1) we construct a literature retrieval +database, extracting lots of papers' multi-dimension information for fast +access. Then, a literature retrieval method based on semantics, entity, and +citation co-occurrences is proposed to search relevant literature from multiple +aspects based on the user-provided background. 2) After literature retrieval, +we introduce dual-path idea proposal strategies, where one path infers +solutions from the retrieved literature and the other path generates original +ideas through model brainstorming. We then combine the two to achieve a good +balance between feasibility and originality. Through extensive experiments on +the natural language processing (NLP) field, we demonstrate that SciPIP can +retrieve citations similar to those of existing top conference papers and +generate many ideas consistent with them. Additionally, we evaluate the +originality of other ideas generated by SciPIP using large language models, +further validating the effectiveness of our proposed method. The code and the +database are released at https://github.com/cheerss/SciPIP. + +
+
+ comment: 25 pages, 5 figures, 19 tables +
+
+
+
+
+ + ☆ Real-Time Personalization for LLM-based Recommendation with Customized + In-Context Learning + + +
+ Frequently updating Large Language Model (LLM)-based recommender systems to +adapt to new user interests -- as done for traditional ones -- is impractical +due to high training costs, even with acceleration methods. This work explores +adapting to dynamic user interests without any model updates by leveraging +In-Context Learning (ICL), which allows LLMs to learn new tasks from few-shot +examples provided in the input. Using new-interest examples as the ICL few-shot +examples, LLMs may learn real-time interest directly, avoiding the need for +model updates. However, existing LLM-based recommenders often lose the +in-context learning ability during recommendation tuning, while the original +LLM's in-context learning lacks recommendation-specific focus. To address this, +we propose RecICL, which customizes recommendation-specific in-context learning +for real-time recommendations. RecICL organizes training examples in an +in-context learning format, ensuring that in-context learning ability is +preserved and aligned with the recommendation task during tuning. + Extensive experiments demonstrate RecICL's effectiveness in delivering +real-time recommendations without requiring model updates. Our code is +available at https://github.com/ym689/rec_icl. + +
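+ A sketch of what organizing examples in an in-context learning format can
+look like in practice; the template below is an illustrative assumption, not
+the RecICL prompt:
+
+    def build_prompt(recent_examples, target_history):
+        # Fresh (history, next item) interactions serve as few-shot examples,
+        # letting the tuned model pick up real-time interests from context.
+        shots = "\n\n".join(
+            f"User history: {', '.join(hist)}\nNext item: {nxt}"
+            for hist, nxt in recent_examples)
+        return (f"{shots}\n\nUser history: "
+                f"{', '.join(target_history)}\nNext item:")
+
+    few_shot = [(["The Matrix", "Inception"], "Interstellar"),
+                (["Toy Story", "Up"], "Coco")]
+    print(build_prompt(few_shot, ["Alien", "Blade Runner"]))
+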
+
+
+
+
+
+ ☆ CORAL: Benchmarking Multi-turn Conversational Retrieval-Augmented
+ Generation
+
+
+
+ Retrieval-Augmented Generation (RAG) has become a powerful paradigm for +enhancing large language models (LLMs) through external knowledge retrieval. +Despite its widespread attention, existing academic research predominantly +focuses on single-turn RAG, leaving a significant gap in addressing the +complexities of multi-turn conversations found in real-world applications. To +bridge this gap, we introduce CORAL, a large-scale benchmark designed to assess +RAG systems in realistic multi-turn conversational settings. CORAL includes +diverse information-seeking conversations automatically derived from Wikipedia +and tackles key challenges such as open-domain coverage, knowledge intensity, +free-form responses, and topic shifts. It supports three core tasks of +conversational RAG: passage retrieval, response generation, and citation +labeling. We propose a unified framework to standardize various conversational +RAG methods and conduct a comprehensive evaluation of these methods on CORAL, +demonstrating substantial opportunities for improving existing approaches. + +
+
+
+
+
+ + ☆ A Universal Sets-level Optimization Framework for Next Set + Recommendation CIKM2024 + + +
+ Next Set Recommendation (NSRec), encompassing related tasks such as next
+basket recommendation and temporal sets prediction, stands as a trending
+research topic. Although numerous attempts have been made on this topic, there
+are certain drawbacks: (i) Existing studies are still confined to utilizing
+objective functions commonly found in Next Item Recommendation (NIRec), such as
+binary cross entropy and BPR, which are calculated based on individual item
+comparisons; (ii) They place emphasis on building sophisticated learning models
+to capture intricate dependency relationships across sequential sets, but
+frequently overlook pivotal dependency in their objective functions; (iii) The
+diversity factor within sequential sets is frequently overlooked. In this
+research, we endeavor to unveil a universal Sets-level optimization framework
+for Next Set Recommendation (SNSRec), offering a holistic fusion of diversity
+distribution and intricate dependency relationships within temporal sets. To
+realize this, the following contributions are made: (i) We directly model the
+temporal set in a sequence as a cohesive entity, leveraging the Structured
+Determinantal Point Process (SDPP), wherein the probabilistic DPP distribution
+prioritizes collections of structures (sequential sets) instead of individual
+items; (ii) We introduce a co-occurrence representation to discern and
+acknowledge the importance of different sets; (iii) We propose a sets-level
+optimization criterion, which integrates the diversity distribution and
+dependency relations across the entire sequence of sets, guiding the model to
+recommend relevant and diversified sets. Extensive experiments on real-world
+datasets show that our approach consistently outperforms previous methods on
+both relevance and diversity.
+
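+ For reference, in the standard L-ensemble formulation underlying (S)DPPs, the
+probability of selecting an item set $S$ under kernel matrix $L$ is $P(S) =
+\det(L_S) / \det(L + I)$, where $L_S$ is the principal submatrix of $L$ indexed
+by $S$. Sets of mutually similar items yield a near-singular $L_S$ and hence
+low probability, which is what lets a DPP-based criterion reward diversity at
+the set level rather than per item.
+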
+
+ comment: Accepted at CIKM2024
+
+
+
+
+ + ☆ DataRec: A Framework for Standardizing Recommendation Data Processing + and Analysis + + +
+ Thanks to the great interest shown by researchers and companies, recommender
+systems have become a cornerstone of machine learning applications. However,
+concerns have arisen recently about the need for reproducibility, making it
+challenging to identify suitable pipelines. Several frameworks have been
+proposed to improve reproducibility, covering the entire process from data
+reading to performance evaluation. Despite this effort, these solutions often
+overlook the role of data management, do not promote interoperability, and
+neglect data analysis despite its well-known impact on recommender performance.
+To address these gaps, we propose DataRec, which facilitates using and
+manipulating recommendation datasets. DataRec supports reading and writing in
+various formats, offers filtering and splitting techniques, and enables data
+distribution analysis using well-known metrics. It encourages a unified
+approach to data manipulation by allowing data export in formats compatible
+with several recommendation frameworks.
+
+
+
+
+
+ + ☆ Understanding and Improving Adversarial Collaborative Filtering for + Robust Recommendation + + +
+ Adversarial Collaborative Filtering (ACF), which typically applies +adversarial perturbations at user and item embeddings through adversarial +training, is widely recognized as an effective strategy for enhancing the +robustness of Collaborative Filtering (CF) recommender systems against +poisoning attacks. Besides, numerous studies have empirically shown that ACF +can also improve recommendation performance compared to traditional CF. Despite +these empirical successes, the theoretical understanding of ACF's effectiveness +in terms of both performance and robustness remains unclear. To bridge this +gap, in this paper, we first theoretically show that ACF can achieve a lower +recommendation error compared to traditional CF with the same training epochs +in both clean and poisoned data contexts. Furthermore, by establishing bounds +for reductions in recommendation error during ACF's optimization process, we +find that applying personalized magnitudes of perturbation for different users +based on their embedding scales can further improve ACF's effectiveness. +Building on these theoretical understandings, we propose Personalized Magnitude +Adversarial Collaborative Filtering (PamaCF). Extensive experiments demonstrate +that PamaCF effectively defends against various types of poisoning attacks +while significantly enhancing recommendation performance. + +
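+ A sketch of the personalized-magnitude idea: scale each user's adversarial
+perturbation radius by the norm of their embedding. The FGSM-style update and
+the specific scaling rule are illustrative assumptions, not necessarily
+PamaCF's exact formulation:
+
+    import torch
+
+    user_emb = torch.randn(64, 32, requires_grad=True)  # 64 users, dim 32
+    item_emb = torch.randn(64, 32)
+    labels = torch.ones(64)                              # observed interactions
+
+    scores = (user_emb * item_emb).sum(-1)
+    loss = torch.nn.functional.binary_cross_entropy_with_logits(scores, labels)
+    loss.backward()
+
+    # Personalized radius: users with larger embedding scales get larger
+    # perturbations, as motivated by the theoretical analysis above.
+    radius = 0.1 * user_emb.detach().norm(dim=-1, keepdim=True)
+    grad = user_emb.grad
+    delta = radius * grad / (grad.norm(dim=-1, keepdim=True) + 1e-12)
+    adv_user_emb = (user_emb + delta).detach()  # feeds the adversarial loss term
+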
+
+
+
+
+ + ☆ HijackRAG: Hijacking Attacks against Retrieval-Augmented Large Language + Models + + +
+ Retrieval-Augmented Generation (RAG) systems enhance large language models +(LLMs) by integrating external knowledge, making them adaptable and +cost-effective for various applications. However, the growing reliance on these +systems also introduces potential security risks. In this work, we reveal a +novel vulnerability, the retrieval prompt hijack attack (HijackRAG), which +enables attackers to manipulate the retrieval mechanisms of RAG systems by +injecting malicious texts into the knowledge database. When the RAG system +encounters target questions, it generates the attacker's pre-determined answers +instead of the correct ones, undermining the integrity and trustworthiness of +the system. We formalize HijackRAG as an optimization problem and propose both +black-box and white-box attack strategies tailored to different levels of the +attacker's knowledge. Extensive experiments on multiple benchmark datasets show +that HijackRAG consistently achieves high attack success rates, outperforming +existing baseline attacks. Furthermore, we demonstrate that the attack is +transferable across different retriever models, underscoring the widespread +risk it poses to RAG systems. Lastly, our exploration of various defense +mechanisms reveals that they are insufficient to counter HijackRAG, emphasizing +the urgent need for more robust security measures to protect RAG systems in +real-world deployments. + +
+
+
+
+
+ + ☆ Causality-Enhanced Behavior Sequence Modeling in LLMs for Personalized + Recommendation + + +
+ Recent advancements in recommender systems have focused on leveraging Large
+Language Models (LLMs) to improve user preference modeling, yielding promising
+outcomes. However, current LLM-based approaches struggle to fully leverage user
+behavior sequences, resulting in suboptimal preference modeling for
+personalized recommendations. In this study, we propose a novel Counterfactual
+Fine-Tuning (CFT) method to address this issue by explicitly emphasizing the
+role of behavior sequences when generating recommendations. Specifically, we
+employ counterfactual reasoning to identify the causal effects of behavior
+sequences on model output and introduce a task that directly fits the
+ground-truth labels based on these effects, achieving the goal of explicit
+emphasis. Additionally, we develop a token-level weighting mechanism to adjust
+the emphasis strength for different item tokens, reflecting the diminishing
+influence of behavior sequences from earlier to later tokens when predicting an
+item. Extensive experiments on real-world datasets demonstrate that CFT
+effectively improves behavior sequence modeling. Our codes are available at
+https://github.com/itsmeyjt/CFT.
+
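+ A sketch of token-level weighting of the training loss, where emphasis decays
+over item-token positions as described above (the geometric decay is an
+illustrative choice, not necessarily CFT's schedule):
+
+    import torch
+
+    logits = torch.randn(4, 6, 1000)     # (batch, item-token positions, vocab)
+    targets = torch.randint(0, 1000, (4, 6))
+
+    per_token = torch.nn.functional.cross_entropy(
+        logits.transpose(1, 2), targets, reduction="none")  # (4, 6)
+    weights = 0.9 ** torch.arange(6, dtype=torch.float)     # earlier > later
+    loss = ((weights * per_token).sum(-1) / weights.sum()).mean()
+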
+
+
+
+
+ + ☆ Dual Contrastive Transformer for Hierarchical Preference Modeling in + Sequential Recommendation + + +
+ Sequential recommender systems (SRSs) aim to predict the subsequent items
+that may interest users by comprehensively modeling the complex preferences
+embedded in users' sequences of user-item interactions. However, most existing
+SRSs model only users' single low-level preference based on item ID information
+while ignoring the high-level preference revealed by item attribute
+information, such as item category. Furthermore, they often utilize limited
+sequence context information to predict the next item while overlooking richer
+inter-item semantic relations. To this end, in this paper, we propose a novel
+hierarchical preference modeling framework to substantially model the complex
+low- and high-level preference dynamics for accurate sequential recommendation.
+Specifically, in the framework, a novel dual-transformer module and a novel
+dual contrastive learning scheme have been designed to discriminatively learn
+users' low- and high-level preferences and to effectively enhance both low- and
+high-level preference learning respectively. In addition, a novel
+semantics-enhanced context embedding module has been devised to generate more
+informative context embeddings, further improving the recommendation
+performance. Extensive experiments on six real-world datasets demonstrate both
+the superiority of our proposed method over the state-of-the-art ones and the
+rationality of our design.
+
+
+
+
+
+ + ♻ ☆ Modern Hopfield Networks meet Encoded Neural Representations -- + Addressing Practical Considerations + + +
+ Content-addressable memories such as Modern Hopfield Networks (MHN) have been
+studied as mathematical models of auto-association and storage/retrieval in
+human declarative memory, yet their practical use for large-scale content
+storage faces challenges. Chief among them is the occurrence of meta-stable
+states, particularly when handling large amounts of high-dimensional content.
+This paper introduces Hopfield Encoding Networks (HEN), a framework that
+integrates encoded neural representations into MHNs to improve pattern
+separability and reduce meta-stable states. We show that HEN can also be used
+for retrieval in the context of hetero-association of images with natural
+language queries, thus removing the limitation of requiring access to partial
+content in the same domain. Experimental results demonstrate a substantial
+reduction in meta-stable states and increased storage capacity while still
+enabling perfect recall of a significantly larger number of inputs, advancing
+the practical utility of associative memory networks for real-world tasks.
+
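+ For context, the retrieval dynamics that HEN inherits from modern Hopfield
+networks (Ramsauer et al., 2020) fit in a few lines; HEN's contribution is to
+store encoded representations rather than raw inputs, which this sketch does
+not show:
+
+    import numpy as np
+
+    rng = np.random.default_rng(1)
+    X = rng.normal(size=(10, 64))          # 10 stored patterns (rows), dim 64
+    xi = X[3] + 0.3 * rng.normal(size=64)  # noisy cue for pattern 3
+    beta = 4.0                             # inverse temperature
+
+    for _ in range(3):                     # xi <- softmax(beta * X xi)^T X
+        s = beta * (X @ xi)
+        w = np.exp(s - s.max())            # max-shift for numerical stability
+        xi = (w / w.sum()) @ X
+
+    print(np.argmax(X @ xi))               # -> 3: the cue converges to pattern 3
+
+ Meta-stable states arise when several stored patterns receive comparable
+softmax weight, so the update settles on a mixture rather than a single
+pattern; better-separated (encoded) patterns make this less likely, which is
+the intuition behind HEN.
+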
+
+ comment: 17 pages, 8 figures, accepted as a workshop paper at UniReps @ + Neurips 2024 +
+
+
+
+
+ + ♻ ☆ INDUS: Effective and Efficient Language Models for Scientific + Applications EMNLP 2024 + + +
+ Large language models (LLMs) trained on general domain corpora showed
+remarkable results on natural language processing (NLP) tasks. However,
+previous research demonstrated that LLMs trained using domain-focused corpora
+perform better on specialized tasks. Inspired by this insight, we developed
+INDUS, a comprehensive suite of LLMs tailored for the closely-related domains
+of Earth science, biology, physics, heliophysics, planetary sciences and
+astrophysics, and trained using curated scientific corpora drawn from diverse
+data sources. The suite of models includes: (1) an encoder model trained using
+domain-specific vocabulary and corpora to address NLP tasks, (2) a
+contrastive-learning based text embedding model trained using a diverse set of
+datasets to address information retrieval tasks and (3) smaller versions of
+these models created using knowledge distillation for applications which have
+latency or resource constraints. We also created three new scientific benchmark
+datasets, CLIMATE-CHANGE NER (entity recognition), NASA-QA (extractive QA) and
+NASA-IR (IR), to accelerate research in these multi-disciplinary fields. We
+show that our models outperform both general-purpose (RoBERTa) and
+domain-specific (SCIBERT) encoders on these new tasks as well as existing tasks
+in the domains of interest. Furthermore, we demonstrate the use of these models
+in two industrial settings -- as a retrieval model for large-scale vector
+search applications and in automatic content tagging systems.
+
+
+ comment: EMNLP 2024 (Industry Track) +
+
+
+
+
+ + ♻ ☆ Retention Induced Biases in a Recommendation System with Heterogeneous + Users + + +
+ I examine a conceptual model of a recommendation system (RS) with user inflow +and churn dynamics. When inflow and churn balance out, the user distribution +reaches a steady state. Changing the recommendation algorithm alters the steady +state and creates a transition period. During this period, the RS behaves +differently from its new steady state. In particular, A/B experiment metrics +obtained in transition periods are biased indicators of the RS's long-term +performance. Scholars and practitioners, however, often conduct A/B tests +shortly after introducing new algorithms to validate their effectiveness. This +A/B experiment paradigm, widely regarded as the gold standard for assessing RS +improvements, may consequently yield false conclusions. I also briefly touch on +the data bias caused by the user retention dynamics. + +
+
+ comment: This preprint has not undergone peer review (when applicable) or any + post-submission improvements or corrections. The Version of Record of this + contribution is published in advances in Bias and Fairness in Information + Retrieval. BIAS 2024. Communications in Computer and Information Science, vol + 2227. Springer, and is available online at + https://doi.org/10.1007/978-3-031-71975-2_2 +
+
+
+
+
+ + ♻ ☆ Scientific and Technological Information Oriented Semantics-adversarial + and Media-adversarial Cross-media Retrieval + + +
+ Cross-media retrieval of scientific and technological information is one of
+the important tasks in cross-media research. It obtains target information from
+massive multi-source and heterogeneous scientific and technological resources,
+which helps to design applications that meet users' needs, including scientific
+and technological information recommendation and personalized scientific and
+technological information retrieval. The core of cross-media retrieval is to
+learn a common subspace, so that data from different media can be directly
+compared with each other after being mapped into this subspace. In subspace
+learning, existing methods often focus on modeling the discrimination of
+intra-media data and the invariance of inter-media data after mapping; however,
+they ignore the semantic consistency of inter-media data before and after
+mapping and the media discrimination of intra-semantics data, which limits the
+results of cross-media retrieval. In light of this, we propose a scientific and
+technological information oriented Semantics-adversarial and Media-adversarial
+Cross-media Retrieval method (SMCR) to find an effective common subspace.
+Specifically, SMCR minimizes the loss of inter-media semantic consistency in
+addition to modeling intra-media semantic discrimination, to preserve semantic
+similarity before and after mapping. Furthermore, SMCR constructs a basic
+feature mapping network and a refined feature mapping network to jointly
+minimize the media discriminative loss within semantics, so as to enhance the
+feature mapping network's ability to confuse the media discriminant network.
+Experimental results on two datasets demonstrate that the proposed SMCR
+outperforms state-of-the-art methods in cross-media retrieval.
+
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ FLIP: Fine-grained Alignment between ID-based Models and Pretrained + Language Models for CTR Prediction RecSys 2024 + + +
+ Click-through rate (CTR) prediction serves as a core function module in
+various personalized online services. Traditional ID-based models for CTR
+prediction take as inputs the one-hot encoded ID features of the tabular
+modality, which capture collaborative signals via feature interaction modeling.
+But one-hot encoding discards the semantic information included in the textual
+features. Recently, the emergence of Pretrained Language Models (PLMs) has
+given rise to another paradigm, which takes as inputs the sentences of the
+textual modality obtained by hard prompt templates and adopts PLMs to extract
+semantic knowledge. However, PLMs often face challenges in capturing field-wise
+collaborative signals and distinguishing features with subtle textual
+differences. In this paper, to leverage the benefits of both paradigms while
+overcoming their limitations, we propose to conduct Fine-grained feature-level
+ALignment between ID-based Models and Pretrained Language Models (FLIP) for CTR
+prediction. Unlike most methods that solely rely on global views through
+instance-level contrastive learning, we design a novel jointly masked
+tabular/language modeling task to learn fine-grained alignment between tabular
+IDs and word tokens. Specifically, the masked data of one modality (IDs or
+tokens) has to be recovered with the help of the other modality, which
+establishes feature-level interaction and alignment via sufficient mutual
+information extraction between the dual modalities. Moreover, we propose to
+jointly finetune the ID-based model and the PLM by adaptively combining the
+output of both models, thus achieving superior performance in downstream CTR
+prediction tasks. Extensive experiments on three real-world datasets
+demonstrate that FLIP outperforms SOTA baselines, and is highly compatible with
+various ID-based models and PLMs. The code is at
+\url{https://github.com/justarter/FLIP}.
+
+
+ comment: Accepted by RecSys 2024 +
+
+
+
+
+ + ♻ ☆ R^2AG: Incorporating Retrieval Information into Retrieval Augmented + Generation EMNLP 2024 + + +
+ Retrieval augmented generation (RAG) has been applied in many scenarios to
+augment large language models (LLMs) with external documents provided by
+retrievers. However, a semantic gap exists between LLMs and retrievers due to
+differences in their training objectives and architectures. This misalignment
+forces LLMs to passively accept the documents provided by the retrievers,
+leading to incomprehension in the generation process, where the LLMs are
+burdened with the task of distinguishing these documents using their inherent
+knowledge. This paper proposes R$^2$AG, a novel enhanced RAG framework to fill
+this gap by incorporating Retrieval information into Retrieval Augmented
+Generation. Specifically, R$^2$AG utilizes the nuanced features from the
+retrievers and employs a R$^2$-Former to capture retrieval information. Then, a
+retrieval-aware prompting strategy is designed to integrate retrieval
+information into LLMs' generation. Notably, R$^2$AG suits low-resource
+scenarios where LLMs and retrievers are frozen. Extensive experiments across
+five datasets validate the effectiveness, robustness, and efficiency of
+R$^2$AG. Our analysis reveals that retrieval information serves as an anchor to
+aid LLMs in the generation process, thereby filling the semantic gap.
+
+
+ comment: Accepted to EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ CIDGMed: Causal Inference-Driven Medication Recommendation with Enhanced + Dual-Granularity Learning + + +
+ Medication recommendation aims to integrate patients' long-term health +records to provide accurate and safe medication combinations for specific +health states. Existing methods often fail to deeply explore the true causal +relationships between diseases/procedures and medications, resulting in biased +recommendations. Additionally, in medication representation learning, the +relationships between information at different granularities of medications, +coarse-grained (medication itself) and fine-grained (molecular level), are not +effectively integrated, leading to biases in representation learning. To +address these limitations, we propose the Causal Inference-driven +Dual-Granularity Medication Recommendation method (CIDGMed). Our approach +leverages causal inference to uncover the relationships between +diseases/procedures and medications, thereby enhancing the rationality and +interpretability of recommendations. By integrating coarse-grained medication +effects with fine-grained molecular structure information, CIDGMed provides a +comprehensive representation of medications. Additionally, we employ a bias +correction model during the prediction phase to further refine recommendations, +ensuring both accuracy and safety. Through extensive experiments, CIDGMed +significantly outperforms current state-of-the-art models across multiple +metrics, achieving a 2.54% increase in accuracy, a 3.65% reduction in side +effects, and a 39.42% improvement in time efficiency. Additionally, we +demonstrate the rationale of CIDGMed through a case study. + +
+
+
+
+
+ + ♻ ☆ ChatQA: Surpassing GPT-4 on Conversational QA and RAG NeurIPS 2024 + + +
+ In this work, we introduce ChatQA, a suite of models that outperform GPT-4 on +retrieval-augmented generation (RAG) and conversational question answering +(QA). To enhance generation, we propose a two-stage instruction tuning method +that significantly boosts the performance of RAG. For effective retrieval, we +introduce a dense retriever optimized for conversational QA, which yields +results comparable to the alternative state-of-the-art query rewriting models, +while substantially reducing deployment costs. We also present the ChatRAG +Bench, which encompasses ten datasets covering comprehensive evaluations on +RAG, table-related QA, arithmetic calculations, and scenarios involving +unanswerable questions. Our ChatQA-1.0-70B (score: 54.14), built on Llama2, a +weaker foundation model than GPT-4, can slightly outperform GPT-4-0613 (score: +53.90) and GPT-4-Turbo-2024-04-09 (score: 54.03) on the ChatRAG Bench, without +relying on any synthetic data from OpenAI GPT models. Notably, the +Llama3-ChatQA-1.5-70B model surpasses the accuracy of GPT-4-Turbo-2024-04-09, +achieving a 4.4% improvement. To advance research in this field, we +open-sourced the model weights, instruction tuning data, ChatRAG Bench, and +retriever for the community: https://chatqa-project.github.io/. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ AugTriever: Unsupervised Dense Retrieval and Domain Adaptation by + Scalable Data Augmentation + + +
+ Dense retrievers have made significant strides in text retrieval and
+open-domain question answering. However, most of these achievements have relied
+heavily on extensive human-annotated supervision. In this study, we aim to
+develop unsupervised methods for improving dense retrieval models. We propose
+two approaches that enable annotation-free and scalable training by creating
+pseudo query-document pairs: query extraction and transferred query generation.
+The query extraction method involves selecting salient spans from the original
+document to generate pseudo queries. On the other hand, the transferred query
+generation method utilizes generation models trained for other NLP tasks, such
+as summarization, to produce pseudo queries. Through extensive experimentation,
+we demonstrate that models trained using these augmentation methods can achieve
+comparable, if not better, performance than multiple strong dense baselines.
+Moreover, combining these strategies leads to further improvements, resulting
+in superior performance of unsupervised dense retrieval, unsupervised domain
+adaptation and supervised finetuning, benchmarked on both BEIR and ODQA
+datasets. Code and datasets are publicly available at
+https://github.com/salesforce/AugTriever.
+
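+ A sketch of the query-extraction idea: pick a salient span of a document and
+treat it as a pseudo query for that document (the rare-word salience heuristic
+below is an illustrative assumption, not AugTriever's exact selection method):
+
+    import re
+    from collections import Counter
+
+    def pseudo_query(doc, span_len=8):
+        words = re.findall(r"\w+", doc.lower())
+        freq = Counter(words)
+        spans = [words[i:i + span_len]
+                 for i in range(len(words) - span_len + 1)]
+        # Salience heuristic: prefer spans made of words rare in the document.
+        best = max(spans, key=lambda s: sum(1.0 / freq[w] for w in s))
+        return " ".join(best)
+
+    doc = ("Dense retrievers map queries and documents into a shared vector "
+           "space so that relevant documents are close to the query embedding.")
+    print(pseudo_query(doc))   # (pseudo_query, doc) becomes a training pair
+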
+
+ comment: DCAI24, October 25, 2024, Boise, ID +
+
+
+
+
+ + ♻ ☆ Summarization-Based Document IDs for Generative Retrieval with Language + Models EMNLP 2024 + + +
+ Generative retrieval (Wang et al., 2022; Tay et al., 2022) is a popular +approach for end-to-end document retrieval that directly generates document +identifiers given an input query. We introduce summarization-based document +IDs, in which each document's ID is composed of an extractive summary or +abstractive keyphrases generated by a language model, rather than an integer ID +sequence or bags of n-grams as proposed in past work. We find that abstractive, +content-based IDs (ACID) and an ID based on the first 30 tokens are very +effective in direct comparisons with previous approaches to ID creation. We +show that using ACID improves top-10 and top-20 recall by 15.6% and 14.4% +(relative) respectively versus the cluster-based integer ID baseline on the +MSMARCO 100k retrieval task, and 9.8% and 9.9% respectively on the +Wikipedia-based NQ 100k retrieval task. Our results demonstrate the +effectiveness of human-readable, natural-language IDs created through +summarization for generative retrieval. We also observed that extractive IDs +outperformed abstractive IDs on Wikipedia articles in NQ but not the snippets +in MSMARCO, which suggests that document characteristics affect generative +retrieval performance. + +
+
+ comment: To appear at the NLP for Wikipedia Workshop in EMNLP 2024 +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ The Trail Making Test in Virtual Reality (TMT-VR): The Effects of + Interaction Modes and Gaming Skills on Cognitive Performance of Young Adults + + +
+ Virtual Reality (VR) is increasingly used in neuropsychological assessments
+due to its ability to simulate real-world environments. This study aimed to
+develop and evaluate the Trail Making Test in VR (TMT-VR) and investigate the
+effects of different interaction modes and gaming skills on cognitive
+performance. A total of 71 young female and male adults (aged 18-35) with high
+and low gaming skills participated in this study. Participants completed the
+TMT-VR using three interaction modes: eye tracking, head movement, and
+controller. Performance metrics included task completion time and accuracy.
+User experience, usability, and acceptability of TMT-VR were also examined.
+Results showed that both the eye tracking and head movement modes
+significantly outperformed the controller in terms of task completion time and
+accuracy. No significant differences were found between the eye tracking and
+head movement modes. Gaming skills did not significantly influence task
+performance in any interaction mode. The TMT-VR demonstrated high usability,
+acceptability, and user experience among participants. The findings suggest
+that VR-based assessments can effectively measure cognitive performance
+without being influenced by prior gaming skills, indicating potential
+applicability for diverse populations.
+
+
+ comment: 25 Pages, 7 Figures, 4 Tables +
+
+
+
+
+ + ☆ Transfer Learning in Vocal Education: Technical Evaluation of Limited + Samples Describing Mezzo-soprano + + +
+ Vocal education in the music field is difficult to quantify due to individual
+differences in singers' voices and the varying quantitative criteria for
+singing techniques. Deep learning has great potential for application in music
+education due to its ability to handle complex data and perform quantitative
+analysis efficiently. However, accurately evaluating rare vocal types, such as
+Mezzo-soprano, with limited samples requires extensive well-annotated data to
+support deep learning models. To this end, we perform transfer learning by
+employing deep learning models pre-trained on the ImageNet and Urbansound8k
+datasets to improve the precision of vocal technique evaluation. Furthermore,
+we tackle the problem of the lack of samples by constructing a dedicated
+dataset, the Mezzo-soprano Vocal Set (MVS), for vocal technique assessment. Our
+experimental results indicate that transfer learning increases the overall
+accuracy (OAcc) of all models by an average of 8.3%, with the highest accuracy
+at 94.2%. We not only provide a novel approach to evaluating Mezzo-soprano
+vocal techniques but also introduce a new quantitative assessment method for
+music education.
+
+
+
+
+
+ + ☆ DOA-Aware Audio-Visual Self-Supervised Learning for Sound Event + Localization and Detection + + +
+ This paper describes sound event localization and detection (SELD) for
+spatial audio recordings captured by first-order ambisonics (FOA) microphones.
+In this task, one may train a deep neural network (DNN) using FOA data
+annotated with the classes and directions of arrival (DOAs) of sound events.
+However, the performance of this approach is severely bounded by the amount of
+annotated data. To overcome this limitation, we propose a novel method of
+pretraining the feature extraction part of the DNN in a self-supervised manner.
+We use spatial audio-visual recordings abundantly available as virtual reality
+content. Assuming that sound objects are concurrently observed by the FOA
+microphones and the omni-directional camera, we jointly train audio and visual
+encoders with contrastive learning such that the audio and visual embeddings of
+the same recording and DOA are made close. A key feature of our method is that
+the DOA-wise audio embeddings are jointly extracted from the raw audio data,
+while the DOA-wise visual embeddings are separately extracted from local visual
+crops centered on the corresponding DOAs. This encourages the latent features
+of the audio encoder to represent both the classes and DOAs of sound events. An
+experiment on the 20-hour DCASE2022 Task 3 dataset shows that pretraining on
+100 hours of non-annotated audio-visual recordings reduced the SELD error score
+from 36.4 pts to 34.9 pts.
+
+
+ comment: Accepted to APSIPA2023 +
+
+
+
+
+ + ♻ ☆ MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video + Understanding NeurIPS 2024 + + +
+ The advent of large vision-language models (LVLMs) has spurred research into
+their applications in multi-modal contexts, particularly in video
+understanding. Traditional VideoQA benchmarks, despite providing quantitative
+metrics, often fail to encompass the full spectrum of video content and
+inadequately assess models' temporal comprehension. To address these
+limitations, we introduce MMBench-Video, a quantitative benchmark designed to
+rigorously evaluate LVLMs' proficiency in video understanding. MMBench-Video
+incorporates lengthy videos from YouTube and employs free-form questions,
+mirroring practical use cases. The benchmark is meticulously crafted to probe
+the models' temporal reasoning skills, with all questions human-annotated
+according to a carefully constructed ability taxonomy. We employ GPT-4 for
+automated assessment, demonstrating superior accuracy and robustness over
+earlier LLM-based evaluations. Utilizing MMBench-Video, we have conducted
+comprehensive evaluations that include both proprietary and open-source LVLMs
+for images and videos. MMBench-Video stands as a valuable resource for the
+research community, facilitating improved evaluation of LVLMs and catalyzing
+progress in the field of video understanding. The evaluation code of
+MMBench-Video will be integrated into VLMEvalKit:
+https://github.com/open-compass/VLMEvalKit.
+
+
+ comment: Accepted in NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ Feature distribution Adaptation Network for Speech Emotion Recognition + + +
+ In this paper, we propose a novel deep inductive transfer learning framework,
+named feature distribution adaptation network, to tackle the challenging
+multi-modal speech emotion recognition problem. Our method uses deep transfer
+learning strategies to align visual and audio feature distributions and obtain
+a consistent representation of emotion, thereby improving the performance of
+speech emotion recognition. In our model, pre-trained ResNet-34 networks are
+utilized to extract features from facial expression images and acoustic Mel
+spectrograms, respectively. Then, a cross-attention mechanism is introduced to
+model the intrinsic similarity relationships of the multi-modal features.
+Finally, the multi-modal feature distribution adaptation is performed
+efficiently with a feed-forward network, which is extended using the local
+maximum mean discrepancy loss. Experiments are carried out on two benchmark
+datasets, and the results demonstrate that our model can achieve excellent
+performance compared with existing ones.
+
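+ For reference, a sketch of a (global) maximum mean discrepancy between two
+feature batches with a Gaussian kernel; the paper's local MMD variant
+additionally conditions on emotion classes, which is omitted here for brevity:
+
+    import torch
+
+    def gaussian_mmd(x, y, sigma=1.0):
+        def k(a, b):                       # Gaussian (RBF) kernel matrix
+            return torch.exp(-torch.cdist(a, b) ** 2 / (2 * sigma ** 2))
+        return k(x, x).mean() + k(y, y).mean() - 2 * k(x, y).mean()
+
+    visual = torch.randn(128, 256)         # visual features
+    audio = torch.randn(128, 256) + 0.5    # audio features, shifted distribution
+    print(gaussian_mmd(visual, audio))     # grows as the distributions diverge
+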
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ A Pointer Network-based Approach for Joint Extraction and Detection of + Multi-Label Multi-Class Intents EMNLP 2024 + + +
+ In task-oriented dialogue systems, intent detection is crucial for +interpreting user queries and providing appropriate responses. Existing +research primarily addresses simple queries with a single intent, lacking +effective systems for handling complex queries with multiple intents and +extracting different intent spans. Additionally, there is a notable absence of +multilingual, multi-intent datasets. This study addresses three critical tasks: +extracting multiple intent spans from queries, detecting multiple intents, and +developing a multi-lingual multi-label intent dataset. We introduce a novel +multi-label multi-class intent detection dataset (MLMCID-dataset) curated from +existing benchmark datasets. We also propose a pointer network-based +architecture (MLMCID) to extract intent spans and detect multiple intents with +coarse and fine-grained labels in the form of sextuplets. Comprehensive +analysis demonstrates the superiority of our pointer network-based system over +baseline approaches in terms of accuracy and F1-score across various datasets. + +
+
+ comment: Accepted at EMNLP 2024 Findings (Long Paper) +
+
+
+
+
+ + ☆ Pushing the Performance Envelope of DNN-based Recommendation Systems + Inference on GPUs MICRO + + +
+ Personalized recommendation is a ubiquitous application on the internet, with
+many industries and hyperscalers extensively leveraging Deep Learning
+Recommendation Models (DLRMs) for their personalization needs (like ad serving
+or movie suggestions). With growing model and dataset sizes pushing computation
+and memory requirements, GPUs are being increasingly preferred for executing
+DLRM inference. However, serving newer DLRMs, while meeting acceptable
+latencies, remains challenging, making traditional deployments increasingly
+GPU-hungry and raising inference serving costs. In this paper, we show that the
+embedding stage continues to be the primary bottleneck in the GPU inference
+pipeline, leading up to a 3.2x embedding-only performance slowdown.
+ To thoroughly grasp the problem, we conduct a detailed microarchitecture
+characterization and highlight the presence of low occupancy in the standard
+embedding kernels. By leveraging direct compiler optimizations, we achieve
+optimal occupancy, pushing the performance by up to 53%. Yet, long memory
+latency stalls continue to exist. To tackle this challenge, we propose
+specialized plug-and-play-based software prefetching and L2 pinning techniques,
+which help in hiding and decreasing the latencies. Further, we propose
+combining them, as they complement each other. Experimental evaluations using
+A100 GPUs with large models and datasets show that our proposed techniques
+improve performance by up to 103% for the embedding stage, and up to 77% for
+the overall DLRM inference pipeline.
+
+
+ comment: This work has been accepted in the 57th MICRO + (https://microarch.org/micro57/program/). Please check appendix for details + on reproducing our work including codebase and steps +
+
+
+
+
+ + ☆ ContextIQ: A Multimodal Expert-Based Video Retrieval System for + Contextual Advertising WACV 2025 + + +
+ Contextual advertising serves ads that are aligned to the content that the
+user is viewing. The rapid growth of video content on social platforms and
+streaming services, along with privacy concerns, has increased the need for
+contextual advertising. Placing the right ad in the right context creates a
+seamless and pleasant ad viewing experience, resulting in higher audience
+engagement and, ultimately, better ad monetization. From a technology
+standpoint, effective contextual advertising requires a video retrieval system
+capable of understanding complex video content at a very granular level.
+Current text-to-video retrieval models based on joint multimodal training
+demand large datasets and computational resources, limiting their practicality
+and lacking the key functionalities required for ad ecosystem integration. We
+introduce ContextIQ, a multimodal expert-based video retrieval system designed
+specifically for contextual advertising. ContextIQ utilizes modality-specific
+experts -- video, audio, transcript (captions), and metadata such as objects,
+actions, emotion, etc. -- to create semantically rich video representations.
+We show that our system, without joint training, achieves better or comparable
+results to state-of-the-art models and commercial solutions on multiple
+text-to-video retrieval benchmarks. Our ablation studies highlight the benefits
+of leveraging multiple modalities for enhanced video retrieval accuracy instead
+of using a vision-language model alone. Furthermore, we show how video
+retrieval systems such as ContextIQ can be used for contextual advertising in
+an ad ecosystem while also addressing concerns related to brand safety and
+filtering inappropriate content.
+
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ☆ Synthetic Data Generation with Large Language Models for Personalized + Community Question Answering + + +
+ Personalization in Information Retrieval (IR) is a topic that the research
+community has studied for a long time. However, there is still a lack of
+datasets to conduct large-scale evaluations of personalized IR; this is mainly
+due to the fact that collecting and curating high-quality user-related
+information requires significant costs and time investment. Furthermore, the
+creation of datasets for Personalized IR (PIR) tasks is affected by both
+privacy concerns and the need for accurate user-related data, which are often
+not publicly available. Recently, researchers have started to explore the use
+of Large Language Models (LLMs) to generate synthetic datasets, which is a
+possible solution to generate data for low-resource tasks. In this paper, we
+investigate the potential of LLMs for generating synthetic documents to train
+an IR system for a Personalized Community Question Answering task. To study the
+effectiveness of IR models fine-tuned on LLM-generated data, we introduce a new
+dataset, named Sy-SE-PQA. We build Sy-SE-PQA based on an existing dataset,
+SE-PQA, which consists of questions and answers posted on the popular
+StackExchange communities. Starting from questions in SE-PQA, we generate
+synthetic answers using different prompt techniques and LLMs. Our findings
+suggest that LLMs have high potential in generating data tailored to users'
+needs. The synthetic data can replace human-written training data, even if the
+generated data may contain incorrect information.
+
+
+ comment: Accepted in WI-IAT '24 +
+
+
+
+
+ + ☆ SimRec: Mitigating the Cold-Start Problem in Sequential Recommendation + by Integrating Item Similarity RecSys 2024 + + +
+ Sequential recommendation systems often struggle to make predictions or take +action when dealing with cold-start items that have limited amount of +interactions. In this work, we propose SimRec - a new approach to mitigate the +cold-start problem in sequential recommendation systems. SimRec addresses this +challenge by leveraging the inherent similarity among items, incorporating item +similarities into the training process through a customized loss function. +Importantly, this enhancement is attained with identical model architecture and +the same amount of trainable parameters, resulting in the same inference time +and requiring minimal additional effort. This novel approach results in a +robust contextual sequential recommendation model capable of effectively +handling rare items, including those that were not explicitly seen during +training, thereby enhancing overall recommendation performance. Rigorous +evaluations against multiple baselines on diverse datasets showcase SimRec's +superiority, particularly in scenarios involving items occurring less than 10 +times in the training data. The experiments reveal an impressive improvement, +with SimRec achieving up to 78% higher HR@10 compared to SASRec. Notably, +SimRec outperforms strong baselines on sparse datasets while delivering on-par +performance on dense datasets. Our code is available at +https://github.com/amazon-science/sequential-recommendation-using-similarity. + +
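+ A sketch of folding item similarity into the loss: replace the one-hot
+next-item target with a distribution smoothed over similar items, so that rare
+items borrow training signal from their frequent neighbors (the smoothing
+scheme below is an illustrative assumption, not SimRec's exact loss):
+
+    import torch
+
+    n_items = 5
+    logits = torch.randn(2, n_items)       # model scores for 2 sequences
+    target = torch.tensor([1, 3])          # ground-truth next items
+    sim = torch.softmax(torch.randn(n_items, n_items), dim=-1)  # item similarity
+
+    alpha = 0.8                            # weight kept on the true next item
+    soft = alpha * torch.eye(n_items)[target] + (1 - alpha) * sim[target]
+    loss = -(soft * torch.log_softmax(logits, dim=-1)).sum(-1).mean()
+
+ Because the architecture and parameter count are unchanged, this kind of
+loss-only modification keeps inference time identical, which is the property
+the abstract emphasizes.
+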
+
+ comment: ACM RecSys 2024 Workshop on Context-Aware Recommender Systems +
+
+
+
+
+ + ☆ Testing Identity of Distributions under Kolmogorov Distance in + Polylogarithmic Space + + +
+ Suppose we have a sample from a distribution $D$ and we want to test whether
+$D = D^*$ for a fixed distribution $D^*$. Specifically, we want to reject,
+with constant probability, if the distance of $D$ from $D^*$ is $\geq
+\varepsilon$ in a given metric. In the case of continuous distributions, this
+has been studied thoroughly in the statistics literature. Namely, for the
+well-studied Kolmogorov metric a test is known that uses the optimal
+$O(1/\varepsilon^2)$ samples.
+ However, this test naively also uses space $O(1/\varepsilon^2)$, and
+previous work improved this to $O(1/\varepsilon)$. In this paper, we show
+that much less space suffices -- we give an algorithm that uses space
+$O(\log^4 \varepsilon^{-1})$ in the streaming setting while also using an
+asymptotically optimal number of samples. This is in contrast with the
+standard total variation distance on discrete distributions for which such
+space reduction is known to be impossible. Finally, we state 9 related open
+problems that we hope will spark interest in this and related problems.
+
+
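+ For intuition, the (non-streaming) statistic being tested is easy to state:
+draw $O(1/\varepsilon^2)$ samples and compare the empirical CDF against the
+reference CDF. The sketch below illustrates only this baseline, not the
+paper's polylogarithmic-space algorithm, and the rejection threshold is a
+simplified stand-in.
+
+import numpy as np
+
+def ks_identity_test(samples, cdf_star, eps):
+    # reject (return True) if the Kolmogorov distance estimate exceeds eps / 2
+    xs = np.sort(samples)
+    n = len(xs)
+    f = cdf_star(xs)                          # reference CDF at the sample points
+    ecdf_hi = np.arange(1, n + 1) / n         # empirical CDF just after each point
+    ecdf_lo = np.arange(0, n) / n             # empirical CDF just before each point
+    ks_stat = max(np.max(np.abs(ecdf_hi - f)), np.max(np.abs(ecdf_lo - f)))
+    return ks_stat >= eps / 2
+
+# example: squared uniform samples are rejected against the uniform CDF
+# ks_identity_test(np.random.rand(10000) ** 2, lambda x: np.clip(x, 0, 1), 0.1)
+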
+
+
+
+
+ + ☆ Modeling Temporal Positive and Negative Excitation for Sequential + Recommendation + + +
+ Sequential recommendation aims to predict the next item that will interest a
+user by modeling the user's interest in items over time. Most existing works
+on sequential recommendation model users' dynamic interest in specific items
+while overlooking users' static interest revealed by static item attribute
+information, e.g., category or brand. Moreover, existing works often consider
+only the positive excitation of a user's historical interactions on his/her
+next choice among candidate items while ignoring the commonly existing
+negative excitation, resulting in insufficient modeling of dynamic interest.
+Overlooking static interest and negative excitation leads to incomplete
+interest modeling and thus impedes recommendation performance. To this end,
+in this paper, we propose modeling both static interest and negative
+excitation for dynamic interest to further improve recommendation
+performance. Accordingly, we design a novel Static-Dynamic Interest Learning
+(SDIL) framework featuring a novel Temporal Positive and Negative Excitation
+Modeling (TPNE) module for accurate sequential recommendation. TPNE is
+specially designed for comprehensively modeling dynamic interest based on
+temporal positive and negative excitation learning. Extensive experiments on
+three real-world datasets show that SDIL can effectively capture both static
+and dynamic interest and outperforms state-of-the-art baselines.
+
+
+
+
+
+
+ + ☆ Dual Conditional Diffusion Models for Sequential Recommendation + + +
+ Recent advancements in diffusion models have shown promising results in +sequential recommendation (SR). However, current diffusion-based methods still +exhibit two key limitations. First, they implicitly model the diffusion process +for target item embeddings rather than the discrete target item itself, leading +to inconsistency in the recommendation process. Second, existing methods rely +on either implicit or explicit conditional diffusion models, limiting their +ability to fully capture the context of user behavior and leading to less +robust target item embeddings. In this paper, we propose the Dual Conditional +Diffusion Models for Sequential Recommendation (DCRec), introducing a +discrete-to-continuous sequential recommendation diffusion framework. Our +framework introduces a complete Markov chain to model the transition from the +reversed target item representation to the discrete item index, bridging the +discrete and continuous item spaces for diffusion models and ensuring +consistency with the diffusion framework. Building on this framework, we +present the Dual Conditional Diffusion Transformer (DCDT) that incorporates the +implicit conditional and the explicit conditional for diffusion-based SR. +Extensive experiments on public benchmark datasets demonstrate that DCRec +outperforms state-of-the-art methods. + +
+
+
+
+
+ + ☆ Guided Diffusion-based Counterfactual Augmentation for Robust + Session-based Recommendation + + +
+ Session-based recommendation (SR) models aim to recommend top-K items to a
+user, based on the user's behaviour during the current session. Several SR
+models have been proposed in the literature; however, concerns have been
+raised about their susceptibility to inherent biases in the training data
+(observed data), such as popularity bias. SR models, when trained on biased
+training data, may encounter performance challenges on out-of-distribution
+data in real-world scenarios. One way to mitigate popularity bias is
+counterfactual data augmentation. Compared to prior works that rely on
+generating data using SR models, we focus on utilizing the capabilities of
+state-of-the-art diffusion models for generating counterfactual data. We
+propose a guided diffusion-based counterfactual augmentation framework for
+SR. Through a combination of offline and online experiments on a real-world
+and a simulated dataset, respectively, we show that our approach performs
+significantly better than the baseline SR models and other state-of-the-art
+augmentation frameworks. More importantly, our framework shows significant
+improvement on less popular target items, achieving up to a 20% gain in
+Recall and a 13% gain in CTR on the real-world and simulated datasets,
+respectively.
+
+
+
+
+
+
+ + ☆ Application of Audio Fingerprinting Techniques for Real-Time Scalable + Speech Retrieval and Speech Clusterization + + +
+ Audio fingerprinting techniques have seen great advances in recent years, +enabling accurate and fast audio retrieval even in conditions when the queried +audio sample has been highly deteriorated or recorded in noisy conditions. +Expectedly, most of the existing work is centered around music, with popular +music identification services such as Apple's Shazam or Google's Now Playing +designed for individual audio recognition on mobile devices. However, the +spectral content of speech differs from that of music, necessitating +modifications to current audio fingerprinting approaches. This paper offers +fresh insights into adapting existing techniques to address the specialized +challenge of speech retrieval in telecommunications and cloud communications +platforms. The focus is on achieving rapid and accurate audio retrieval in +batch processing instead of facilitating single requests, typically on a +centralized server. Moreover, the paper demonstrates how this approach can be +utilized to support audio clustering based on speech transcripts without +undergoing actual speech-to-text conversion. This optimization enables +significantly faster processing without the need for GPU computing, a +requirement for real-time operation that is typically associated with +state-of-the-art speech-to-text tools. + +
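+ For readers unfamiliar with the underlying technique, below is a rough
+sketch of the classic peak-pair fingerprinting idea that such systems adapt
+to speech: pick spectrogram peaks, then hash (f1, f2, time-delta) pairs. The
+peak-picking heuristic and parameters are illustrative, not the paper's
+method.
+
+import numpy as np
+from scipy.signal import stft
+
+def fingerprint(audio, sr, fan_out=5):
+    _, _, spec = stft(audio, fs=sr, nperseg=1024)
+    mag = np.abs(spec)
+    # crude peak picking: the 3 strongest frequency bins per frame
+    peaks = [(int(f), t) for t in range(mag.shape[1])
+             for f in np.argsort(mag[:, t])[-3:]]
+    hashes = []
+    for i, (f1, t1) in enumerate(peaks):
+        for f2, t2 in peaks[i + 1:i + 1 + fan_out]:  # pair with nearby later peaks
+            hashes.append(hash((f1, f2, t2 - t1)))
+    return hashes  # retrieval = counting aligned hash collisions against an index
+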
+
+
+
+
+ + ☆ PerSRV: Personalized Sticker Retrieval with Vision-Language Model + + +
+ Instant Messaging is a popular means for daily communication, allowing users
+to send text and stickers. As the saying goes, "a picture is worth a thousand
+words", so developing an effective sticker retrieval technique is crucial for
+enhancing user experience. However, existing sticker retrieval methods rely on
+labeled data to interpret stickers, and general-purpose Vision-Language Models
+(VLMs) often struggle to capture the unique semantics of stickers.
+Additionally, relevance-based sticker retrieval methods lack personalization,
+creating a gap between diverse user expectations and retrieval results. To
+address these issues, we propose the Personalized Sticker Retrieval with
+Vision-Language Model framework, namely PerSRV, structured into offline
+calculation and online processing modules. The online retrieval part follows
+the paradigm of relevance recall and personalized ranking, supported by the
+offline pre-calculation modules: sticker semantic understanding, utility
+evaluation, and personalization. Firstly, for sticker-level semantic
+understanding, we apply supervised fine-tuning to LLaVA-1.5-7B to generate
+human-like sticker semantics, complemented by textual content extracted from
+figures and historical interaction queries. Secondly, we investigate three
+crowd-sourcing metrics for sticker utility evaluation. Thirdly, we cluster
+style centroids based on users' historical interactions to achieve personal
+preference modeling. Finally, we evaluate our proposed PerSRV method on a
+public sticker retrieval dataset from WeChat, containing 543,098 candidates
+and 12,568 interactions. Experimental results show that PerSRV significantly
+outperforms existing methods in multi-modal sticker retrieval. Additionally,
+our fine-tuned VLM delivers notable improvements in sticker semantic
+understanding.
+
+
+
+
+
+
+ + ☆ A Dual Adaptive Assignment Approach for Robust Graph-Based Clustering + + +
+ Graph clustering is an essential aspect of network analysis that involves +grouping nodes into separate clusters. Recent developments in deep learning +have resulted in advanced deep graph clustering techniques, which have proven +effective in many applications. Nonetheless, these methods often encounter +difficulties when dealing with the complexities of real-world graphs, +particularly in the presence of noisy edges. Additionally, many denoising graph +clustering strategies tend to suffer from lower performance compared to their +non-denoised counterparts, training instability, and challenges in scaling to +large datasets. To tackle these issues, we introduce a new framework called the +Dual Adaptive Assignment Approach for Robust Graph-Based Clustering (RDSA). +RDSA consists of three key components: (i) a node embedding module that +effectively integrates the graph's topological features and node attributes; +(ii) a structure-based soft assignment module that improves graph modularity by +utilizing an affinity matrix for node assignments; and (iii) a node-based soft +assignment module that identifies community landmarks and refines node +assignments to enhance the model's robustness. We assess RDSA on various +real-world datasets, demonstrating its superior performance relative to +existing state-of-the-art methods. Our findings indicate that RDSA provides +robust clustering across different graph types, excelling in clustering +effectiveness and robustness, including adaptability to noise, stability, and +scalability. + +
+
+
+
+
+ + ♻ ☆ Fairness in Ranking under Disparate Uncertainty + + +
+ Ranking is a ubiquitous method for focusing the attention of human evaluators +on a manageable subset of options. Its use as part of human decision-making +processes ranges from surfacing potentially relevant products on an e-commerce +site to prioritizing college applications for human review. While ranking can +make human evaluation more effective by focusing attention on the most +promising options, we argue that it can introduce unfairness if the uncertainty +of the underlying relevance model differs between groups of options. +Unfortunately, such disparity in uncertainty appears widespread, often to the +detriment of minority groups for which relevance estimates can have higher +uncertainty due to a lack of data or appropriate features. To address this +fairness issue, we propose Equal-Opportunity Ranking (EOR) as a new fairness +criterion for ranking and show that it corresponds to a group-wise fair lottery +among the relevant options even in the presence of disparate uncertainty. EOR +optimizes for an even cost burden on all groups, unlike the conventional +Probability Ranking Principle, and is fundamentally different from existing +notions of fairness in rankings, such as demographic parity and proportional +Rooney rule constraints that are motivated by proportional representation +relative to group size. To make EOR ranking practical, we present an efficient +algorithm for computing it in time $O(n \log(n))$ and prove its close +approximation guarantee to the globally optimal solution. In a comprehensive +empirical evaluation on synthetic data, a US Census dataset, and a real-world +audit of Amazon search queries, we find that the algorithm reliably guarantees +EOR fairness while providing effective rankings. + +
+
+ comment: Camera ready version at EAAMO'24 +
+
+
+
+
+ + ♻ ☆ Knowledge in Triples for LLMs: Enhancing Table QA Accuracy with Semantic + Extraction + + +
+ Integrating structured knowledge from tabular formats poses significant
+challenges within natural language processing (NLP), particularly when dealing
+with complex, semi-structured tables like those found in the FeTaQA dataset.
+These tables require advanced methods to interpret and generate meaningful
+responses accurately. Traditional approaches, such as SQL and SPARQL, often
+fail to fully capture the semantics of such data, especially in the presence
+of irregular table structures like web tables. This paper addresses these
+challenges by proposing a novel approach that extracts triples directly from
+tabular data and integrates them with a retrieval-augmented generation (RAG)
+model to enhance the accuracy, coherence, and contextual richness of responses
+generated by a fine-tuned GPT-3.5-turbo-0125 model. Our approach significantly
+outperforms existing baselines on the FeTaQA dataset, particularly excelling
+in Sacre-BLEU and ROUGE metrics. It effectively generates contextually
+accurate and detailed long-form answers from tables, showcasing its strength
+in complex data interpretation.
+
+
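+ As an illustration of the extraction step, the sketch below flattens a table
+into (subject, predicate, object) triples using a row-key heuristic; this is
+an assumption for illustration, not the paper's exact procedure.
+
+def table_to_triples(header, rows, key_col=0):
+    triples = []
+    for row in rows:
+        subject = row[key_col]
+        for col, value in enumerate(row):
+            if col != key_col and value:
+                triples.append((subject, header[col], value))
+    return triples
+
+header = ["Country", "Capital", "Population"]
+rows = [["France", "Paris", "68M"], ["Japan", "Tokyo", "125M"]]
+# -> [('France', 'Capital', 'Paris'), ('France', 'Population', '68M'), ...]
+print(table_to_triples(header, rows))
+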
+
+ comment: We are withdrawing this paper to address foundational aspects that + are critical for ensuring its accuracy and integrity before any potential + resubmission +
+
+
+
+
+ + ♻ ☆ Context Embeddings for Efficient Answer Generation in RAG + + +
+ Retrieval-Augmented Generation (RAG) overcomes the limited knowledge of LLMs
+by extending the input with external information. As a consequence, the
+contextual inputs to the model become much longer, which slows down decoding
+and directly translates into the time a user has to wait for an answer. We
+address this challenge by presenting COCOM, an effective context compression
+method that reduces long contexts to only a handful of Context Embeddings,
+speeding up the generation time by a large margin. Our method allows for
+different compression rates, trading off decoding time for answer quality.
+Compared to earlier methods, COCOM allows for handling multiple contexts more
+effectively, significantly reducing decoding time for long inputs. Our method
+demonstrates a speed-up of up to 5.69 $\times$ while achieving higher
+performance compared to existing efficient context compression methods.
+
+
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Agentic Information Retrieval + + +
+ What will information entry look like in the next generation of digital +products? Since the 1970s, user access to relevant information has relied on +domain-specific architectures of information retrieval (IR). Over the past two +decades, the advent of modern IR systems, including web search engines and +personalized recommender systems, has greatly improved the efficiency of +retrieving relevant information from vast data corpora. However, the core +paradigm of these IR systems remains largely unchanged, relying on filtering a +predefined set of candidate items. Since 2022, breakthroughs in large language +models (LLMs) have begun transforming how information is accessed, establishing +a new technical paradigm. In this position paper, we introduce Agentic +Information Retrieval (Agentic IR), a novel IR paradigm shaped by the +capabilities of LLM agents. Agentic IR expands the scope of accessible tasks +and leverages a suite of new techniques to redefine information retrieval. We +discuss three types of cutting-edge applications of agentic IR and the +challenges faced. We propose that agentic IR holds promise for generating +innovative applications, potentially becoming a central information entry point +in future digital ecosystems. + +
+
+ comment: 11 pages, position paper +
+
+
+
+
+ + ♻ ☆ Building a Scalable, Effective, and Steerable Search and Ranking + Platform + + +
+ Modern e-commerce platforms offer vast product selections, making it
+difficult for customers to find items that they like and that are relevant to
+their current session intent. This is why it is key for e-commerce platforms
+to have near real-time, scalable, and adaptable personalized ranking and
+search systems. While numerous methods exist in the scientific literature for
+building such systems, many are unsuitable for large-scale industrial use due
+to complexity and performance limitations. Consequently, industrial ranking
+systems often resort to computationally efficient yet simplistic retrieval or
+candidate generation approaches that overlook near real-time and
+heterogeneous customer signals, resulting in a less personalized and relevant
+experience. Moreover, related customer experiences are served by completely
+different systems, which increases complexity and maintenance costs and leads
+to inconsistent experiences.
+ In this paper, we present a personalized, adaptable near real-time ranking
+platform that is reusable across various use cases, such as browsing and
+search, and that is able to cater to millions of items and customers under
+heavy load (thousands of requests per second). We employ transformer-based
+models through different ranking layers which can learn complex behavior
+patterns directly from customer action sequences while being able to
+incorporate temporal (e.g. in-session) and contextual information. We validate
+our system through a series of comprehensive offline and online real-world
+experiments at a large online e-commerce platform, and we demonstrate its
+superiority when compared to existing systems, both in terms of customer
+experience as well as net revenue. Finally, we share the lessons learned from
+building a comprehensive, modern ranking platform for use in a large-scale
+e-commerce environment.
+
+
+
+
+
+
+ + ♻ ☆ USimAgent: Large Language Models for Simulating Search Users + + +
+ Due to its advantages in cost-efficiency and reproducibility, user
+simulation has become a promising solution for the user-centric evaluation of
+information retrieval systems. Nonetheless, accurately simulating user search
+behaviors has long been a challenge, because users' actions in search are
+highly complex and driven by intricate cognitive processes such as learning,
+reasoning, and planning. Recently, Large Language Models (LLMs) have
+demonstrated remarkable potential in simulating human-level intelligence and
+have been used in building autonomous agents for various tasks. However, the
+potential of using LLMs in simulating search behaviors has not yet been fully
+explored. In this paper, we introduce an LLM-based user search behavior
+simulator, USimAgent. The proposed simulator can simulate users' querying,
+clicking, and stopping behaviors during search, and is thus capable of
+generating complete search sessions for specific search tasks. Empirical
+investigation on a real user behavior dataset shows that the proposed
+simulator outperforms existing methods in query generation and is comparable
+to traditional methods in predicting user clicks and stopping behaviors.
+These results not only validate the effectiveness of using LLMs for user
+simulation but also shed light on the development of more robust and generic
+user simulators. The code and data are accessible at
+https://github.com/Meow-E/USimAgent.
+
+
+
+
+
+
+ + ♻ ☆ Evaluating Performance and Bias of Negative Sampling in Large-Scale + Sequential Recommendation Models RecSys + + +
+ Large-scale industrial recommendation models predict the most relevant items +from catalogs containing millions or billions of options. To train these models +efficiently, a small set of irrelevant items (negative samples) is selected +from the vast catalog for each relevant item (positive example), helping the +model distinguish between relevant and irrelevant items. Choosing the right +negative sampling method is a common challenge. We address this by implementing +and comparing various negative sampling methods - random, popularity-based, +in-batch, mixed, adaptive, and adaptive with mixed variants - on modern +sequential recommendation models. Our experiments, including hyperparameter +optimization and 20x repeats on three benchmark datasets with varying +popularity biases, show how the choice of method and dataset characteristics +impact key model performance metrics. We also reveal that average performance +metrics often hide imbalances across popularity bands (head, mid, tail). We +find that commonly used random negative sampling reinforces popularity bias and +performs best for head items. Popularity-based methods (in-batch and global +popularity negative sampling) can offer balanced performance at the cost of +lower overall model performance results. Our study serves as a practical guide +to the trade-offs in selecting a negative sampling method for large-scale +sequential recommendation models. Code, datasets, experimental results and +hyperparameters are available at: +https://github.com/apple/ml-negative-sampling. + +
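+ Two of the compared strategies are simple enough to sketch directly: uniform
+random negatives and popularity-weighted negatives. The snippet below is an
+illustrative sketch, not the released code.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def random_negatives(n_items, positives, k):
+    negs = rng.integers(0, n_items, size=k)
+    return negs[~np.isin(negs, positives)]   # drop accidental positives
+
+def popularity_negatives(item_counts, positives, k):
+    p = item_counts / item_counts.sum()      # more popular items sampled more often
+    negs = rng.choice(len(item_counts), size=k, p=p)
+    return negs[~np.isin(negs, positives)]
+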
+
+ comment: Workshop for Large Recommender Systems (LargeRecSys), 18th ACM + Conference on Recommender Systems, 2024, Bari, Italy +
+
+
+
+
+ + ♻ ☆ Deep Group Interest Modeling of Full Lifelong User Behaviors for CTR + Prediction + + +
+ Extracting users' interests from their lifelong behavior sequence is crucial +for predicting Click-Through Rate (CTR). Most current methods employ a +two-stage process for efficiency: they first select historical behaviors +related to the candidate item and then deduce the user's interest from this +narrowed-down behavior sub-sequence. This two-stage paradigm, though effective, +leads to information loss. Solely using users' lifelong click behaviors doesn't +provide a complete picture of their interests, leading to suboptimal +performance. In our research, we introduce the Deep Group Interest Network +(DGIN), an end-to-end method to model the user's entire behavior history. This +includes all post-registration actions, such as clicks, cart additions, +purchases, and more, providing a nuanced user understanding. We start by +grouping the full range of behaviors using a relevant key (like item_id) to +enhance efficiency. This process reduces the behavior length significantly, +from O(10^4) to O(10^2). To mitigate the potential loss of information due to +grouping, we incorporate two categories of group attributes. Within each group, +we calculate statistical information on various heterogeneous behaviors (like +behavior counts) and employ self-attention mechanisms to highlight unique +behavior characteristics (like behavior type). Based on this reorganized +behavior data, the user's interests are derived using the Transformer +technique. Additionally, we identify a subset of behaviors that share the same +item_id with the candidate item from the lifelong behavior sequence. The +insights from this subset reveal the user's decision-making process related to +the candidate item, improving prediction accuracy. Our comprehensive +evaluation, both on industrial and public datasets, validates DGIN's efficacy +and efficiency. + +
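+ The grouping step that shrinks the sequence from O(10^4) to O(10^2) can be
+sketched as a simple reduce over the lifelong log; field names here are
+illustrative, not DGIN's implementation.
+
+from collections import defaultdict
+
+def group_behaviors(events):
+    # events: [(item_id, behavior_type), ...] over the full lifelong sequence
+    groups = defaultdict(lambda: defaultdict(int))
+    for item_id, behavior in events:
+        groups[item_id][behavior] += 1       # per-group statistics on behaviors
+    return {item: dict(counts) for item, counts in groups.items()}
+
+events = [(42, "click"), (42, "cart"), (7, "click"), (42, "purchase")]
+# -> {42: {'click': 1, 'cart': 1, 'purchase': 1}, 7: {'click': 1}}
+print(group_behaviors(events))
+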
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Multimodal Semantic Communication for Generative Audio-Driven Video + Conferencing + + +
+ This paper studies an efficient multimodal data communication scheme for +video conferencing. In our considered system, a speaker gives a talk to the +audiences, with talking head video and audio being transmitted. Since the +speaker does not frequently change posture and high-fidelity transmission of +audio (speech and music) is required, redundant visual video data exists and +can be removed by generating the video from the audio. To this end, we propose +a wave-to-video (Wav2Vid) system, an efficient video transmission framework +that reduces transmitted data by generating talking head video from audio. In +particular, full-duration audio and short-duration video data are synchronously +transmitted through a wireless channel, with neural networks (NNs) extracting +and encoding audio and video semantics. The receiver then combines the decoded +audio and video data, as well as uses a generative adversarial network (GAN) +based model to generate the lip movement videos of the speaker. Simulation +results show that the proposed Wav2Vid system can reduce the amount of +transmitted data by up to 83% while maintaining the perceptual quality of the +generated conferencing video. + +
+
+ comment: accepted by IEEE Wireless Communications Letters +
+
+
+
+
+ + ☆ CHORDONOMICON: A Dataset of 666,000 Songs and their Chord Progressions + + +
+ Chord progressions encapsulate important information about music, pertaining
+to its structure and conveyed emotions. They serve as the backbone of musical
+composition, and in many cases, they are the sole information required for a
+musician to play along and follow the music. Despite their importance, chord
+progressions as a data domain remain underexplored. There is a lack of
+large-scale datasets suitable for deep learning applications, and limited
+research exploring chord progressions as an input modality. In this work, we
+present Chordonomicon, a dataset of over 666,000 songs and their chord
+progressions, annotated with structural parts, genre, and release date,
+created by scraping various sources of user-generated progressions and
+associated metadata. We demonstrate the practical utility of the Chordonomicon
+dataset for classification and generation tasks, and discuss its potential to
+provide valuable insights to the research community. Chord progressions are
+unique in their ability to be represented in multiple formats (e.g., text or
+graph) and in the wealth of information chords convey in given contexts, such
+as their harmonic function. These characteristics make the Chordonomicon an
+ideal testbed for exploring advanced machine learning techniques, including
+transformers, graph machine learning, and hybrid systems that combine
+knowledge representation and machine learning.
+
+
+
+
+
+
+ + ♻ ☆ Enhancing Learned Image Compression via Cross Window-based Attention + + +
+ In recent years, learned image compression methods have demonstrated superior +rate-distortion performance compared to traditional image compression methods. +Recent methods utilize convolutional neural networks (CNN), variational +autoencoders (VAE), invertible neural networks (INN), and transformers. Despite +their significant contributions, a main drawback of these models is their poor +performance in capturing local redundancy. Therefore, to leverage global +features along with local redundancy, we propose a CNN-based solution +integrated with a feature encoding module. The feature encoding module encodes +important features before feeding them to the CNN and then utilizes cross-scale +window-based attention, which further captures local redundancy. Cross-scale +window-based attention is inspired by the attention mechanism in transformers +and effectively enlarges the receptive field. Both the feature encoding module +and the cross-scale window-based attention module in our architecture are +flexible and can be incorporated into any other network architecture. We +evaluate our method on the Kodak and CLIC datasets and demonstrate that our +approach is effective and on par with state-of-the-art methods. + +
+
+ comment: Paper accepted and presented in ISVC'24. Copyrights stay with ISVC +
+
+
+
+
+ + ♻ ☆ Structured Multi-Track Accompaniment Arrangement via Style Prior + Modelling NeurIPS 2024 + + +
+ In the realm of music AI, arranging rich and structured multi-track +accompaniments from a simple lead sheet presents significant challenges. Such +challenges include maintaining track cohesion, ensuring long-term coherence, +and optimizing computational efficiency. In this paper, we introduce a novel +system that leverages prior modelling over disentangled style factors to +address these challenges. Our method presents a two-stage process: initially, a +piano arrangement is derived from the lead sheet by retrieving piano texture +styles; subsequently, a multi-track orchestration is generated by infusing +orchestral function styles into the piano arrangement. Our key design is the +use of vector quantization and a unique multi-stream Transformer to model the +long-term flow of the orchestration style, which enables flexible, +controllable, and structured music generation. Experiments show that by +factorizing the arrangement task into interpretable sub-stages, our approach +enhances generative capacity while improving efficiency. Additionally, our +system supports a variety of music genres and provides style control at +different composition hierarchies. We further show that our system achieves +superior coherence, structure, and overall arrangement quality compared to +existing baselines. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Unveiling Encoder-Free Vision-Language Models NeurIPS2024 + + +
+ Existing vision-language models (VLMs) mostly rely on vision encoders to +extract visual features followed by large language models (LLMs) for +visual-language tasks. However, the vision encoders set a strong inductive bias +in abstracting visual representation, e.g., resolution, aspect ratio, and +semantic priors, which could impede the flexibility and efficiency of the VLMs. +Training pure VLMs that accept the seamless vision and language inputs, i.e., +without vision encoders, remains challenging and rarely explored. Empirical +observations reveal that direct training without encoders results in slow +convergence and large performance gaps. In this work, we bridge the gap between +encoder-based and encoder-free models, and present a simple yet effective +training recipe towards pure VLMs. Specifically, we unveil the key aspects of +training encoder-free VLMs efficiently via thorough experiments: (1) Bridging +vision-language representation inside one unified decoder; (2) Enhancing visual +recognition capability via extra supervision. With these strategies, we launch +EVE, an encoder-free vision-language model that can be trained and forwarded +efficiently. Notably, solely utilizing 35M publicly accessible data, EVE can +impressively rival the encoder-based VLMs of similar capacities across multiple +vision-language benchmarks. It significantly outperforms the counterpart +Fuyu-8B with mysterious training procedures and undisclosed training data. We +believe that EVE provides a transparent and efficient route for developing a +pure decoder-only architecture across modalities. Our code and models are +publicly available at: https://github.com/baaivision/EVE. + +
+
+ comment: 17 pages, 8 figures, Accepted by NeurIPS2024 (spotlight) +
+
+
+
+
+ + ♻ ☆ Document Parsing Unveiled: Techniques, Challenges, and Prospects for + Structured Information Extraction + + +
+ Document parsing is essential for converting unstructured and
+semi-structured documents, such as contracts, academic papers, and invoices,
+into structured, machine-readable data. Document parsing extracts reliable
+structured data from unstructured inputs, providing significant convenience
+for numerous applications. Especially with recent achievements in Large
+Language Models, document parsing plays an indispensable role in both
+knowledge base construction and training data generation. This survey
+presents a comprehensive review of the current state of document parsing,
+covering key methodologies, from modular pipeline systems to end-to-end
+models driven by large vision-language models. Core components such as layout
+detection, content extraction (including text, tables, and mathematical
+expressions), and multi-modal data integration are examined in detail.
+Additionally, this paper discusses the challenges faced by modular document
+parsing systems and vision-language models in handling complex layouts,
+integrating multiple modules, and recognizing high-density text. It
+emphasizes the importance of developing larger and more diverse datasets and
+outlines future research directions.
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 24 + +
+
+
+ + ☆ Semantic Search Evaluation CIKM 2024 + + +
+ We propose a novel method for evaluating the performance of a content search
+system that measures the semantic match between a query and the results
+returned by the search system. We introduce a metric called "on-topic rate"
+to measure the percentage of results that are relevant to the query. To
+achieve this, we design a pipeline that defines a golden query set, retrieves
+the top K results for each query, and sends calls to GPT-3.5 with formulated
+prompts. Our semantic evaluation pipeline helps identify common failure
+patterns and set goals against the metric for relevance improvements.
+
+
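+ The pipeline reduces to a short loop; the sketch below assumes hypothetical
+`search` and `llm_judge` callables standing in for the system under test and
+the GPT-3.5 call, respectively.
+
+def on_topic_rate(golden_queries, search, llm_judge, k=10):
+    judged = relevant = 0
+    for q in golden_queries:
+        for result in search(q, top_k=k):
+            judged += 1
+            # prompt formulated as a yes/no relevance question
+            if llm_judge(f"Is this result on-topic for '{q}'?\n{result}") == "yes":
+                relevant += 1
+    return relevant / judged if judged else 0.0
+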
+
+ comment: Accepted by 3rd International Workshop on Industrial Recommendation + Systems (at CIKM 2024) +
+
+
+
+
+ + ☆ Can Users Detect Biases or Factual Errors in Generated Responses in + Conversational Information-Seeking? SIGIR + + +
+ Information-seeking dialogues span a wide range of questions, from simple +factoid to complex queries that require exploring multiple facets and +viewpoints. When performing exploratory searches in unfamiliar domains, users +may lack background knowledge and struggle to verify the system-provided +information, making them vulnerable to misinformation. We investigate the +limitations of response generation in conversational information-seeking +systems, highlighting potential inaccuracies, pitfalls, and biases in the +responses. The study addresses the problem of query answerability and the +challenge of response incompleteness. Our user studies explore how these issues +impact user experience, focusing on users' ability to identify biased, +incorrect, or incomplete responses. We design two crowdsourcing tasks to assess +user experience with different system response variants, highlighting critical +issues to be addressed in future conversational information-seeking research. +Our analysis reveals that it is easier for users to detect response +incompleteness than query answerability and user satisfaction is mostly +associated with response diversity, not factual correctness. + +
+
+ comment: Extended version of the paper that appeared in the Proceedings of the + 2024 Annual International ACM SIGIR Conference on Research and Development in + Information Retrieval in the Asia Pacific Region (SIGIR-AP '24) +
+
+
+
+
+ + ☆ Enhancing CTR Prediction in Recommendation Domain with Search Query + Representation CIKM 2024 + + +
+ Many platforms, such as e-commerce websites, offer both search and +recommendation services simultaneously to better meet users' diverse needs. +Recommendation services suggest items based on user preferences, while search +services allow users to search for items before providing recommendations. +Since users and items are often shared between the search and recommendation +domains, there is a valuable opportunity to enhance the recommendation domain +by leveraging user preferences extracted from the search domain. Existing +approaches either overlook the shift in user intention between these domains or +fail to capture the significant impact of learning from users' search queries +on understanding their interests. + In this paper, we propose a framework that learns from user search query +embeddings within the context of user preferences in the recommendation domain. +Specifically, user search query sequences from the search domain are used to +predict the items users will click at the next time point in the recommendation +domain. Additionally, the relationship between queries and items is explored +through contrastive learning. To address issues of data sparsity, the diffusion +model is incorporated to infer positive items the user will select after +searching with certain queries in a denoising manner, which is particularly +effective in preventing false positives. Effectively extracting this +information, the queries are integrated into click-through rate prediction in +the recommendation domain. Experimental analysis demonstrates that our model +outperforms state-of-the-art models in the recommendation domain. + +
+
+ comment: Accepted by CIKM 2024 Full Research Track +
+
+
+
+
+ + ☆ A Systematic Review of Machine Learning in Sports Betting: Techniques, + Challenges, and Future Directions + + +
+ The sports betting industry has experienced rapid growth, driven largely by +technological advancements and the proliferation of online platforms. Machine +learning (ML) has played a pivotal role in the transformation of this sector by +enabling more accurate predictions, dynamic odds-setting, and enhanced risk +management for both bookmakers and bettors. This systematic review explores +various ML techniques, including support vector machines, random forests, and +neural networks, as applied in different sports such as soccer, basketball, +tennis, and cricket. These models utilize historical data, in-game statistics, +and real-time information to optimize betting strategies and identify value +bets, ultimately improving profitability. For bookmakers, ML facilitates +dynamic odds adjustment and effective risk management, while bettors leverage +data-driven insights to exploit market inefficiencies. This review also +underscores the role of ML in fraud detection, where anomaly detection models +are used to identify suspicious betting patterns. Despite these advancements, +challenges such as data quality, real-time decision-making, and the inherent +unpredictability of sports outcomes remain. Ethical concerns related to +transparency and fairness are also of significant importance. Future research +should focus on developing adaptive models that integrate multimodal data and +manage risk in a manner akin to financial portfolios. This review provides a +comprehensive examination of the current applications of ML in sports betting, +and highlights both the potential and the limitations of these technologies. + +
+
+
+
+
+ + ☆ Zero-Shot Dense Retrieval with Embeddings from Relevance Feedback + + +
+ Building effective dense retrieval systems remains difficult when relevance +supervision is not available. Recent work has looked to overcome this challenge +by using a Large Language Model (LLM) to generate hypothetical documents that +can be used to find the closest real document. However, this approach relies +solely on the LLM to have domain-specific knowledge relevant to the query, +which may not be practical. Furthermore, generating hypothetical documents can +be inefficient as it requires the LLM to generate a large number of tokens for +each query. To address these challenges, we introduce Real Document Embeddings +from Relevance Feedback (ReDE-RF). Inspired by relevance feedback, ReDE-RF +proposes to re-frame hypothetical document generation as a relevance estimation +task, using an LLM to select which documents should be used for nearest +neighbor search. Through this re-framing, the LLM no longer needs +domain-specific knowledge but only needs to judge what is relevant. +Additionally, relevance estimation only requires the LLM to output a single +token, thereby improving search latency. Our experiments show that ReDE-RF +consistently surpasses state-of-the-art zero-shot dense retrieval methods +across a wide range of low-resource retrieval datasets while also making +significant improvements in latency per-query. + +
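+ Reading the abstract operationally, the feedback step can be sketched as:
+run a cheap first pass, ask the LLM for single-token relevance judgments,
+then average the embeddings of the documents judged relevant and search
+again. `first_pass`, `embed`, and `llm_relevant` below are hypothetical
+components, not the paper's API.
+
+import numpy as np
+
+def feedback_query_vector(query, first_pass, embed, llm_relevant, k=20):
+    candidates = first_pass(query, top_k=k)
+    feedback = [d for d in candidates if llm_relevant(query, d)]  # one-token judgment
+    if not feedback:
+        return embed(query)                   # fall back to the raw query embedding
+    return np.mean([embed(d) for d in feedback], axis=0)
+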
+
+
+
+
+ + ☆ Vision Search Assistant: Empower Vision-Language Models as Multimodal + Search Engines + + +
+ Search engines enable the retrieval of unknown information with texts. +However, traditional methods fall short when it comes to understanding +unfamiliar visual content, such as identifying an object that the model has +never seen before. This challenge is particularly pronounced for large +vision-language models (VLMs): if the model has not been exposed to the object +depicted in an image, it struggles to generate reliable answers to the user's +question regarding that image. Moreover, as new objects and events continuously +emerge, frequently updating VLMs is impractical due to heavy computational +burdens. To address this limitation, we propose Vision Search Assistant, a +novel framework that facilitates collaboration between VLMs and web agents. +This approach leverages VLMs' visual understanding capabilities and web agents' +real-time information access to perform open-world Retrieval-Augmented +Generation via the web. By integrating visual and textual representations +through this collaboration, the model can provide informed responses even when +the image is novel to the system. Extensive experiments conducted on both +open-set and closed-set QA benchmarks demonstrate that the Vision Search +Assistant significantly outperforms the other models and can be widely applied +to existing VLMs. + +
+
+ comment: Code is available at https://github.com/cnzzx/VSA +
+
+
+
+
+ + ☆ Pay Attention to Attention for Sequential Recommendation RecSys 2024 + + +
+ Transformer-based approaches have demonstrated remarkable success in various +sequence-based tasks. However, traditional self-attention models may not +sufficiently capture the intricate dependencies within items in sequential +recommendation scenarios. This is due to the lack of explicit emphasis on +attention weights, which play a critical role in allocating attention and +understanding item-to-item correlations. To better exploit the potential of +attention weights and improve the capability of sequential recommendation in +learning high-order dependencies, we propose a novel sequential recommendation +(SR) approach called attention weight refinement (AWRSR). AWRSR enhances the +effectiveness of self-attention by additionally paying attention to attention +weights, allowing for more refined attention distributions of correlations +among items. We conduct comprehensive experiments on multiple real-world +datasets, demonstrating that our approach consistently outperforms +state-of-the-art SR models. Moreover, we provide a thorough analysis of AWRSR's +effectiveness in capturing higher-level dependencies. These findings suggest +that AWRSR offers a promising new direction for enhancing the performance of +self-attention architecture in SR tasks, with potential applications in other +sequence-based problems as well. + +
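+ One schematic reading of "paying attention to attention" is a second
+attention pass over the rows of the weight matrix itself, as sketched below;
+this is an interpretation for illustration and may not match AWRSR's exact
+formulation.
+
+import torch.nn.functional as F
+
+def refined_attention(q, k, v, wq, wk):
+    # q, k, v: (seq, d); wq, wk: (seq, d) projections applied to weight rows
+    d = q.size(-1)
+    a = F.softmax(q @ k.transpose(-2, -1) / d ** 0.5, dim=-1)        # (seq, seq)
+    q2, k2 = a @ wq, a @ wk                  # treat each weight row as a token
+    a2 = F.softmax(q2 @ k2.transpose(-2, -1) / wq.size(-1) ** 0.5, dim=-1)
+    return (a2 @ a) @ v                      # refined weights mix the values
+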
+
+ comment: Accepted at RecSys 2024 +
+
+
+
+
+ + ☆ Simultaneous Unlearning of Multiple Protected User Attributes From + Variational Autoencoder Recommenders Using Adversarial Training + + +
+ In widely used neural network-based collaborative filtering models, users' +history logs are encoded into latent embeddings that represent the users' +preferences. In this setting, the models are capable of mapping users' +protected attributes (e.g., gender or ethnicity) from these user embeddings +even without explicit access to them, resulting in models that may treat +specific demographic user groups unfairly and raise privacy issues. While prior +work has approached the removal of a single protected attribute of a user at a +time, multiple attributes might come into play in real-world scenarios. In the +work at hand, we present AdvXMultVAE which aims to unlearn multiple protected +attributes (exemplified by gender and age) simultaneously to improve fairness +across demographic user groups. For this purpose, we couple a variational +autoencoder (VAE) architecture with adversarial training (AdvMultVAE) to +support simultaneous removal of the users' protected attributes with continuous +and/or categorical values. Our experiments on two datasets, LFM-2b-100k and +Ml-1m, from the music and movie domains, respectively, show that our approach +can yield better results than its singular removal counterparts (based on +AdvMultVAE) in effectively mitigating demographic biases whilst improving the +anonymity of latent embeddings. + +
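+ The adversarial mechanism can be sketched with a gradient-reversal layer:
+adversary heads try to predict the protected attributes from the latent code,
+while reversed gradients push the encoder to discard them. This is a
+schematic sketch covering one categorical and one continuous attribute, not
+the authors' implementation.
+
+import torch
+import torch.nn.functional as F
+
+class GradReverse(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x):
+        return x.view_as(x)
+    @staticmethod
+    def backward(ctx, grad):
+        return -grad                          # flip gradients into the encoder
+
+def adversarial_loss(z, gender_head, age_head, gender, age):
+    zr = GradReverse.apply(z)
+    loss_gender = F.cross_entropy(gender_head(zr), gender)   # categorical attribute
+    loss_age = F.mse_loss(age_head(zr).squeeze(-1), age)     # continuous attribute
+    return loss_gender + loss_age             # added to the VAE objective
+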
+
+
+
+
+ + ☆ Challenges in Implementing a Recommender System for Historical Research + in the Humanities RecSys 2024 + + +
+ This extended abstract describes the challenges in implementing recommender +systems for digital archives in the humanities, focusing on Monasterium.net, a +platform for historical legal documents. We discuss three key aspects: (i) the +unique characteristics of so-called charters as items for recommendation, (ii) +the complex multi-stakeholder environment, and (iii) the distinct +information-seeking behavior of scholars in the humanities. By examining these +factors, we aim to contribute to the development of more effective and tailored +recommender systems for (digital) humanities research. + +
+
+ comment: Presented at AltRecSys 2024: The First Workshop on Alternative, + Unexpected, and Critical Ideas in Recommendation, October 18, 2024, + co-located with the ACM Conference on Recommender Systems 2024 (RecSys 2024), + Bari, Italy +
+
+
+
+
+ + ☆ RecFlow: An Industrial Full Flow Recommendation Dataset + + +
+ Industrial recommendation systems (RS) rely on the multi-stage pipeline to
+balance effectiveness and efficiency when delivering items from a vast corpus
+to users. Existing RS benchmark datasets primarily focus on the exposure
+space, where novel RS algorithms are trained and evaluated. However, when
+these algorithms transition to real-world industrial RS, they face the
+critical challenge of handling unexposed items, which form a significantly
+larger space than the exposed one. This discrepancy profoundly impacts their
+practical performance. Additionally, these algorithms often overlook the
+intricate interplay between multiple RS stages, resulting in suboptimal
+overall system performance. To address this issue, we introduce RecFlow, an
+industrial full-flow recommendation dataset designed to bridge the gap
+between offline RS benchmarks and the real online environment. Unlike
+existing datasets, RecFlow includes samples not only from the exposure space
+but also from unexposed items filtered at each stage of the RS funnel. Our
+dataset comprises 38M interactions from 42K users across nearly 9M items,
+with an additional 1.9B stage samples collected from 9.3M online requests
+over 37 days, spanning 6 stages. Leveraging the RecFlow dataset, we conduct
+exploratory experiments showcasing its potential for designing new algorithms
+that enhance effectiveness by incorporating stage-specific samples. Some of
+these algorithms have already been deployed online, consistently yielding
+significant gains. We propose RecFlow as the first comprehensive benchmark
+dataset for the RS community, supporting research on designing algorithms at
+any stage, the study of selection bias, debiased algorithms, multi-stage
+consistency and optimality, multi-task recommendation, and user behavior
+modeling. The RecFlow dataset, along with the corresponding source code, is
+available at https://github.com/RecFlow-ICLR/RecFlow.
+
+
+
+
+
+
+ + ☆ Leveraging AI and Sentiment Analysis for Forecasting Election Outcomes + in Mauritius + + +
+ This study explores the use of AI-driven sentiment analysis as a novel tool
+for forecasting election outcomes, focusing on Mauritius' 2024 elections. In
+the absence of reliable polling data, we analyze media sentiment toward the
+two main political parties, L'Alliance Lepep and L'Alliance Du Changement, by
+classifying news articles from prominent Mauritian media outlets as positive,
+negative, or neutral. We employ a multilingual BERT-based model and a custom
+Sentiment Scoring Algorithm to quantify sentiment dynamics and apply the
+Sentiment Impact Score (SIS) to measure sentiment influence over time. Our
+forecast model suggests L'Alliance Du Changement is likely to secure a
+minimum of 37 seats, while L'Alliance Lepep is predicted to obtain the
+remaining 23 seats out of the 60 available. Findings indicate that positive
+media sentiment strongly correlates with projected electoral gains,
+underscoring the role of media in shaping public perception. This approach
+not only mitigates media bias through adjusted scoring but also serves as a
+reliable alternative to traditional polling. The study offers a scalable
+methodology for political forecasting in regions with limited polling
+infrastructure and contributes to advancements in the field of political data
+science.
+
+
+
+
+
+
+ + ☆ Temporal Streaming Batch Principal Component Analysis for Time Series + Classification + + +
+ In multivariate time series classification, although current sequence +analysis models have excellent classification capabilities, they show +significant shortcomings when dealing with long sequence multivariate data, +such as prolonged training times and decreased accuracy. This paper focuses on +optimizing model performance for long-sequence multivariate data by mitigating +the impact of extended time series and multiple variables on the model. We +propose a principal component analysis (PCA)-based temporal streaming +compression and dimensionality reduction algorithm for time series data +(temporal streaming batch PCA, TSBPCA), which continuously updates the compact +representation of the entire sequence through streaming PCA time estimation +with time block updates, enhancing the data representation capability of a +range of sequence analysis models. We evaluated this method using various +models on five real datasets, and the experimental results show that our method +performs well in terms of classification accuracy and time efficiency. Notably, +our method demonstrates a trend of increasing effectiveness as sequence length +grows; on the two longest sequence datasets, accuracy improved by about 7.2%, +and execution time decreased by 49.5%. + +
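+ A minimal stand-in for block-wise streaming compression can be built from
+scikit-learn's IncrementalPCA, as below; the block size and dimensionality
+are illustrative, and this is not the TSBPCA algorithm itself.
+
+import numpy as np
+from sklearn.decomposition import IncrementalPCA
+
+def compress_stream(series, block=256, n_components=8):
+    # series: (T, n_vars) long multivariate time series
+    ipca = IncrementalPCA(n_components=n_components)
+    for start in range(0, len(series) - block + 1, block):
+        ipca.partial_fit(series[start:start + block])  # update per time block
+    return ipca.transform(series)             # compact (T, n_components) view
+
+x = np.random.randn(4096, 32)
+print(compress_stream(x).shape)               # (4096, 8)
+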
+
+
+
+
+ + ☆ Beyond Positive History: Re-ranking with List-level Hybrid Feedback + + +
+ As the last stage of recommender systems, re-ranking generates a re-ordered +list that aligns with the user's preference. However, previous works generally +focus on item-level positive feedback as history (e.g., only clicked items) and +ignore that users provide positive or negative feedback on items in the entire +list. This list-level hybrid feedback can reveal users' holistic preferences +and reflect users' comparison behavior patterns manifesting within a list. Such +patterns could predict user behaviors on candidate lists, thus aiding better +re-ranking. Despite appealing benefits, extracting and integrating preferences +and behavior patterns from list-level hybrid feedback into re-ranking multiple +items remains challenging. To this end, we propose Re-ranking with List-level +Hybrid Feedback (dubbed RELIFE). It captures user's preferences and behavior +patterns with three modules: a Disentangled Interest Miner to disentangle the +user's preferences into interests and disinterests, a Sequential Preference +Mixer to learn users' entangled preferences considering the context of +feedback, and a Comparison-aware Pattern Extractor to capture user's behavior +patterns within each list. Moreover, for better integration of patterns, +contrastive learning is adopted to align the behavior patterns of candidate and +historical lists. Extensive experiments show that RELIFE significantly +outperforms SOTA re-ranking baselines. + +
+
+
+
+
+ + ☆ GPRec: Bi-level User Modeling for Deep Recommenders + + +
+ GPRec explicitly categorizes users into groups in a learnable manner and +aligns them with corresponding group embeddings. We design the dual group +embedding space to offer a diverse perspective on group preferences by +contrasting positive and negative patterns. On the individual level, GPRec +identifies personal preferences from ID-like features and refines the obtained +individual representations to be independent of group ones, thereby providing a +robust complement to the group-level modeling. We also present various +strategies for the flexible integration of GPRec into various DRS models. +Rigorous testing of GPRec on three public datasets has demonstrated significant +improvements in recommendation quality. + +
+
+
+
+
+ + ☆ GenUP: Generative User Profilers as In-Context Learners for Next POI + Recommender Systems + + +
+ Traditional POI recommendation systems often lack transparency, +interpretability, and scrutability due to their reliance on dense vector-based +user embeddings. Furthermore, the cold-start problem -- where systems have +insufficient data for new users -- limits their ability to generate accurate +recommendations. Existing methods often address this by leveraging similar +trajectories from other users, but this approach can be computationally +expensive and increases the context length for LLM-based methods, making them +difficult to scale. To address these limitations, we propose a method that +generates natural language (NL) user profiles from large-scale, location-based +social network (LBSN) check-ins, utilizing robust personality assessments and +behavioral theories. These NL profiles capture user preferences, routines, and +behaviors, improving POI prediction accuracy while offering enhanced +transparency. By incorporating NL profiles as system prompts to LLMs, our +approach reduces reliance on extensive historical data, while remaining +flexible, easily updated, and computationally efficient. Our method is not only +competitive with other LLM-based and complex agentic frameworks but is also +more scalable for real-world scenarios and on-device POI recommendations. +Results demonstrate that our approach consistently outperforms baseline +methods, offering a more interpretable and resource-efficient solution for POI +recommendation systems. Our source code is available at: +\url{https://github.com/w11wo/GenUP}. + +
+
+
+
+
+ + ☆ Collaborative Knowledge Fusion: A Novel Approach for Multi-task + Recommender Systems via LLMs + + +
+ Owing to the impressive general intelligence of large language models
+(LLMs), there has been a growing trend to integrate them into recommender
+systems to gain a more profound insight into human interests and intentions.
+Existing LLM-based recommender systems primarily leverage item attributes and
+user interaction histories in textual format, improving single tasks such as
+rating prediction or explainable recommendation. Nevertheless, these
+approaches overlook the crucial contribution of traditional collaborative
+signals in discerning users' profound intentions and disregard the
+interrelatedness among tasks. To address these limitations, we introduce a
+novel framework known as CKF, specifically developed to boost multi-task
+recommendations via personalized collaborative knowledge fusion into LLMs.
+Specifically, our method synergizes traditional collaborative filtering
+models to produce collaborative embeddings, subsequently employing a
+meta-network to construct personalized mapping bridges tailored for each
+user. Once mapped, the embeddings are incorporated into meticulously designed
+prompt templates and then fed into an advanced LLM to represent user
+interests. To investigate the intrinsic relationships among diverse
+recommendation tasks, we develop Multi-Lora, a new parameter-efficient
+approach for multi-task optimization, adept at distinctly segregating
+task-shared and task-specific information. This method forges a connection
+between LLMs and recommendation scenarios, while simultaneously enriching the
+supervisory signal through mutual knowledge transfer among various tasks.
+Extensive experiments and in-depth robustness analyses across four common
+recommendation tasks on four large public datasets substantiate the
+effectiveness and superiority of our framework.
+
+
+
+
+
+
+ + ♻ ☆ How Does Message Passing Improve Collaborative Filtering? NeurIPS'24 + + +
+ Collaborative filtering (CF) has exhibited prominent results for recommender
+systems and has been broadly utilized in real-world applications. A branch of
+research enhances CF methods with the message passing used in graph neural
+networks, due to its strong capability of extracting knowledge from
+graph-structured data, like the user-item bipartite graphs that naturally
+exist in CF. These works assume that message passing helps CF methods in a
+manner akin to its benefits for graph-based learning tasks in general.
+However, even though message passing empirically improves CF, whether or not
+this assumption is correct still needs verification. To address this gap, we
+formally investigate why message passing helps CF from multiple perspectives
+and show that many assumptions made by previous works are not entirely
+accurate. With our curated ablation studies and theoretical analyses, we
+discover that (1) message passing improves CF performance primarily through
+additional representations passed from neighbors during the forward pass,
+rather than through additional gradient updates to neighbor representations
+during back-propagation, and (2) message passing usually helps low-degree
+nodes more than high-degree nodes. Utilizing these novel findings, we present
+Test-time Aggregation for CF, namely TAG-CF, a test-time augmentation
+framework that only conducts message passing once at inference time. The key
+novelty of TAG-CF is that it effectively utilizes graph knowledge while
+circumventing most of the notorious computational overheads of message
+passing. Besides, TAG-CF is extremely versatile and can be used as a
+plug-and-play module to enhance representations trained by different CF
+supervision signals. Evaluated on six datasets, TAG-CF consistently improves
+the recommendation performance of CF methods that do not use graphs by up to
+39.2% on cold users and 31.7% on all users, with little to no extra
+computational overheads.
+
+
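+ The test-time trick reads naturally as one mean-aggregation step over the
+user-item graph applied only at inference, as in the sketch below; this is a
+schematic reading of the abstract, not the released code.
+
+import numpy as np
+
+def tag_cf_scores(user_emb, item_emb, interactions):
+    # interactions: dict user_index -> list of interacted item indices
+    aggregated = user_emb.copy()
+    for u, items in interactions.items():
+        if items:                             # a single message-passing step
+            aggregated[u] = (user_emb[u] + item_emb[items].mean(axis=0)) / 2
+    return aggregated @ item_emb.T            # score every item for every user
+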
+
+ comment: Accepted to NeurIPS'24. Code available at: + https://github.com/snap-research/Test-time-Aggregation-for-CF +
+
+
+
+
+ + ♻ ☆ Assessing Brittleness of Image-Text Retrieval Benchmarks from + Vision-Language Models Perspective + + +
+ We examine the brittleness of the image-text retrieval (ITR) evaluation
+pipeline with a focus on concept granularity. We start by analyzing two common
+benchmarks, MS-COCO and Flickr30k, and compare them with their augmented,
+fine-grained versions, MS-COCO-FG and Flickr30k-FG, given a specified set of
+linguistic features capturing concept granularity. Flickr30k-FG and MS-COCO-FG
+consistently give rise to higher scores across all the selected features. To
+further our understanding of the impact of granularity, we consider a novel
+taxonomy of query perturbations. We apply these perturbations to the selected
+datasets. We evaluate four diverse state-of-the-art Vision-Language models on
+both the standard and fine-grained datasets under zero-shot conditions, with
+and without the applied perturbations. The results demonstrate that although
+perturbations generally degrade model performance, the fine-grained datasets
+exhibit a smaller performance drop than their standard counterparts. The
+relative performance drop across all setups is consistent across all models and
+datasets, indicating that the issue lies within the benchmarks themselves. We
+conclude by providing an agenda for improving ITR evaluation pipelines.
+
+
+
+
+
+ + ♻ ☆ Sentiment-Driven Community Detection in a Network of Perfume Preferences + + +
+ Network analysis is increasingly important across various fields, including +the fragrance industry, where perfumes are represented as nodes and shared user +preferences as edges in perfume networks. Community detection can uncover +clusters of similar perfumes, providing insights into consumer preferences, +enhancing recommendation systems, and informing targeted marketing strategies. + This study aims to apply community detection techniques to group perfumes +favored by users into relevant clusters for better recommendations. We +constructed a bipartite network from user reviews on the Persian retail +platform "Atrafshan," with nodes representing users and perfumes, and edges +formed by positive comments. This network was transformed into a Perfume +Co-Preference Network, connecting perfumes liked by the same users. By applying +community detection algorithms, we identified clusters based on shared +preferences, enhancing our understanding of user sentiment in the fragrance +market. + To improve sentiment analysis, we integrated emojis and a user voting system +for greater accuracy. Emojis, aligned with their Persian counterparts, captured +the emotional tone of reviews, while user ratings for scent, longevity, and +sillage refined sentiment classification. Edge weights were adjusted by +combining adjacency values with user ratings in a 60:40 ratio, reflecting both +connection strength and user preferences. These enhancements led to improved +modularity of detected communities, resulting in more accurate perfume +groupings. + This research pioneers the use of community detection in perfume networks, +offering new insights into consumer preferences. Our advancements in sentiment +analysis and edge weight refinement provide actionable insights for optimizing +product recommendations and marketing strategies in the fragrance industry. + +
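+ The 60:40 edge-weighting rule lends itself to a short sketch; here networkx's
+greedy modularity algorithm stands in for whichever community detection method
+the study used, and the edge data are made up:
+
+import networkx as nx
+from networkx.algorithms.community import greedy_modularity_communities
+
+G = nx.Graph()
+# (perfume_a, perfume_b, shared-like count, mean user rating in [0, 1])
+edges = [("A", "B", 3, 0.8), ("B", "C", 5, 0.6), ("A", "C", 1, 0.9)]
+for a, b, co_likes, rating in edges:
+    # 60:40 blend of connection strength and user ratings, as described above.
+    G.add_edge(a, b, weight=0.6 * co_likes + 0.4 * rating)
+
+communities = greedy_modularity_communities(G, weight="weight")
+print([sorted(c) for c in communities])
+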
+
+
+
+
+ + ♻ ☆ Transforming Location Retrieval at Airbnb: A Journey from Heuristics to + Reinforcement Learning CIKM 2024 + + +
+ The Airbnb search system grapples with many unique challenges as it continues
+to evolve. We oversee a marketplace that is nuanced by geography, diversity of
+homes, and guests with a variety of preferences. Crafting an efficient search
+system that can accommodate diverse guest needs, while showcasing relevant
+homes, lies at the heart of Airbnb's success. Airbnb search has many challenges
+that parallel other recommendation and search systems, but it has a unique
+information retrieval problem, upstream of ranking, called location retrieval.
+It requires defining a topological map area that is relevant to the searched
+query for home listing retrieval. The purpose of this paper is to demonstrate
+the methodology, challenges, and impact of building a machine learning based
+location retrieval product from the ground up. Despite the lack of suitable,
+prevalent machine learning based approaches, we tackle cold start,
+generalization, differentiation and algorithmic bias. We detail the efficacy of
+heuristics, statistics, machine learning, and reinforcement learning approaches
+to solve these challenges, particularly for systems that are often unexplored
+by current literature.
+
+
+ comment: Published at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ FLEX: Expert-level False-Less EXecution Metric for Reliable Text-to-SQL + Benchmark + + +
+ Text-to-SQL systems have become crucial for translating natural language into
+SQL queries in various industries, enabling non-technical users to perform
+complex data operations. The need for accurate evaluation methods has increased
+as these systems have grown more sophisticated. However, Execution Accuracy
+(EX), the most prevalent evaluation metric, still produces many false positives
+and false negatives. Thus, this paper introduces FLEX (False-Less EXecution), a
+novel approach to evaluating text-to-SQL systems using large language models
+(LLMs) to emulate human expert-level evaluation of SQL queries. Our metric
+improves agreement with human experts (raising Cohen's kappa from 62 to 87.04)
+with comprehensive context and sophisticated criteria. Our extensive
+experiments yield several key insights: (1) Models' performance increases by
+over 2.6 points on average, substantially affecting rankings on the Spider and
+BIRD benchmarks; (2) The underestimation of models under EX primarily stems
+from annotation quality issues; and (3) Model performance on particularly
+challenging questions tends to be overestimated. This work contributes to a
+more accurate and nuanced evaluation of text-to-SQL systems, potentially
+reshaping our understanding of state-of-the-art performance in this field.
+
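+ Agreement between a metric and human experts can be measured exactly as the
+abstract describes; a toy computation with made-up verdicts (1 = query judged
+correct):
+
+from sklearn.metrics import cohen_kappa_score
+
+human_expert = [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]
+ex_metric    = [1, 1, 1, 0, 0, 1, 0, 1, 1, 1]  # execution-accuracy verdicts
+flex_judge   = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0]  # LLM-judge verdicts
+
+print(cohen_kappa_score(human_expert, ex_metric))   # lower agreement
+print(cohen_kappa_score(human_expert, flex_judge))  # higher agreement
+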
+
+ comment: preprint, under review +
+
+
+
+
+ + ♻ ☆ OpenResearcher: Unleashing AI for Accelerated Scientific Research EMNLP 2024 + + +
+ The rapid growth of scientific literature poses significant challenges for
+researchers endeavoring to stay updated with the latest advancements in their
+fields and to delve into new areas. We introduce OpenResearcher, an innovative
+platform that leverages Artificial Intelligence (AI) techniques to accelerate
+the research process by answering diverse questions from researchers.
+OpenResearcher is built on Retrieval-Augmented Generation (RAG) to integrate
+Large Language Models (LLMs) with up-to-date, domain-specific knowledge.
+Moreover, we develop various tools for OpenResearcher to understand
+researchers' queries, search the scientific literature, filter retrieved
+information, provide accurate and comprehensive answers, and self-refine these
+answers. OpenResearcher can flexibly use these tools to balance efficiency and
+effectiveness. As a result, OpenResearcher enables researchers to save time and
+increase their potential to discover new insights and drive scientific
+breakthroughs. Demo, video, and code are available at:
+https://github.com/GAIR-NLP/OpenResearcher.
+
+
+ comment: Accepted to Demo track of EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Better Late Than Never: Formulating and Benchmarking Recommendation + Editing + + +
+ Recommendation systems play a pivotal role in suggesting items to users based
+on their preferences. However, in online platforms, these systems inevitably
+offer unsuitable recommendations due to limited model capacity, poor data
+quality, or evolving user interests. Enhancing user experience necessitates
+efficiently rectifying such unsuitable recommendation behaviors. This paper
+introduces a novel and significant task termed recommendation editing, which
+focuses on modifying known and unsuitable recommendation behaviors.
+Specifically, this task aims to adjust the recommendation model to eliminate
+known unsuitable items without accessing training data or retraining the model.
+We formally define the problem of recommendation editing with three primary
+objectives: strict rectification, collaborative rectification, and concentrated
+rectification. Three evaluation metrics are developed to quantitatively assess
+the achievement of each objective. We present a straightforward yet effective
+benchmark for recommendation editing using a novel Editing Bayesian
+Personalized Ranking loss. To demonstrate the effectiveness of the proposed
+method, we establish a comprehensive benchmark that incorporates various
+methods from related fields. The codebase is available at
+https://github.com/cycl2018/Recommendation-Editing.
+
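+ A guess at the shape of an editing-style BPR objective (the paper's exact
+loss is not reproduced here; this sketch only pushes known-unsuitable items
+below retained items for one user):
+
+import numpy as np
+
+def editing_bpr_loss(scores, unsuitable, suitable):
+    # scores: dict item -> model score for one user
+    loss = 0.0
+    for bad in unsuitable:
+        for good in suitable:
+            margin = scores[good] - scores[bad]
+            loss += -np.log(1.0 / (1.0 + np.exp(-margin)))  # -log sigmoid
+    return loss / (len(unsuitable) * len(suitable))
+
+print(editing_bpr_loss({"a": 2.0, "b": 0.5}, unsuitable=["b"], suitable=["a"]))
+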
+
+
+
+
+ + ♻ ☆ Information Extraction in Low-Resource Scenarios: Survey and Perspective + + +
+ Information Extraction (IE) seeks to derive structured information from
+unstructured texts, often facing challenges in low-resource scenarios due to
+data scarcity and unseen classes. This paper presents a review of neural
+approaches to low-resource IE from \emph{traditional} and \emph{LLM-based}
+perspectives, systematically categorizing them into a fine-grained taxonomy.
+Then we conduct an empirical study of LLM-based methods, comparing them with
+previous state-of-the-art models, and discover that (1) well-tuned LMs are
+still predominant; (2) tuning open-resource LLMs and ICL with the GPT family
+are promising in general; and (3) the optimal LLM-based technical solution for
+low-resource IE can be task-dependent. In addition, we discuss low-resource IE
+with LLMs, highlight promising applications, and outline potential research
+directions. This survey aims to foster understanding of this field, inspire new
+ideas, and encourage widespread applications in both academia and industry.
+
+
+ comment: Accepted by 15th IEEE International Conference on Knowledge Graphs
+  (ICKG2024). Paper List:
+  \url{https://github.com/zjunlp/Low-resource-KEPapers}; Data and Code:
+  \url{https://github.com/mayubo2333/LLM_project}
+
+
+
+
+
+
+
+ + Multimedia 12 + +
+
+
+ + ☆ Knowledge Distillation for Real-Time Classification of Early Media in + Voice Communications + + +
+ This paper investigates the industrial setting of real-time classification of
+early media exchanged during the initialization phase of voice calls. We
+explore the application of state-of-the-art audio tagging models and highlight
+some limitations when they are applied to the classification of early media.
+While most existing approaches leverage convolutional neural networks, we
+propose a novel approach for low-resource requirements based on
+gradient-boosted trees. Our approach not only demonstrates a substantial
+improvement in runtime performance, but also exhibits comparable accuracy. We
+show that leveraging knowledge distillation and class aggregation techniques to
+train a simpler and smaller model accelerates the classification of early media
+in voice calls. We provide a detailed analysis of the results on a proprietary
+and a publicly available dataset, regarding accuracy and runtime performance.
+We additionally report a case study of the achieved performance improvements at
+a regional data center in India.
+
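+ In spirit, the distillation step can look like the following sketch, where a
+large audio tagger's (here faked) posteriors over aggregated early-media
+classes become training targets for a small gradient-boosted-tree student:
+
+import numpy as np
+from sklearn.ensemble import HistGradientBoostingClassifier
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(5000, 40))             # placeholder audio features
+
+# Stand-in for teacher posteriors after aggregating fine-grained tags
+# into a few early-media classes (e.g., ringback, announcement, speech).
+teacher_probs = rng.dirichlet(np.ones(4), size=5000)
+targets = teacher_probs.argmax(axis=1)      # hard-label distillation
+
+student = HistGradientBoostingClassifier(max_iter=200)
+student.fit(X, targets)                     # small, fast student model
+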
+
+
+
+
+ + ☆ OmniSep: Unified Omni-Modality Sound Separation with Query-Mixup + + +
+ Scaling up has brought tremendous success in the fields of vision and
+language in recent years. When it comes to audio, however, researchers
+encounter a major challenge in scaling up the training data, as most natural
+audio contains diverse interfering signals. To address this limitation, we
+introduce Omni-modal Sound Separation (OmniSep), a novel framework capable of
+isolating clean soundtracks based on omni-modal queries, encompassing both
+single-modal and multi-modal composed queries. Specifically, we introduce the
+Query-Mixup strategy, which blends query features from different modalities
+during training. This enables OmniSep to optimize multiple modalities
+concurrently, effectively bringing all modalities under a unified framework for
+sound separation. We further enhance this flexibility by allowing queries to
+influence sound separation positively or negatively, facilitating the retention
+or removal of specific sounds as desired. Finally, OmniSep employs a
+retrieval-augmented approach known as Query-Aug, which enables open-vocabulary
+sound separation. Experimental evaluations on the MUSIC, VGGSOUND-CLEAN+, and
+MUSIC-CLEAN+ datasets demonstrate the effectiveness of OmniSep, achieving
+state-of-the-art performance in text-, image-, and audio-queried sound
+separation tasks. For samples and further information, please visit the demo
+page at \url{https://omnisep.github.io/}.
+
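+ The Query-Mixup strategy reduces to a convex blend of modality features; a
+sketch with per-example random weights (the sampling scheme is an assumption):
+
+import numpy as np
+
+def query_mixup(q_text, q_image, q_audio, rng=np.random.default_rng()):
+    # Blend query features from different modalities during training so the
+    # separator is optimized for all modalities under one framework.
+    w = rng.dirichlet(np.ones(3))
+    return w[0] * q_text + w[1] * q_image + w[2] * q_audio
+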
+
+ comment: Work in progress
+
+
+
+
+ + ☆ Kandinsky 3: Text-to-Image Synthesis for Multifunctional Generative + Framework EMNLP 2024 + + +
+ Text-to-image (T2I) diffusion models are popular for introducing image
+manipulation methods, such as editing, image fusion, inpainting, etc. At the
+same time, image-to-video (I2V) and text-to-video (T2V) models are also built
+on top of T2I models. We present Kandinsky 3, a novel T2I model based on latent
+diffusion, achieving a high level of quality and photorealism. The key feature
+of the new architecture is the simplicity and efficiency of its adaptation for
+many types of generation tasks. We extend the base T2I model for various
+applications and create a multifunctional generation system that includes
+text-guided inpainting/outpainting, image fusion, text-image fusion, image
+variations generation, and I2V and T2V generation. We also present a distilled
+version of the T2I model that performs inference in 4 steps of the reverse
+process, running 3 times faster than the base model without reducing image
+quality. We deployed a user-friendly demo system in which all the features can
+be tested in the public domain. Additionally, we released the source code and
+checkpoints for Kandinsky 3 and the extended models. Human evaluations show
+that Kandinsky 3 demonstrates one of the highest quality scores among open
+source generation systems.
+
+
+ comment: Accepted for EMNLP 2024 (Demo track) +
+
+
+
+
+ + ☆ FairStream: Fair Multimedia Streaming Benchmark for Reinforcement + Learning Agents + + +
+ Multimedia streaming accounts for the majority of traffic in today's +internet. Mechanisms like adaptive bitrate streaming control the bitrate of a +stream based on the estimated bandwidth, ideally resulting in smooth playback +and a good Quality of Experience (QoE). However, selecting the optimal bitrate +is challenging under volatile network conditions. This motivated researchers to +train Reinforcement Learning (RL) agents for multimedia streaming. The +considered training environments are often simplified, leading to promising +results with limited applicability. Additionally, the QoE fairness across +multiple streams is seldom considered by recent RL approaches. With this work, +we propose a novel multi-agent environment that comprises multiple challenges +of fair multimedia streaming: partial observability, multiple objectives, agent +heterogeneity and asynchronicity. We provide and analyze baseline approaches +across five different traffic classes to gain detailed insights into the +behavior of the considered agents, and show that the commonly used Proximal +Policy Optimization (PPO) algorithm is outperformed by a simple greedy +heuristic. Future work includes the adaptation of multi-agent RL algorithms and +further expansions of the environment. + +
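+ One plausible form of the greedy baseline (the paper's exact heuristic and
+bitrate ladder are not specified here):
+
+def greedy_bitrate(estimated_bandwidth, ladder=(250, 750, 1850, 4300, 8000)):
+    # Pick the highest bitrate (kbit/s) that fits the bandwidth estimate.
+    feasible = [b for b in ladder if b <= estimated_bandwidth]
+    return feasible[-1] if feasible else ladder[0]
+
+print(greedy_bitrate(2000))  # 1850
+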
+
+
+
+
+ + ☆ Diff-Instruct*: Towards Human-Preferred One-step Text-to-image + Generative Models + + +
+ In this paper, we introduce the Diff-Instruct*(DI*), a data-free approach for +building one-step text-to-image generative models that align with human +preference while maintaining the ability to generate highly realistic images. +We frame human preference alignment as online reinforcement learning using +human feedback (RLHF), where the goal is to maximize the reward function while +regularizing the generator distribution to remain close to a reference +diffusion process. Unlike traditional RLHF approaches, which rely on the KL +divergence for regularization, we introduce a novel score-based divergence +regularization, which leads to significantly better performances. Although the +direct calculation of this divergence remains intractable, we demonstrate that +we can efficiently compute its \emph{gradient} by deriving an equivalent yet +tractable loss function. Remarkably, with Stable Diffusion V1.5 as the +reference diffusion model, DI* outperforms \emph{all} previously leading models +by a large margin. When using the 0.6B PixelArt-$\alpha$ model as the reference +diffusion, DI* achieves a new record Aesthetic Score of 6.30 and an Image +Reward of 1.31 with only a single generation step, almost doubling the scores +of the rest of the models with similar sizes. It also achieves an HPSv2 score +of 28.70, establishing a new state-of-the-art benchmark. We also observe that +DI* can improve the layout and enrich the colors of generated images. + +
+
+
+
+
+ + ☆ ByteNet: Rethinking Multimedia File Fragment Classification through + Visual Perspectives + + +
+ Multimedia file fragment classification (MFFC) aims to identify file fragment
+types, e.g., image/video, audio, and text, without system metadata. It is of
+vital importance in multimedia storage and communication. Existing MFFC methods
+typically treat fragments as 1D byte sequences and emphasize the relations
+between separate bytes (interbytes) for classification. However, the more
+informative relations inside bytes (intrabytes) are overlooked and seldom
+investigated. By looking inside bytes, the bit-level details of file fragments
+can be accessed, enabling a more accurate classification. Motivated by this, we
+first propose Byte2Image, a novel visual representation model that incorporates
+previously overlooked intrabyte information into file fragments and
+reinterprets these fragments as 2D grayscale images. This model involves a
+sliding byte window to reveal the intrabyte information and a rowwise stacking
+of intrabyte n-grams for embedding fragments into a 2D space. Thus, complex
+interbyte and intrabyte correlations can be mined simultaneously using powerful
+vision networks. Additionally, we propose ByteNet, an end-to-end dual-branch
+network, to enhance robust correlation mining and feature representation.
+ByteNet makes full use of the raw 1D byte sequence and the converted 2D image
+through a shallow byte branch feature extraction (BBFE) and a deep image branch
+feature extraction (IBFE) network. In particular, the BBFE, composed of a
+single fully-connected layer, adaptively recognizes the co-occurrence of
+specific bytes within the raw byte sequence, while the IBFE, built on a vision
+Transformer, effectively mines the complex interbyte and intrabyte correlations
+from the converted image. Experiments on two representative benchmarks,
+covering 14 cases, validate that our proposed method outperforms
+state-of-the-art approaches on different cases by up to 12.2%.
+
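+ The Byte2Image construction can be sketched directly in numpy (window size
+and grayscale scaling are assumptions):
+
+import numpy as np
+
+def byte2image(fragment: bytes, window: int = 2) -> np.ndarray:
+    # Slide a byte window over the fragment, expand each window to its bits
+    # (the intrabyte information), and stack the rows into a 2D image.
+    bits = np.unpackbits(np.frombuffer(fragment, dtype=np.uint8))
+    rows = [bits[s * 8:(s + window) * 8] for s in range(len(fragment) - window + 1)]
+    return np.stack(rows).astype(np.uint8) * 255  # grayscale image
+
+img = byte2image(b"example fragment bytes")
+print(img.shape)  # (21, 16): one row per window, window * 8 bit columns
+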
+
+ comment: Accepted in TMM +
+
+
+
+
+ + ☆ Segmenting Watermarked Texts From Language Models NeurIPS 2024 + + +
+ Watermarking is a technique that involves embedding nearly unnoticeable
+statistical signals within generated content to help trace its source. This
+work focuses on a scenario where an untrusted third-party user sends prompts to
+a trusted large language model (LLM) provider, who then generates a text from
+their LLM with a watermark. This setup makes it possible for a detector to
+later identify the source of the text if the user publishes it. The user can
+modify the generated text by substitutions, insertions, or deletions. Our
+objective is to develop a statistical method to detect if a published text is
+LLM-generated from the perspective of a detector. We further propose a
+methodology to segment the published text into watermarked and non-watermarked
+sub-strings. The proposed approach is built upon randomization tests and change
+point detection techniques. We demonstrate that our method ensures Type I and
+Type II error control and can accurately identify watermarked sub-strings by
+finding the corresponding change point locations. To validate our technique, we
+apply it to texts generated by several language models with prompts extracted
+from Google's C4 dataset and obtain encouraging numerical results. We release
+all code publicly at https://github.com/doccstat/llm-watermark-cpd.
+
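+ The paper's segmentation rests on randomization tests; the toy mean-shift
+scan below only illustrates how a change point between watermarked and
+non-watermarked regimes can be located from per-token scores (both the
+statistic and the threshold are simplifications):
+
+import numpy as np
+
+def locate_change_point(token_scores, threshold=5.0):
+    s = np.asarray(token_scores, dtype=float)
+    best_t, best_stat = None, 0.0
+    for t in range(1, len(s)):
+        # Scaled difference of means on either side of a candidate split.
+        stat = abs(s[:t].mean() - s[t:].mean()) * np.sqrt(t * (len(s) - t) / len(s))
+        if stat > best_stat:
+            best_t, best_stat = t, stat
+    return best_t if best_stat > threshold else None
+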
+
+ comment: 25 pages, 12 figures, 2 tables, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Improving Gloss-free Sign Language Translation by Reducing + Representation Density NeurIPS'24 + + +
+ Gloss-free sign language translation (SLT) aims to develop well-performing
+SLT systems with no requirement for costly gloss annotations, but it currently
+still lags behind gloss-based approaches significantly. In this paper, we
+identify a representation density problem that could be a bottleneck in
+restricting the performance of gloss-free SLT. Specifically, the representation
+density problem describes how the visual representations of semantically
+distinct sign gestures tend to be closely packed together in feature space,
+which makes gloss-free methods struggle to distinguish different sign gestures
+and suffer from a sharp performance drop. To address the representation density
+problem, we introduce a simple but effective contrastive learning strategy,
+namely SignCL, which encourages gloss-free models to learn more discriminative
+feature representations in a self-supervised manner. Our experiments
+demonstrate that the proposed SignCL can significantly reduce the
+representation density and improve performance across various translation
+frameworks. Specifically, SignCL improves the BLEU score of the Sign Language
+Transformer and GFSLT-VLP on the CSL-Daily dataset by 39% and 46%,
+respectively, without any increase in model parameters. Compared to Sign2GPT, a
+state-of-the-art method based on large-scale pre-trained vision and language
+models, SignCL achieves better performance with only 35% of its parameters.
+Implementation and checkpoints are available at
+https://github.com/JinhuiYE/SignCL.
+
+
+ comment: Accepted at NeurIPS'24; Representation Density Problem and + Performance Drop in Gloss-free SLT +
+
+
+
+
+ + ♻ ☆ Audio-Visual Instance Segmentation + + +
+ In this paper, we propose a new multi-modal task, termed audio-visual
+instance segmentation (AVIS), which aims to simultaneously identify, segment
+and track individual sounding object instances in audible videos. To facilitate
+this research, we introduce a high-quality benchmark named AVISeg, containing
+over 90K instance masks from 26 semantic categories in 926 long videos.
+Additionally, we propose a strong baseline model for this task. Our model first
+localizes the sound source within each frame, and condenses object-specific
+contexts into concise tokens. Then it builds long-range audio-visual
+dependencies between these tokens using window-based attention, and tracks
+sounding objects across entire video sequences. Extensive experiments reveal
+that our method performs best on AVISeg, surpassing existing methods from
+related tasks. We further conduct an evaluation on several multi-modal large
+models; however, they exhibit subpar performance on instance-level sound
+source localization and temporal perception. We expect that AVIS will inspire
+the community towards a more comprehensive multi-modal understanding.
+
+
+
+
+
+ + ♻ ☆ Joint Explicit and Implicit Cross-Modal Interaction Network for Anterior + Chamber Inflammation Diagnosis + + +
+ Uveitis demands the precise diagnosis of anterior chamber inflammation (ACI)
+for optimal treatment. However, current diagnostic methods only rely on a
+limited single-modal disease perspective, which leads to poor performance. In
+this paper, we investigate a promising yet challenging way to fuse multimodal
+data for ACI diagnosis. Notably, existing fusion paradigms focus on empowering
+implicit modality interactions (i.e., self-attention and its variants), but
+neglect to inject explicit modality interactions, especially those derived from
+clinical knowledge and imaging properties. To this end, we propose a jointly
+Explicit and implicit Cross-Modal Interaction Network (EiCI-Net) for anterior
+chamber inflammation diagnosis that uses anterior segment optical coherence
+tomography (AS-OCT) images, slit-lamp images, and clinical data jointly.
+Specifically, we first develop CNN-based encoders and a Tabular Processing
+Module (TPM) to extract efficient feature representations in different
+modalities. Then, we devise an Explicit Cross-Modal Interaction Module (ECIM)
+that generates attention maps, as a kind of explicit clinical knowledge, from
+the tabular feature maps and integrates them into the slit-lamp feature maps,
+allowing the CNN-based encoder to focus on the most informative regions of the
+slit-lamp images. After that, the Implicit Cross-Modal Interaction Module
+(ICIM), a transformer-based network, further implicitly enhances modality
+interactions. Finally, we construct a sizable real-world dataset from our
+collaborating hospital and conduct sufficient experiments to demonstrate the
+superior performance of our proposed EiCI-Net compared with state-of-the-art
+classification methods on various metrics.
+
+
+ comment: IEEE MedAI 2024 +
+
+
+
+
+ + ♻ ☆ Double Mixture: Towards Continual Event Detection from Speech + + +
+ Speech event detection is crucial for multimedia retrieval, involving the +tagging of both semantic and acoustic events. Traditional ASR systems often +overlook the interplay between these events, focusing solely on content, even +though the interpretation of dialogue can vary with environmental context. This +paper tackles two primary challenges in speech event detection: the continual +integration of new events without forgetting previous ones, and the +disentanglement of semantic from acoustic events. We introduce a new task, +continual event detection from speech, for which we also provide two benchmark +datasets. To address the challenges of catastrophic forgetting and effective +disentanglement, we propose a novel method, 'Double Mixture.' This method +merges speech expertise with robust memory mechanisms to enhance adaptability +and prevent forgetting. Our comprehensive experiments show that this task +presents significant challenges that are not effectively addressed by current +state-of-the-art methods in either computer vision or natural language +processing. Our approach achieves the lowest rates of forgetting and the +highest levels of generalization, proving robust across various continual +learning sequences. Our code and data are available at +https://anonymous.4open.science/status/Continual-SpeechED-6461. + +
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ♻ ☆ MemeCLIP: Leveraging CLIP Representations for Multimodal Meme + Classification EMNLP 2024 + + +
+ The complexity of text-embedded images presents a formidable challenge in +machine learning given the need for multimodal understanding of multiple +aspects of expression conveyed by them. While previous research in multimodal +analysis has primarily focused on singular aspects such as hate speech and its +subclasses, this study expands this focus to encompass multiple aspects of +linguistics: hate, targets of hate, stance, and humor. We introduce a novel +dataset PrideMM comprising 5,063 text-embedded images associated with the +LGBTQ+ Pride movement, thereby addressing a serious gap in existing resources. +We conduct extensive experimentation on PrideMM by using unimodal and +multimodal baseline methods to establish benchmarks for each task. +Additionally, we propose a novel framework MemeCLIP for efficient downstream +learning while preserving the knowledge of the pre-trained CLIP model. The +results of our experiments show that MemeCLIP achieves superior performance +compared to previously proposed frameworks on two real-world datasets. We +further compare the performance of MemeCLIP and zero-shot GPT-4 on the hate +classification task. Finally, we discuss the shortcomings of our model by +qualitatively analyzing misclassified samples. Our code and dataset are +publicly available at: https://github.com/SiddhantBikram/MemeCLIP. + +
+
+ comment: Accepted to EMNLP 2024 (Main) +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ R^3AG: First Workshop on Refined and Reliable Retrieval Augmented + Generation SIGIR + + +
+ Retrieval-augmented generation (RAG) has gained wide attention as the key +component to improve generative models with external knowledge augmentation +from information retrieval. It has shown great prominence in enhancing the +functionality and performance of large language model (LLM)-based applications. +However, with the comprehensive application of RAG, more and more problems and +limitations have been identified, thus urgently requiring further fundamental +exploration to improve current RAG frameworks. This workshop aims to explore in +depth how to conduct refined and reliable RAG for downstream AI tasks. + To this end, we propose to organize the first R3AG workshop at SIGIR-AP 2024 +to call for participants to re-examine and formulate the basic principles and +practical implementation of refined and reliable RAG. The workshop serves as a +platform for both academia and industry researchers to conduct discussions, +share insights, and foster research to build the next generation of RAG +systems. Participants will engage in discussions and presentations focusing on +fundamental challenges, cutting-edge research, and potential pathways to +improve RAG. At the end of the workshop, we aim to have a clearer understanding +of how to improve the reliability and applicability of RAG with more robust +information retrieval and language generation. + +
+
+ comment: R^3AG workshop overview at SIGIR-AP 2024 +
+
+
+
+
+ + ☆ Coherence-guided Preference Disentanglement for Cross-domain + Recommendations + + +
+ Discovering user preferences across different domains is pivotal in +cross-domain recommendation systems, particularly when platforms lack +comprehensive user-item interactive data. The limited presence of shared users +often hampers the effective modeling of common preferences. While leveraging +shared items' attributes, such as category and popularity, can enhance +cross-domain recommendation performance, the scarcity of shared items between +domains has limited research in this area. To address this, we propose a +Coherence-guided Preference Disentanglement (CoPD) method aimed at improving +cross-domain recommendation by i) explicitly extracting shared item attributes +to guide the learning of shared user preferences and ii) disentangling these +preferences to identify specific user interests transferred between domains. +CoPD introduces coherence constraints on item embeddings of shared and specific +domains, aiding in extracting shared attributes. Moreover, it utilizes these +attributes to guide the disentanglement of user preferences into separate +embeddings for interest and conformity through a popularity-weighted loss. +Experiments conducted on real-world datasets demonstrate the superior +performance of our proposed CoPD over existing competitive baselines, +highlighting its effectiveness in enhancing cross-domain recommendation +performance. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ Automatic Estimation of Singing Voice Musical Dynamics + + +
+ Musical dynamics form a core part of expressive singing voice performances.
+However, automatic analysis of musical dynamics for singing voice has received
+limited attention, partly due to the scarcity of suitable datasets and a lack
+of clear evaluation frameworks. To address this challenge, we propose a
+methodology for dataset curation. Employing the proposed methodology, we
+compile a dataset comprising 509 singing voice performances annotated with
+musical dynamics, aligned with 163 score files, leveraging state-of-the-art
+source separation and alignment techniques. The scores are sourced from the
+OpenScore Lieder corpus of romantic-era compositions, widely known for its
+wealth of expressive annotations. Utilizing the curated dataset, we train a
+multi-head attention based CNN model with varying window sizes to evaluate the
+effectiveness of estimating musical dynamics. We explore two distinct
+perceptually motivated input representations for the model training: the
+log-Mel spectrum and bark-scale based features. For testing, we manually curate
+another dataset of 25 performances annotated with musical dynamics, in
+collaboration with a professional vocalist. We conclude through our experiments
+that bark-scale based features outperform log-Mel features for the task of
+singing voice dynamics prediction. The dataset along with the code is shared
+publicly for further research on the topic.
+
+
+ comment: To be published in ISMIR 2024, 6 pages +
+
+
+
+
+ + ☆ Prototypical Extreme Multi-label Classification with a Dynamic Margin + Loss + + +
+ Extreme Multi-label Classification (XMC) methods predict relevant labels for
+a given query in an extremely large label space. Recent works in XMC address
+this problem using deep encoders that project text descriptions to an embedding
+space suitable for recovering the closest labels. However, learning deep models
+can be computationally expensive in large output spaces, resulting in a
+trade-off between high-performing brute-force approaches and efficient
+solutions. In this paper, we propose PRIME, an XMC method that employs a novel
+prototypical contrastive learning technique to reconcile efficiency and
+performance, surpassing brute-force approaches. We frame XMC as a
+data-to-prototype prediction task where label prototypes aggregate information
+from related queries. More precisely, we use a shallow transformer encoder that
+we coin the Label Prototype Network, which enriches label representations by
+aggregating text-based embeddings, label centroids and learnable free vectors.
+We jointly train a deep encoder and the Label Prototype Network using an
+adaptive triplet loss objective that better adapts to the high granularity and
+ambiguity of extreme label spaces. PRIME achieves state-of-the-art results in
+several public benchmarks of different sizes and domains, while keeping the
+model efficient.
+
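+ The adaptive triplet objective might take a form like this sketch, where the
+margin shrinks as the negative prototype approaches the positive one (the
+exact schedule used by PRIME is assumed, not quoted):
+
+import numpy as np
+
+def adaptive_triplet_loss(anchor, positive, negative, base_margin=0.3):
+    d_ap = np.linalg.norm(anchor - positive)
+    d_an = np.linalg.norm(anchor - negative)
+    ambiguity = np.exp(-np.linalg.norm(positive - negative))  # in (0, 1]
+    margin = base_margin * (1.0 - ambiguity)  # smaller margin when ambiguous
+    return max(0.0, d_ap - d_an + margin)
+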
+
+
+
+
+ + ☆ Efficient and Effective Retrieval of Dense-Sparse Hybrid Vectors using + Graph-based Approximate Nearest Neighbor Search + + +
+ Approximate nearest neighbor search (ANNS) over embedded vector
+representations of texts is commonly used in information retrieval, with two
+important information representations being sparse and dense vectors. While it
+has been shown that combining these representations improves accuracy, the
+current method of conducting sparse and dense vector searches separately
+suffers from low scalability and high system complexity. Alternatively,
+building a unified index faces challenges with accuracy and efficiency. To
+address these issues, we propose a graph-based ANNS algorithm for dense-sparse
+hybrid vectors. Firstly, we propose a distribution alignment method to improve
+accuracy, which pre-samples dense and sparse vectors to analyze their distance
+distribution statistics, resulting in a 1%$\sim$9% increase in accuracy.
+Secondly, to improve efficiency, we design an adaptive two-stage computation
+strategy that initially computes dense distances only and later computes hybrid
+distances. Further, we prune the sparse vectors to speed up the calculation.
+Compared to a naive implementation, we achieve a $\sim2.1\times$ acceleration.
+Thorough experiments show that our algorithm achieves 8.9x$\sim$11.7x
+throughput at equal accuracy compared to existing hybrid vector search
+algorithms.
+
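+ The adaptive two-stage strategy can be pictured with a dense-matrix stand-in
+(real systems keep the sparse side in an inverted index; sizes and names are
+illustrative):
+
+import numpy as np
+
+def two_stage_search(q_dense, q_sparse, dense, sparse, k=10, shortlist=100):
+    # Stage 1: cheap dense-only scores select a candidate shortlist.
+    dense_scores = dense @ q_dense
+    cand = np.argsort(-dense_scores)[:shortlist]
+    # Stage 2: hybrid (dense + sparse) scores re-rank only the shortlist.
+    hybrid = dense_scores[cand] + sparse[cand] @ q_sparse
+    return cand[np.argsort(-hybrid)[:k]]
+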
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ An approach to hummed-tune and song sequences matching + + +
+ A melody stuck in your head, also known as an "earworm", is tough to get rid
+of, unless you listen to it again or sing it out loud. But what if you cannot
+find the name of that song? It must be an intolerable feeling. Recognizing a
+song name based on a hummed tune is not an easy task for a human being and
+should be done by machines. However, no research paper has been published on
+hummed-tune recognition. We adapt from the Hum2Song Zalo AI Challenge 2021, a
+competition about querying the name of a song from a user's hummed tune, which
+is similar to Google's Hum to Search. This paper covers details about
+pre-processing the data from the original format (mp3) into a usable form for
+training and inference. To train an embedding model for the feature extraction
+phase, we ran experiments with several state-of-the-art architectures, such as
+ResNet, VGG, AlexNet, and MobileNetV2. For the inference phase, we use the
+Faiss module to efficiently search for the songs that best match the hummed
+sequence. The result reaches nearly 94\% in the MRR@10 metric on the public
+test set, along with the top-1 position on the public leaderboard.
+
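+ The retrieval step maps onto the standard Faiss pattern; the embeddings below
+are random placeholders for the hum/song encoder outputs:
+
+import faiss
+import numpy as np
+
+d = 512                                   # embedding size (assumed)
+index = faiss.IndexFlatIP(d)              # inner product = cosine on unit norm
+
+song_vecs = np.float32(np.random.randn(10_000, d))
+faiss.normalize_L2(song_vecs)
+index.add(song_vecs)
+
+hum = np.float32(np.random.randn(1, d))   # embedding of the user's humming
+faiss.normalize_L2(hum)
+scores, song_ids = index.search(hum, 10)  # top-10 candidates, as in MRR@10
+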
+
+
+
+
+ + ☆ WindTunnel -- A Framework for Community Aware Sampling of Large Corpora + + +
+ Conducting comprehensive information retrieval experiments, such as in search +or retrieval augmented generation, often comes with high computational costs. +This is because evaluating a retrieval algorithm requires indexing the entire +corpus, which is significantly larger than the set of (query, result) pairs +under evaluation. This issue is especially pronounced in big data and neural +retrieval, where indexing becomes increasingly time-consuming and complex. In +this paper, we present WindTunnel, a novel framework developed at Yext to +generate representative samples of large corpora, enabling efficient end-to-end +information retrieval experiments. By preserving the community structure of the +dataset, WindTunnel overcomes limitations in current sampling methods, +providing more accurate evaluations. + +
+
+
+
+
+ + ♻ ☆ Geometric Collaborative Filtering with Convergence + + +
+ Latent variable collaborative filtering methods have been a standard approach
+to modelling user-click interactions due to their simplicity and effectiveness.
+However, there is limited work on analyzing the mathematical properties of
+these methods, in particular on preventing overfitting towards the identity,
+and such methods typically utilize loss functions that overlook the geometry
+between items. In this work, we introduce a notion of generalization gap in
+collaborative filtering and analyze it with respect to latent collaborative
+filtering models. We present a geometric upper bound that gives rise to loss
+functions, and a way to meaningfully utilize the geometry of item metadata to
+improve recommendations. We show how these losses can be minimized and give the
+recipe for a new latent collaborative filtering algorithm, which we refer to as
+GeoCF, due to the geometric nature of our results. We then show experimentally
+that our proposed GeoCF algorithm can outperform all other existing methods on
+the Movielens20M and Netflix datasets, as well as two large-scale internal
+datasets. In summary, our work proposes a theoretically sound method which
+paves a way to better understand generalization of collaborative filtering at
+large.
+
+
+ comment: 13 pages, 1 figure, 3 tables +
+
+
+
+
+ + ♻ ☆ Personalized Summarization of Scientific Scholarly Texts + + +
+ In this paper, we present a proposal for an unsupervised algorithm, P-Summ,
+that generates an extractive summary of scientific scholarly text to meet the
+personal knowledge needs of the user. The method delves into the latent
+semantic space of the document exposed by Weighted Non-negative Matrix
+Factorization, and scores sentences in consonance with the knowledge needs of
+the user. The novelty of the algorithm lies in its ability to include desired
+knowledge and eliminate unwanted knowledge in the personal summary.
+ We also propose a multi-granular evaluation framework, which assesses the
+quality of generated personal summaries at three levels of granularity:
+sentence, term and semantic. The framework uses a system-generated generic
+summary, instead of a human-generated summary, as the gold standard for
+evaluating the quality of the personal summary generated by the algorithm. The
+effectiveness of the algorithm at the semantic level is evaluated by taking
+into account the reference summary and the knowledge signals. We evaluate the
+performance of the P-Summ algorithm over four datasets consisting of scientific
+articles. Our empirical investigations reveal that the proposed method has the
+capability to meet negative (or positive) knowledge preferences of the user.
+
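+ A compressed sketch of the scoring idea: factorize a sentence-term matrix,
+score sentences by topic salience, then boost or penalize them with the user's
+knowledge signals (plain NMF stands in for the paper's weighted variant, and
+the sentences and weighting constants are made up):
+
+import numpy as np
+from sklearn.decomposition import NMF
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+sentences = [
+    "We study retrieval augmented generation for scholarly search.",
+    "The hardware setup uses commodity GPUs.",
+    "Retrieval quality depends on the embedding model.",
+    "Our experiments cover four scientific datasets.",
+]
+wanted, unwanted = {"retrieval"}, {"hardware"}   # user knowledge signals
+
+A = TfidfVectorizer().fit_transform(sentences)   # sentence-term matrix
+W = NMF(n_components=2, init="nndsvda", random_state=0).fit_transform(A)
+
+scores = W.sum(axis=1)                           # latent topic salience
+for i, s in enumerate(sentences):
+    toks = set(s.lower().split())
+    scores[i] += 0.5 * len(toks & wanted) - 0.5 * len(toks & unwanted)
+summary = [sentences[i] for i in np.argsort(-scores)[:2]]
+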
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ MidiTok Visualizer: a tool for visualization and analysis of tokenized + MIDI symbolic music + + +
+ Symbolic music research plays a crucial role in music-related machine +learning, but MIDI data can be complex for those without musical expertise. To +address this issue, we present MidiTok Visualizer, a web application designed +to facilitate the exploration and visualization of various MIDI tokenization +methods from the MidiTok Python package. MidiTok Visualizer offers numerous +customizable parameters, enabling users to upload MIDI files to visualize +tokenized data alongside an interactive piano roll. + +
+
+ comment: in Extended Abstracts for the Late-Breaking Demo Session of the 25th
+  Int. Society for Music Information Retrieval Conf., San Francisco, United
+  States, 2024
+
+
+
+
+ + ♻ ☆ Images that Sound: Composing Images and Sounds on a Single Canvas NeurIPS 2024 + + +
+ Spectrograms are 2D representations of sound that look very different from
+the images found in our visual world. And natural images, when played as
+spectrograms, make unnatural sounds. In this paper, we show that it is possible
+to synthesize spectrograms that simultaneously look like natural images and
+sound like natural audio. We call these visual spectrograms images that sound.
+Our approach is simple and zero-shot, and it leverages pre-trained
+text-to-image and text-to-spectrogram diffusion models that operate in a shared
+latent space. During the reverse process, we denoise noisy latents with both
+the audio and image diffusion models in parallel, resulting in a sample that is
+likely under both models. Through quantitative evaluations and perceptual
+studies, we find that our method successfully generates spectrograms that align
+with a desired audio prompt while also taking on the visual appearance of a
+desired image prompt. Please see our project page for video results:
+https://ificl.github.io/images-that-sound/
+
+
+ comment: Accepted to NeurIPS 2024. Project site: + https://ificl.github.io/images-that-sound/ +
+
+
+
+
+ + ♻ ☆ Veagle: Advancements in Multimodal Representation Learning + + +
+ Lately, researchers in artificial intelligence have taken a strong interest
+in how language and vision come together, giving rise to the development of
+multimodal models that aim to seamlessly integrate textual and visual
+information. Multimodal models, an extension of Large Language Models (LLMs),
+have exhibited remarkable capabilities in addressing a diverse array of tasks,
+ranging from image captioning and visual question answering (VQA) to visual
+grounding. While these models have showcased significant advancements,
+challenges persist in accurately interpreting images and answering questions, a
+common occurrence in real-world scenarios. This paper introduces a novel
+approach to enhance the multimodal capabilities of existing models. In response
+to the limitations observed in current Vision Language Models (VLMs) and
+Multimodal Large Language Models (MLLMs), our proposed model, Veagle,
+incorporates a unique mechanism inspired by the successes and insights of
+previous works. Veagle leverages a dynamic mechanism to project encoded visual
+information directly into the language model. This dynamic approach allows for
+a more nuanced understanding of the intricate details present in visual
+contexts. To validate the effectiveness of Veagle, we conduct comprehensive
+experiments on benchmark datasets, emphasizing tasks such as visual question
+answering and image understanding. Our results indicate an improvement of
+5-6\% in performance, with Veagle outperforming existing models by a notable
+margin. The outcomes underscore the model's versatility and applicability
+beyond traditional benchmarks.
+
+
+
+
+
+ + ♻ ☆ Frieren: Efficient Video-to-Audio Generation Network with Rectified Flow + Matching NeurIPS 2024 + + +
+ Video-to-audio (V2A) generation aims to synthesize content-matching audio +from silent video, and it remains challenging to build V2A models with high +generation quality, efficiency, and visual-audio temporal synchrony. We propose +Frieren, a V2A model based on rectified flow matching. Frieren regresses the +conditional transport vector field from noise to spectrogram latent with +straight paths and conducts sampling by solving ODE, outperforming +autoregressive and score-based models in terms of audio quality. By employing a +non-autoregressive vector field estimator based on a feed-forward transformer +and channel-level cross-modal feature fusion with strong temporal alignment, +our model generates audio that is highly synchronized with the input video. +Furthermore, through reflow and one-step distillation with guided vector field, +our model can generate decent audio in a few, or even only one sampling step. +Experiments indicate that Frieren achieves state-of-the-art performance in both +generation quality and temporal alignment on VGGSound, with alignment accuracy +reaching 97.22%, and 6.2% improvement in inception score over the strong +diffusion-based baseline. Audio samples are available at +http://frieren-v2a.github.io. + +
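+ Rectified-flow sampling itself is just Euler integration of the learned
+vector field from noise to data; the toy field below is, of course, not
+Frieren's trained estimator:
+
+import numpy as np
+
+def sample_rectified_flow(v_field, shape, steps=25, rng=np.random.default_rng()):
+    x = rng.standard_normal(shape)        # start from noise at t = 0
+    dt = 1.0 / steps
+    for n in range(steps):
+        x = x + v_field(x, n * dt) * dt   # straight-path ODE update
+    return x
+
+# With a perfectly straight field, even one step lands on the target,
+# mirroring why reflow/distillation enables few- or one-step generation.
+target = np.ones((4, 4))
+x1 = sample_rectified_flow(lambda x, t: target - x, (4, 4), steps=1)
+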
+
+ comment: accepted by NeurIPS 2024 +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse with TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) { // 9 = Tab (keyCode is legacy but widely supported)
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false; // suppress the default focus change
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`