From dc2ba07dcabc20ce3a11688fc0e02d5712affd63 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Sat, 28 Sep 2024 05:25:50 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 23191 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 23586 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..70cae041 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-09-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.18110v1","updated":"2024-09-26T17:52:57Z","published":"2024-09-26T17:52:57Z","title":"Open-World Evaluation for Retrieving Diverse Perspectives","summary":" We study retrieving a set of documents that covers various perspectives on a\ncomplex and contentious question (e.g., will ChatGPT do more harm than good?).\nWe curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS),\nwhere each example consists of a question and diverse perspectives associated\nwith the question, sourced from survey questions and debate websites. On this\ndata, retrievers paired with a corpus are evaluated to surface a document set\nthat contains diverse perspectives. Our framing diverges from most retrieval\ntasks in that document relevancy cannot be decided by simple string matches to\nreferences. Instead, we build a language model based automatic evaluator that\ndecides whether each retrieved document contains a perspective. This allows us\nto evaluate the performance of three different types of corpus (Wikipedia, web\nsnapshot, and corpus constructed on the fly with retrieved pages from the\nsearch engine) paired with retrievers. Retrieving diverse documents remains\nchallenging, with the outputs from existing retrievers covering all\nperspectives on only 33.74% of the examples. We further study the impact of\nquery expansion and diversity-focused reranking approaches and analyze\nretriever sycophancy. Together, we lay the foundation for future studies in\nretrieval diversity handling complex queries.\n","authors":["Hung-Ting Chen","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2409.18110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12822v3","updated":"2024-09-26T17:39:44Z","published":"2024-06-18T17:43:47Z","title":"Is It Good Data for Multilingual Instruction Tuning or Just Bad\n Multilingual Evaluation for Large Language Models?","summary":" Multilingual large language models are designed, claimed, and expected to\ncater to speakers of varied languages. We hypothesise that the current\npractices of fine-tuning and evaluating these models may not perfectly align\nwith this objective owing to a heavy reliance on translation, which cannot\ncover language-specific knowledge but can introduce translation defects. It\nremains unknown whether the nature of the instruction data has an impact on the\nmodel output; conversely, it is questionable whether translated test sets can\ncapture such nuances. Due to the often coupled practices of using translated\ndata in both stages, such imperfections could have been overlooked. This work\ninvestigates these issues using controlled native or translated data during the\ninstruction tuning and evaluation stages. We show that native or generation\nbenchmarks reveal a notable difference between native and translated\ninstruction data especially when model performance is high, whereas other types\nof test sets cannot. The comparison between round-trip and single-pass\ntranslations reflects the importance of knowledge from language-native\nresources. Finally, we demonstrate that regularization is beneficial to\nbridging this gap on structured but not generative tasks.\n","authors":["Pinzhen Chen","Simon Yu","Zhicheng Guo","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2406.12822v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18073v1","updated":"2024-09-26T17:19:49Z","published":"2024-09-26T17:19:49Z","title":"Infer Human's Intentions Before Following Natural Language Instructions","summary":" For AI agents to be helpful to humans, they should be able to follow natural\nlanguage instructions to complete everyday cooperative tasks in human\nenvironments. However, real human instructions inherently possess ambiguity,\nbecause the human speakers assume sufficient prior knowledge about their hidden\ngoals and intentions. Standard language grounding and planning methods fail to\naddress such ambiguities because they do not model human internal goals as\nadditional partially observable factors in the environment. We propose a new\nframework, Follow Instructions with Social and Embodied Reasoning (FISER),\naiming for better natural language instruction following in collaborative\nembodied tasks. Our framework makes explicit inferences about human goals and\nintentions as intermediate reasoning steps. We implement a set of\nTransformer-based models and evaluate them over a challenging benchmark,\nHandMeThat. We empirically demonstrate that using social reasoning to\nexplicitly infer human intentions before making action plans surpasses purely\nend-to-end approaches. We also compare our implementation with strong\nbaselines, including Chain of Thought prompting on the largest available\npre-trained language models, and find that FISER provides better performance on\nthe embodied social reasoning tasks under investigation, reaching the\nstate-of-the-art on HandMeThat.\n","authors":["Yanming Wan","Yue Wu","Yiping Wang","Jiayuan Mao","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2409.18073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18044v1","updated":"2024-09-26T16:46:46Z","published":"2024-09-26T16:46:46Z","title":"Unveiling the Role of Pretraining in Direct Speech Translation","summary":" Direct speech-to-text translation systems encounter an important drawback in\ndata scarcity. A common solution consists on pretraining the encoder on\nautomatic speech recognition, hence losing efficiency in the training process.\nIn this study, we compare the training dynamics of a system using a pretrained\nencoder, the conventional approach, and one trained from scratch. We observe\nthat, throughout the training, the randomly initialized model struggles to\nincorporate information from the speech inputs for its predictions. Hence, we\nhypothesize that this issue stems from the difficulty of effectively training\nan encoder for direct speech translation. While a model trained from scratch\nneeds to learn acoustic and semantic modeling simultaneously, a pretrained one\ncan just focus on the latter. Based on these findings, we propose a subtle\nchange in the decoder cross-attention to integrate source information from\nearlier steps in training. We show that with this change, the model trained\nfrom scratch can achieve comparable performance to the pretrained one, while\nreducing the training time.\n","authors":["Belen Alastruey","Gerard I. Gállego","Marta R. Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2409.18044v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18042v1","updated":"2024-09-26T16:44:02Z","published":"2024-09-26T16:44:02Z","title":"EMOVA: Empowering Language Models to See, Hear and Speak with Vivid\n Emotions","summary":" GPT-4o, an omni-modal model that enables vocal conversations with diverse\nemotions and tones, marks a milestone for omni-modal foundation models.\nHowever, empowering Large Language Models to perceive and generate images,\ntexts, and speeches end-to-end with publicly available data remains challenging\nin the open-source community. Existing vision-language models rely on external\ntools for the speech processing, while speech-language models still suffer from\nlimited or even without vision-understanding abilities. To address this gap, we\npropose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large\nLanguage Models with end-to-end speech capabilities while maintaining the\nleading vision-language performance. With a semantic-acoustic disentangled\nspeech tokenizer, we notice surprisingly that omni-modal alignment can further\nenhance vision-language and speech abilities compared with the corresponding\nbi-modal aligned counterparts. Moreover, a lightweight style module is proposed\nfor flexible speech style controls (e.g., emotions and pitches). For the first\ntime, EMOVA achieves state-of-the-art performance on both the vision-language\nand speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue\nwith vivid emotions.\n","authors":["Kai Chen","Yunhao Gou","Runhui Huang","Zhili Liu","Daxin Tan","Jing Xu","Chunwei Wang","Yi Zhu","Yihan Zeng","Kuo Yang","Dingdong Wang","Kun Xiang","Haoyuan Li","Haoli Bai","Jianhua Han","Xiaohui Li","Weike Jin","Nian Xie","Yu Zhang","James T. Kwok","Hengshuang Zhao","Xiaodan Liang","Dit-Yan Yeung","Xiao Chen","Zhenguo Li","Wei Zhang","Qun Liu","Lanqing Hong","Lu Hou","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.18042v1.pdf","comment":"Project Page: https://emova-ollm.github.io/"},{"id":"http://arxiv.org/abs/2409.18033v1","updated":"2024-09-26T16:38:56Z","published":"2024-09-26T16:38:56Z","title":"Automated Detection and Analysis of Power Words in Persuasive Text Using\n Natural Language Processing","summary":" Power words are terms that evoke strong emotional responses and significantly\ninfluence readers' behavior, playing a crucial role in fields like marketing,\npolitics, and motivational writing. This study proposes a methodology for the\nautomated detection and analysis of power words in persuasive text using a\ncustom lexicon and the TextBlob library in Python. By identifying the presence\nand frequency of power words within a given text, we aim to classify and\nanalyze their impact on sentiment and reader engagement. This research examines\ndiverse datasets across various domains to provide insights into the\neffectiveness of power words, offering practical applications for content\ncreators, advertisers, and policymakers.\n","authors":["Sahil Garje"],"pdf_url":"https://arxiv.org/pdf/2409.18033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13731v3","updated":"2024-09-26T16:34:35Z","published":"2024-09-10T02:00:28Z","title":"KAG: Boosting LLMs in Professional Domains via Knowledge Augmented\n Generation","summary":" The recently developed retrieval-augmented generation (RAG) technology has\nenabled the efficient construction of domain-specific applications. However, it\nalso has limitations, including the gap between vector similarity and the\nrelevance of knowledge reasoning, as well as insensitivity to knowledge logic,\nsuch as numerical values, temporal relations, expert rules, and others, which\nhinder the effectiveness of professional knowledge services. In this work, we\nintroduce a professional domain knowledge service framework called Knowledge\nAugmented Generation (KAG). KAG is designed to address the aforementioned\nchallenges with the motivation of making full use of the advantages of\nknowledge graph(KG) and vector retrieval, and to improve generation and\nreasoning performance by bidirectionally enhancing large language models (LLMs)\nand KGs through five key aspects: (1) LLM-friendly knowledge representation,\n(2) mutual-indexing between knowledge graphs and original chunks, (3)\nlogical-form-guided hybrid reasoning engine, (4) knowledge alignment with\nsemantic reasoning, and (5) model capability enhancement for KAG. We compared\nKAG with existing RAG methods in multihop question answering and found that it\nsignificantly outperforms state-of-theart methods, achieving a relative\nimprovement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We\nhave successfully applied KAG to two professional knowledge Q&A tasks of Ant\nGroup, including E-Government Q&A and E-Health Q&A, achieving significant\nimprovement in professionalism compared to RAG methods.\n","authors":["Lei Liang","Mengshu Sun","Zhengke Gui","Zhongshu Zhu","Zhouyu Jiang","Ling Zhong","Yuan Qu","Peilong Zhao","Zhongpu Bo","Jin Yang","Huaidong Xiong","Lin Yuan","Jun Xu","Zaoyang Wang","Zhiqiang Zhang","Wen Zhang","Huajun Chen","Wenguang Chen","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.13731v3.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2409.18028v1","updated":"2024-09-26T16:34:35Z","published":"2024-09-26T16:34:35Z","title":"Compositional Hardness of Code in Large Language Models -- A\n Probabilistic Perspective","summary":" A common practice in large language model (LLM) usage for complex analytical\ntasks such as code generation, is to sample a solution for the entire task\nwithin the model's context window. Previous works have shown that subtask\ndecomposition within the model's context (chain of thought), is beneficial for\nsolving such tasks. In this work, we point a limitation of LLMs' ability to\nperform several sub-tasks within the same context window - an in-context\nhardness of composition, pointing to an advantage for distributing a decomposed\nproblem in a multi-agent system of LLMs. The hardness of composition is\nquantified by a generation complexity metric, i.e., the number of LLM\ngenerations required to sample at least one correct solution. We find a gap\nbetween the generation complexity of solving a compositional problem within the\nsame context relative to distributing it among multiple agents, that increases\nexponentially with the solution's length. We prove our results theoretically\nand demonstrate them empirically.\n","authors":["Yotam Wolf","Binyamin Rothberg","Dorin Shteyman","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2409.18028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18025v1","updated":"2024-09-26T16:32:19Z","published":"2024-09-26T16:32:19Z","title":"An Adversarial Perspective on Machine Unlearning for AI Safety","summary":" Large language models are finetuned to refuse questions about hazardous\nknowledge, but these protections can often be bypassed. Unlearning methods aim\nat completely removing hazardous capabilities from models and make them\ninaccessible to adversaries. This work challenges the fundamental differences\nbetween unlearning and traditional safety post-training from an adversarial\nperspective. We demonstrate that existing jailbreak methods, previously\nreported as ineffective against unlearning, can be successful when applied\ncarefully. Furthermore, we develop a variety of adaptive methods that recover\nmost supposedly unlearned capabilities. For instance, we show that finetuning\non 10 unrelated examples or removing specific directions in the activation\nspace can recover most hazardous capabilities for models edited with RMU, a\nstate-of-the-art unlearning method. Our findings challenge the robustness of\ncurrent unlearning approaches and question their advantages over safety\ntraining.\n","authors":["Jakub Łucki","Boyi Wei","Yangsibo Huang","Peter Henderson","Florian Tramèr","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2409.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18023v1","updated":"2024-09-26T16:31:50Z","published":"2024-09-26T16:31:50Z","title":"DARE: Diverse Visual Question Answering with Robustness Evaluation","summary":" Vision Language Models (VLMs) extend remarkable capabilities of text-only\nlarge language models and vision-only models, and are able to learn from and\nprocess multi-modal vision-text input. While modern VLMs perform well on a\nnumber of standard image classification and image-text matching tasks, they\nstill struggle with a number of crucial vision-language (VL) reasoning\nabilities such as counting and spatial reasoning. Moreover, while they might be\nvery brittle to small variations in instructions and/or evaluation protocols,\nexisting benchmarks fail to evaluate their robustness (or rather the lack of\nit). In order to couple challenging VL scenarios with comprehensive robustness\nevaluation, we introduce DARE, Diverse Visual Question Answering with\nRobustness Evaluation, a carefully created and curated multiple-choice VQA\nbenchmark. DARE evaluates VLM performance on five diverse categories and\nincludes four robustness-oriented evaluations based on the variations of:\nprompts, the subsets of answer options, the output format and the number of\ncorrect answers. Among a spectrum of other findings, we report that\nstate-of-the-art VLMs still struggle with questions in most categories and are\nunable to consistently deliver their peak performance across the tested\nrobustness evaluations. The worst case performance across the subsets of\noptions is up to 34% below the performance in the standard case. The robustness\nof the open-source VLMs such as LLaVA 1.6 and Idefics2 cannot match the\nclosed-source models such as GPT-4 and Gemini, but even the latter remain very\nbrittle to different variations.\n","authors":["Hannah Sterz","Jonas Pfeiffer","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2409.18023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18006v1","updated":"2024-09-26T16:15:14Z","published":"2024-09-26T16:15:14Z","title":"Multilingual Evaluation of Long Context Retrieval and Reasoning","summary":" Recent large language models (LLMs) demonstrate impressive capabilities in\nhandling long contexts, some exhibiting near-perfect recall on synthetic\nretrieval tasks. However, these evaluations have mainly focused on English text\nand involved a single target sentence within lengthy contexts. Our work\ninvestigates how LLM performance generalizes to multilingual settings with\nmultiple hidden target sentences. We comprehensively evaluate several\nlong-context LLMs on retrieval and reasoning tasks across five languages:\nEnglish, Vietnamese, Indonesian, Swahili, and Somali. These languages share the\nLatin script but belong to distinct language families and resource levels. Our\nanalysis reveals a significant performance gap between languages. The\nbest-performing models such as Gemini-1.5 and GPT-4o, achieve around 96%\naccuracy in English to around 36% in Somali with a single target sentence.\nHowever, this accuracy drops to 40% in English and 0% in Somali when dealing\nwith three target sentences. Our findings highlight the challenges long-context\nLLMs face when processing longer contexts, an increase in the number of target\nsentences, or languages of lower resource levels.\n","authors":["Ameeta Agrawal","Andy Dang","Sina Bagheri Nezhad","Rhitabrat Pokharel","Russell Scheinberg"],"pdf_url":"https://arxiv.org/pdf/2409.18006v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.17990v1","updated":"2024-09-26T16:02:00Z","published":"2024-09-26T16:02:00Z","title":"Extracting Affect Aggregates from Longitudinal Social Media Data with\n Temporal Adapters for Large Language Models","summary":" This paper proposes temporally aligned Large Language Models (LLMs) as a tool\nfor longitudinal analysis of social media data. We fine-tune Temporal Adapters\nfor Llama 3 8B on full timelines from a panel of British Twitter users, and\nextract longitudinal aggregates of emotions and attitudes with established\nquestionnaires. We validate our estimates against representative British survey\ndata and find strong positive, significant correlations for several collective\nemotions. The obtained estimates are robust across multiple training seeds and\nprompt formulations, and in line with collective emotions extracted using a\ntraditional classification model trained on labeled data. To the best of our\nknowledge, this is the first work to extend the analysis of affect in LLMs to a\nlongitudinal setting through Temporal Adapters. Our work enables new approaches\ntowards the longitudinal analysis of social media data.\n","authors":["Georg Ahnert","Max Pellert","David Garcia","Markus Strohmaier"],"pdf_url":"https://arxiv.org/pdf/2409.17990v1.pdf","comment":"Code available at https://github.com/dess-mannheim/temporal-adapters"},{"id":"http://arxiv.org/abs/2409.17972v1","updated":"2024-09-26T15:47:42Z","published":"2024-09-26T15:47:42Z","title":"BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and\n Adaptive Disambiguate based Efficient Tree Search","summary":" Large Language Models (LLMs) have exhibited exceptional performance across a\nbroad range of tasks and domains. However, they still encounter difficulties in\nsolving mathematical problems due to the rigorous and logical nature of\nmathematics. Previous studies have employed techniques such as supervised\nfine-tuning (SFT), prompt engineering, and search-based methods to improve the\nmathematical problem-solving abilities of LLMs. Despite these efforts, their\nperformance remains suboptimal and demands substantial computational resources.\nTo address this issue, we propose a novel approach, BEATS, to enhance\nmathematical problem-solving abilities. Our method leverages newly designed\nprompts that guide the model to iteratively rewrite, advance by one step, and\ngenerate answers based on previous steps. Additionally, we introduce a new\nback-verification technique that uses LLMs to validate the correctness of the\ngenerated answers. Furthermore, we employ a pruning tree search to optimize\nsearch time while achieving strong performance. Notably, our method improves\nQwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the\nMATH benchmark.\n","authors":["Linzhuang Sun","Hao Liang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17958v1","updated":"2024-09-26T15:36:10Z","published":"2024-09-26T15:36:10Z","title":"The Hard Positive Truth about Vision-Language Compositionality","summary":" Several benchmarks have concluded that our best vision-language models (e.g.,\nCLIP) are lacking in compositionality. Given an image, these benchmarks probe a\nmodel's ability to identify its associated caption amongst a set of\ncompositional distractors. In response, a surge of recent proposals show\nimprovements by finetuning CLIP with distractors as hard negatives. Our\ninvestigations reveal that these improvements have, in fact, been significantly\noverstated -- because existing benchmarks do not probe whether finetuned\nvision-language models remain invariant to hard positives. By curating an\nevaluation dataset with 112,382 hard negatives and hard positives, we uncover\nthat including hard positives decreases CLIP's performance by 12.9%, while\nhumans perform effortlessly at 99%. CLIP finetuned with hard negatives results\nin an even larger decrease, up to 38.7%. With this finding, we then produce a\n1,775,259 image-text training set with both hard negative and hard positive\ncaptions. By training with both, we see improvements on existing benchmarks\nwhile simultaneously improving performance on hard positives, indicating a more\nrobust improvement in compositionality. Our work suggests the need for future\nresearch to rigorously test and improve CLIP's understanding of semantic\nrelationships between related \"positive\" concepts.\n","authors":["Amita Kamath","Cheng-Yu Hsieh","Kai-Wei Chang","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2409.17958v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2305.11231v2","updated":"2024-09-26T15:32:47Z","published":"2023-05-18T18:00:44Z","title":"Recent Trends in Unsupervised Summarization","summary":" Unsupervised summarization is a powerful technique that enables training\nsummarizing models without requiring labeled datasets. This survey covers\ndifferent recent techniques and models used for unsupervised summarization. We\ncover extractive, abstractive, and hybrid models and strategies used to achieve\nunsupervised summarization. While the main focus of this survey is on recent\nresearch, we also cover some of the important previous research. We\nadditionally introduce a taxonomy, classifying different research based on\ntheir approach to unsupervised training. Finally, we discuss the current\napproaches and mention some datasets and evaluation methods.\n","authors":["Mohammad Khosravani","Amine Trabelsi"],"pdf_url":"https://arxiv.org/pdf/2305.11231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13740v2","updated":"2024-09-26T15:27:08Z","published":"2024-09-10T16:37:58Z","title":"Language agents achieve superhuman synthesis of scientific knowledge","summary":" Language models are known to hallucinate incorrect information, and it is\nunclear if they are sufficiently accurate and reliable for use in scientific\nresearch. We developed a rigorous human-AI comparison methodology to evaluate\nlanguage model agents on real-world literature search tasks covering\ninformation retrieval, summarization, and contradiction detection tasks. We\nshow that PaperQA2, a frontier language model agent optimized for improved\nfactuality, matches or exceeds subject matter expert performance on three\nrealistic literature research tasks without any restrictions on humans (i.e.,\nfull access to internet, search tools, and time). PaperQA2 writes cited,\nWikipedia-style summaries of scientific topics that are significantly more\naccurate than existing, human-written Wikipedia articles. We also introduce a\nhard benchmark for scientific literature research called LitQA2 that guided\ndesign of PaperQA2, leading to it exceeding human performance. Finally, we\napply PaperQA2 to identify contradictions within the scientific literature, an\nimportant scientific task that is challenging for humans. PaperQA2 identifies\n2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of\nwhich 70% are validated by human experts. These results demonstrate that\nlanguage model agents are now capable of exceeding domain experts across\nmeaningful tasks on scientific literature.\n","authors":["Michael D. Skarlinski","Sam Cox","Jon M. Laurent","James D. Braza","Michaela Hinks","Michael J. Hammerling","Manvitha Ponnapati","Samuel G. Rodriques","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2409.13740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17946v1","updated":"2024-09-26T15:20:37Z","published":"2024-09-26T15:20:37Z","title":"Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge\n Distillation","summary":" Despite being widely applied due to their exceptional capabilities, Large\nLanguage Models (LLMs) have been proven to be vulnerable to backdoor attacks.\nThese attacks introduce targeted vulnerabilities into LLMs by poisoning\ntraining samples and full-parameter fine-tuning. However, this kind of backdoor\nattack is limited since they require significant computational resources,\nespecially as the size of LLMs increases. Besides, parameter-efficient\nfine-tuning (PEFT) offers an alternative but the restricted parameter updating\nmay impede the alignment of triggers with target labels. In this study, we\nfirst verify that backdoor attacks with PEFT may encounter challenges in\nachieving feasible performance. To address these issues and improve the\neffectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack\nalgorithm from weak to strong based on contrastive knowledge distillation\n(W2SAttack). Specifically, we poison small-scale language models through\nfull-parameter fine-tuning to serve as the teacher model. The teacher model\nthen covertly transfers the backdoor to the large-scale student model through\ncontrastive knowledge distillation, which employs PEFT. Theoretical analysis\nreveals that W2SAttack has the potential to augment the effectiveness of\nbackdoor attacks. We demonstrate the superior performance of W2SAttack on\nclassification tasks across four language models, four backdoor attack\nalgorithms, and two different architectures of teacher models. Experimental\nresults indicate success rates close to 100% for backdoor attacks targeting\nPEFT.\n","authors":["Shuai Zhao","Leilei Gan","Zhongliang Guo","Xiaobao Wu","Luwei Xiao","Xiaoyu Xu","Cong-Duy Nguyen","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2409.17946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17943v1","updated":"2024-09-26T15:18:34Z","published":"2024-09-26T15:18:34Z","title":"On Translating Technical Terminology: A Translation Workflow for\n Machine-Translated Acronyms","summary":" The typical workflow for a professional translator to translate a document\nfrom its source language (SL) to a target language (TL) is not always focused\non what many language models in natural language processing (NLP) do - predict\nthe next word in a series of words. While high-resource languages like English\nand French are reported to achieve near human parity using common metrics for\nmeasurement such as BLEU and COMET, we find that an important step is being\nmissed: the translation of technical terms, specifically acronyms. Some\nstate-of-the art machine translation systems like Google Translate which are\npublicly available can be erroneous when dealing with acronyms - as much as 50%\nin our findings. This article addresses acronym disambiguation for MT systems\nby proposing an additional step to the SL-TL (FR-EN) translation workflow where\nwe first offer a new acronym corpus for public consumption and then experiment\nwith a search-based thresholding algorithm that achieves nearly 10% increase\nwhen compared to Google Translate and OpusMT.\n","authors":["Richard Yue","John E. Ortega","Kenneth Ward Church"],"pdf_url":"https://arxiv.org/pdf/2409.17943v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17939v1","updated":"2024-09-26T15:12:59Z","published":"2024-09-26T15:12:59Z","title":"Predicting Anchored Text from Translation Memories for Machine\n Translation Using Deep Learning Methods","summary":" Translation memories (TMs) are the backbone for professional translation\ntools called computer-aided translation (CAT) tools. In order to perform a\ntranslation using a CAT tool, a translator uses the TM to gather translations\nsimilar to the desired segment to translate (s'). Many CAT tools offer a\nfuzzy-match algorithm to locate segments (s) in the TM that are close in\ndistance to s'. After locating two similar segments, the CAT tool will present\nparallel segments (s, t) that contain one segment in the source language along\nwith its translation in the target language. Additionally, CAT tools contain\nfuzzy-match repair (FMR) techniques that will automatically use the parallel\nsegments from the TM to create new TM entries containing a modified version of\nthe original with the idea in mind that it will be the translation of s'. Most\nFMR techniques use machine translation as a way of \"repairing\" those words that\nhave to be modified. In this article, we show that for a large part of those\nwords which are anchored, we can use other techniques that are based on machine\nlearning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we\nshow that for anchored words that follow the continuous bag-of-words (CBOW)\nparadigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for\nsome cases, better results than neural machine translation for translating\nanchored words from French to English.\n","authors":["Richard Yue","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2409.17939v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17929v1","updated":"2024-09-26T15:08:17Z","published":"2024-09-26T15:08:17Z","title":"The Lou Dataset -- Exploring the Impact of Gender-Fair Language in\n German Text Classification","summary":" Gender-fair language, an evolving German linguistic variation, fosters\ninclusion by addressing all genders or using neutral forms. Nevertheless, there\nis a significant lack of resources to assess the impact of this linguistic\nshift on classification using language models (LMs), which are probably not\ntrained on such variations. To address this gap, we present Lou, the first\ndataset featuring high-quality reformulations for German text classification\ncovering seven tasks, like stance detection and toxicity classification.\nEvaluating 16 mono- and multi-lingual LMs on Lou shows that gender-fair\nlanguage substantially impacts predictions by flipping labels, reducing\ncertainty, and altering attention patterns. However, existing evaluations\nremain valid, as LM rankings of original and reformulated instances do not\nsignificantly differ. While we offer initial insights on the effect on German\ntext classification, the findings likely apply to other languages, as\nconsistent patterns were observed in multi-lingual and English LMs.\n","authors":["Andreas Waldis","Joel Birrer","Anne Lauscher","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2409.17929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17928v1","updated":"2024-09-26T15:07:30Z","published":"2024-09-26T15:07:30Z","title":"Pioneering Reliable Assessment in Text-to-Image Knowledge Editing:\n Leveraging a Fine-Grained Dataset and an Innovative Criterion","summary":" During pre-training, the Text-to-Image (T2I) diffusion models encode factual\nknowledge into their parameters. These parameterized facts enable realistic\nimage generation, but they may become obsolete over time, thereby\nmisrepresenting the current state of the world. Knowledge editing techniques\naim to update model knowledge in a targeted way. However, facing the dual\nchallenges posed by inadequate editing datasets and unreliable evaluation\ncriterion, the development of T2I knowledge editing encounter difficulties in\neffectively generalizing injected knowledge. In this work, we design a T2I\nknowledge editing framework by comprehensively spanning on three phases: First,\nwe curate a dataset \\textbf{CAKE}, comprising paraphrase and multi-object test,\nto enable more fine-grained assessment on knowledge generalization. Second, we\npropose a novel criterion, \\textbf{adaptive CLIP threshold}, to effectively\nfilter out false successful images under the current criterion and achieve\nreliable editing evaluation. Finally, we introduce \\textbf{MPE}, a simple but\neffective approach for T2I knowledge editing. Instead of tuning parameters, MPE\nprecisely recognizes and edits the outdated part of the conditioning\ntext-prompt to accommodate the up-to-date knowledge. A straightforward\nimplementation of MPE (Based on in-context learning) exhibits better overall\nperformance than previous model editors. We hope these efforts can further\npromote faithful evaluation of T2I knowledge editing methods.\n","authors":["Hengrui Gu","Kaixiong Zhou","Yili Wang","Ruobing Wang","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17928v1.pdf","comment":"EMNLP24 Findings"},{"id":"http://arxiv.org/abs/2409.17912v1","updated":"2024-09-26T14:56:38Z","published":"2024-09-26T14:56:38Z","title":"Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan\n Arabic Dialect","summary":" We introduce Atlas-Chat, the first-ever collection of large language models\nspecifically developed for dialectal Arabic. Focusing on Moroccan Arabic, also\nknown as Darija, we construct our instruction dataset by consolidating existing\nDarija language resources, creating novel datasets both manually and\nsynthetically, and translating English instructions with stringent quality\ncontrol. Atlas-Chat-9B and 2B models, fine-tuned on the dataset, exhibit\nsuperior ability in following Darija instructions and performing standard NLP\ntasks. Notably, our models outperform both state-of-the-art and\nArabic-specialized LLMs like LLaMa, Jais, and AceGPT, e.g., achieving a 13%\nperformance boost over a larger 13B model on DarijaMMLU, in our newly\nintroduced evaluation suite for Darija covering both discriminative and\ngenerative tasks. Furthermore, we perform an experimental analysis of various\nfine-tuning strategies and base model choices to determine optimal\nconfigurations. All our resources are publicly accessible, and we believe our\nwork offers comprehensive design methodologies of instruction-tuning for\nlow-resource language variants, which are often neglected in favor of data-rich\nlanguages by contemporary LLMs.\n","authors":["Guokan Shang","Hadi Abdine","Yousef Khoubrane","Amr Mohamed","Yassine Abbahaddou","Sofiane Ennadir","Imane Momayiz","Xuguang Ren","Eric Moulines","Preslav Nakov","Michalis Vazirgiannis","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2409.17912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17899v1","updated":"2024-09-26T14:49:09Z","published":"2024-09-26T14:49:09Z","title":"Revisiting Acoustic Similarity in Emotional Speech and Music via\n Self-Supervised Representations","summary":" Emotion recognition from speech and music shares similarities due to their\nacoustic overlap, which has led to interest in transferring knowledge between\nthese domains. However, the shared acoustic cues between speech and music,\nparticularly those encoded by Self-Supervised Learning (SSL) models, remain\nlargely unexplored, given the fact that SSL models for speech and music have\nrarely been applied in cross-domain research. In this work, we revisit the\nacoustic similarity between emotion speech and music, starting with an analysis\nof the layerwise behavior of SSL models for Speech Emotion Recognition (SER)\nand Music Emotion Recognition (MER). Furthermore, we perform cross-domain\nadaptation by comparing several approaches in a two-stage fine-tuning process,\nexamining effective ways to utilize music for SER and speech for MER. Lastly,\nwe explore the acoustic similarities between emotional speech and music using\nFrechet audio distance for individual emotions, uncovering the issue of emotion\nbias in both speech and music SSL models. Our findings reveal that while speech\nand music SSL models do capture shared acoustic features, their behaviors can\nvary depending on different emotions due to their training strategies and\ndomain-specificities. Additionally, parameter-efficient fine-tuning can enhance\nSER and MER performance by leveraging knowledge from each other. This study\nprovides new insights into the acoustic similarity between emotional speech and\nmusic, and highlights the potential for cross-domain generalization to improve\nSER and MER systems.\n","authors":["Yujia Sun","Zeyu Zhao","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18789v2","updated":"2024-09-26T14:48:42Z","published":"2024-07-26T14:52:37Z","title":"Granularity is crucial when applying differential privacy to text: An\n investigation for neural machine translation","summary":" Applying differential privacy (DP) by means of the DP-SGD algorithm to\nprotect individual data points during training is becoming increasingly popular\nin NLP. However, the choice of granularity at which DP is applied is often\nneglected. For example, neural machine translation (NMT) typically operates on\nthe sentence-level granularity. From the perspective of DP, this setup assumes\nthat each sentence belongs to a single person and any two sentences in the\ntraining dataset are independent. This assumption is however violated in many\nreal-world NMT datasets, e.g., those including dialogues. For proper\napplication of DP we thus must shift from sentences to entire documents. In\nthis paper, we investigate NMT at both the sentence and document levels,\nanalyzing the privacy/utility trade-off for both scenarios, and evaluating the\nrisks of not using the appropriate privacy granularity in terms of leaking\npersonally identifiable information (PII). Our findings indicate that the\ndocument-level NMT system is more resistant to membership inference attacks,\nemphasizing the significance of using the appropriate granularity when working\nwith DP.\n","authors":["Doan Nam Long Vu","Timour Igamberdiev","Ivan Habernal"],"pdf_url":"https://arxiv.org/pdf/2407.18789v2.pdf","comment":"Accepted at EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2409.17892v1","updated":"2024-09-26T14:40:45Z","published":"2024-09-26T14:40:45Z","title":"EMMA-500: Enhancing Massively Multilingual Adaptation of Large Language\n Models","summary":" In this work, we introduce EMMA-500, a large-scale multilingual language\nmodel continue-trained on texts across 546 languages designed for enhanced\nmultilingual performance, focusing on improving language coverage for\nlow-resource languages. To facilitate continual pre-training, we compile the\nMaLA corpus, a comprehensive multilingual dataset enriched with curated\ndatasets across diverse domains. Leveraging this corpus, we conduct extensive\ncontinual pre-training of the Llama 2 7B model, resulting in EMMA-500, which\ndemonstrates robust performance across a wide collection of benchmarks,\nincluding a comprehensive set of multilingual tasks and PolyWrite, an\nopen-ended generation benchmark developed in this study. Our results highlight\nthe effectiveness of continual pre-training in expanding large language models'\nlanguage capacity, particularly for underrepresented languages, demonstrating\nsignificant gains in cross-lingual transfer, task generalization, and language\nadaptability.\n","authors":["Shaoxiong Ji","Zihao Li","Indraneil Paul","Jaakko Paavola","Peiqin Lin","Pinzhen Chen","Dayyán O'Brien","Hengyu Luo","Hinrich Schütze","Jörg Tiedemann","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2409.17892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09577v2","updated":"2024-09-26T14:34:53Z","published":"2024-04-15T08:38:43Z","title":"Transformers, Contextualism, and Polysemy","summary":" The transformer architecture, introduced by Vaswani et al. (2017), is at the\nheart of the remarkable recent progress in the development of language models,\nincluding widely-used chatbots such as Chat-GPT and Claude. In this paper, I\nargue that we can extract from the way the transformer architecture works a\ntheory of the relationship between context and meaning. I call this the\ntransformer theory, and I argue that it is novel with regard to two related\nphilosophical debates: the contextualism debate regarding the extent of\ncontext-sensitivity across natural language, and the polysemy debate regarding\nhow polysemy should be captured within an account of word meaning.\n","authors":["Jumbly Grindrod"],"pdf_url":"https://arxiv.org/pdf/2404.09577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17865v1","updated":"2024-09-26T14:15:54Z","published":"2024-09-26T14:15:54Z","title":"Implementing a Nordic-Baltic Federated Health Data Network: a case\n report","summary":" Background: Centralized collection and processing of healthcare data across\nnational borders pose significant challenges, including privacy concerns, data\nheterogeneity and legal barriers. To address some of these challenges, we\nformed an interdisciplinary consortium to develop a feder-ated health data\nnetwork, comprised of six institutions across five countries, to facilitate\nNordic-Baltic cooperation on secondary use of health data. The objective of\nthis report is to offer early insights into our experiences developing this\nnetwork. Methods: We used a mixed-method ap-proach, combining both experimental\ndesign and implementation science to evaluate the factors affecting the\nimplementation of our network. Results: Technically, our experiments indicate\nthat the network functions without significant performance degradation compared\nto centralized simu-lation. Conclusion: While use of interdisciplinary\napproaches holds a potential to solve challeng-es associated with establishing\nsuch collaborative networks, our findings turn the spotlight on the uncertain\nregulatory landscape playing catch up and the significant operational costs.\n","authors":["Taridzo Chomutare","Aleksandar Babic","Laura-Maria Peltonen","Silja Elunurm","Peter Lundberg","Arne Jönsson","Emma Eneling","Ciprian-Virgil Gerstenberger","Troels Siggaard","Raivo Kolde","Oskar Jerdhaf","Martin Hansson","Alexandra Makhlysheva","Miroslav Muzny","Erik Ylipää","Søren Brunak","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2409.17865v1.pdf","comment":"24 pages (including appendices), 1 figure"},{"id":"http://arxiv.org/abs/2409.17834v1","updated":"2024-09-26T13:36:00Z","published":"2024-09-26T13:36:00Z","title":"PEDRO: Parameter-Efficient Fine-tuning with Prompt DEpenDent\n Representation MOdification","summary":" Due to their substantial sizes, large language models (LLMs) are typically\ndeployed within a single-backbone multi-tenant framework. In this setup, a\nsingle instance of an LLM backbone must cater to multiple users or tasks\nthrough the application of various parameter-efficient fine-tuning (PEFT)\nmodels. Despite the availability of numerous effective PEFT techniques such as\nLoRA, there remains a need for a PEFT approach that achieves both high\nefficiency during inference and competitive performance on downstream tasks. In\nthis research, we introduce a new and straightforward PEFT methodology named\n\\underline{P}rompt D\\underline{E}pen\\underline{D}ent \\underline{R}epresentation\nM\\underline{O}dification (PEDRO). The proposed method involves integrating a\nlightweight vector generator into each Transformer layer, which generates\nvectors contingent upon the input prompts. These vectors then modify the hidden\nrepresentations created by the LLM through a dot product operation, thereby\ninfluencing the semantic output and generated content of the model. Extensive\nexperimentation across a variety of tasks indicates that: (a) PEDRO surpasses\nrecent PEFT benchmarks when using a similar number of tunable parameters. (b)\nUnder the single-backbone multi-tenant deployment model, PEDRO exhibits\nsuperior efficiency compared to LoRA, indicating significant industrial\npotential.\n","authors":["Tianfang Xie","Tianjing Li","Wei Zhu","Wei Han","Yi Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.17834v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.18203"},{"id":"http://arxiv.org/abs/2409.17827v1","updated":"2024-09-26T13:26:46Z","published":"2024-09-26T13:26:46Z","title":"BeanCounter: A low-toxicity, large-scale, and open dataset of\n business-oriented text","summary":" Many of the recent breakthroughs in language modeling have resulted from\nscaling effectively the same model architecture to larger datasets. In this\nvein, recent work has highlighted performance gains from increasing training\ndataset size and quality, suggesting a need for novel sources of large-scale\ndatasets. In this work, we introduce BeanCounter, a public dataset consisting\nof more than 159B tokens extracted from businesses' disclosures. We show that\nthis data is indeed novel: less than 0.1% of BeanCounter appears in Common\nCrawl-based datasets and it is an order of magnitude larger than datasets\nrelying on similar sources. Given the data's provenance, we hypothesize that\nBeanCounter is comparatively more factual and less toxic than web-based\ndatasets. Exploring this hypothesis, we find that many demographic identities\noccur with similar prevalence in BeanCounter but with significantly less toxic\ncontext relative to other datasets. To demonstrate the utility of BeanCounter,\nwe evaluate and compare two LLMs continually pre-trained on BeanCounter with\ntheir base models. We find an 18-33% reduction in toxic generation and improved\nperformance within the finance domain for the continually pretrained models.\nCollectively, our work suggests that BeanCounter is a novel source of\nlow-toxicity and high-quality domain-specific data with sufficient scale to\ntrain multi-billion parameter LLMs.\n","authors":["Siyan Wang","Bradford Levy"],"pdf_url":"https://arxiv.org/pdf/2409.17827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16934v2","updated":"2024-09-26T13:22:37Z","published":"2024-09-25T13:45:23Z","title":"Investigating OCR-Sensitive Neurons to Improve Entity Recognition in\n Historical Documents","summary":" This paper investigates the presence of OCR-sensitive neurons within the\nTransformer architecture and their influence on named entity recognition (NER)\nperformance on historical documents. By analysing neuron activation patterns in\nresponse to clean and noisy text inputs, we identify and then neutralise\nOCR-sensitive neurons to improve model performance. Based on two open access\nlarge language models (Llama2 and Mistral), experiments demonstrate the\nexistence of OCR-sensitive regions and show improvements in NER performance on\nhistorical newspapers and classical commentaries, highlighting the potential of\ntargeted neuron modulation to improve models' performance on noisy text.\n","authors":["Emanuela Boros","Maud Ehrmann"],"pdf_url":"https://arxiv.org/pdf/2409.16934v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17819v1","updated":"2024-09-26T13:15:18Z","published":"2024-09-26T13:15:18Z","title":"Inference-Time Language Model Alignment via Integrated Value Guidance","summary":" Large language models are typically fine-tuned to align with human\npreferences, but tuning large models is computationally intensive and complex.\nIn this work, we introduce $\\textit{Integrated Value Guidance}$ (IVG), a method\nthat uses implicit and explicit value functions to guide language model\ndecoding at token and chunk-level respectively, efficiently aligning large\nlanguage models purely at inference time. This approach circumvents the\ncomplexities of direct fine-tuning and outperforms traditional methods.\nEmpirically, we demonstrate the versatility of IVG across various tasks. In\ncontrolled sentiment generation and summarization tasks, our method\nsignificantly improves the alignment of large models using inference-time\nguidance from $\\texttt{gpt2}$-based value functions. Moreover, in a more\nchallenging instruction-following benchmark AlpacaEval 2.0, we show that both\nspecifically tuned and off-the-shelf value functions greatly improve the\nlength-controlled win rates of large models against $\\texttt{gpt-4-turbo}$\n(e.g., $19.51\\% \\rightarrow 26.51\\%$ for $\\texttt{Mistral-7B-Instruct-v0.2}$\nand $25.58\\% \\rightarrow 33.75\\%$ for $\\texttt{Mixtral-8x7B-Instruct-v0.1}$\nwith Tulu guidance).\n","authors":["Zhixuan Liu","Zhanhui Zhou","Yuanfu Wang","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2409.17819v1.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17791v1","updated":"2024-09-26T12:37:26Z","published":"2024-09-26T12:37:26Z","title":"Self-supervised Preference Optimization: Enhance Your Language Model\n with Preference Degree Awareness","summary":" Recently, there has been significant interest in replacing the reward model\nin Reinforcement Learning with Human Feedback (RLHF) methods for Large Language\nModels (LLMs), such as Direct Preference Optimization (DPO) and its variants.\nThese approaches commonly use a binary cross-entropy mechanism on pairwise\nsamples, i.e., minimizing and maximizing the loss based on preferred or\ndis-preferred responses, respectively. However, while this training strategy\nomits the reward model, it also overlooks the varying preference degrees within\ndifferent responses. We hypothesize that this is a key factor hindering LLMs\nfrom sufficiently understanding human preferences. To address this problem, we\npropose a novel Self-supervised Preference Optimization (SPO) framework, which\nconstructs a self-supervised preference degree loss combined with the alignment\nloss, thereby helping LLMs improve their ability to understand the degree of\npreference. Extensive experiments are conducted on two widely used datasets of\ndifferent tasks. The results demonstrate that SPO can be seamlessly integrated\nwith existing preference optimization methods and significantly boost their\nperformance to achieve state-of-the-art performance. We also conduct detailed\nanalyses to offer comprehensive insights into SPO, which verifies its\neffectiveness. The code is available at https://github.com/lijian16/SPO.\n","authors":["Jian Li","Haojing Huang","Yujia Zhang","Pengfei Xu","Xi Chen","Rui Song","Lida Shi","Jingwen Wang","Hao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17791v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2403.15676v4","updated":"2024-09-26T12:18:21Z","published":"2024-03-23T01:44:57Z","title":"AC4: Algebraic Computation Checker for Circuit Constraints in ZKPs","summary":" Zero-knowledge proof (ZKP) systems have surged attention and held a\nfundamental role in contemporary cryptography. Zero-knowledge succinct\nnon-interactive argument of knowledge (zk-SNARK) protocols dominate the ZKP\nusage, implemented through arithmetic circuit programming paradigm. However,\nunderconstrained or overconstrained circuits may lead to bugs. The former\nrefers to circuits that lack the necessary constraints, resulting in unexpected\nsolutions and causing the verifier to accept a bogus witness, and the latter\nrefers to circuits that are constrained excessively, resulting in lacking\nnecessary solutions and causing the verifier to accept no witness. This paper\nintroduces a novel approach for pinpointing two distinct types of bugs in ZKP\ncircuits. The method involves encoding the arithmetic circuit constraints to\npolynomial equation systems and solving them over finite fields by the computer\nalgebra system. The classification of verification results is refined, greatly\nenhancing the expressive power of the system. A tool, AC4, is proposed to\nrepresent the implementation of the method. Experiments show that AC4\ndemonstrates a increase in the checked ratio, showing a 29% improvement over\nPicus, a checker for Circom circuits, and a 10% improvement over\nhalo2-analyzer, a checker for halo2 circuits. Within a solvable range, the\nchecking time has also exhibited noticeable improvement, demonstrating a\nmagnitude increase compared to previous efforts.\n","authors":["Hao Chen","Guoqiang Li","Minyu Chen","Ruibang Liu","Sinka Gao"],"pdf_url":"https://arxiv.org/pdf/2403.15676v4.pdf","comment":"24 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17774v1","updated":"2024-09-26T12:11:28Z","published":"2024-09-26T12:11:28Z","title":"Faithfulness and the Notion of Adversarial Sensitivity in NLP\n Explanations","summary":" Faithfulness is arguably the most critical metric to assess the reliability\nof explainable AI. In NLP, current methods for faithfulness evaluation are\nfraught with discrepancies and biases, often failing to capture the true\nreasoning of models. We introduce Adversarial Sensitivity as a novel approach\nto faithfulness evaluation, focusing on the explainer's response when the model\nis under adversarial attack. Our method accounts for the faithfulness of\nexplainers by capturing sensitivity to adversarial input changes. This work\naddresses significant limitations in existing evaluation techniques, and\nfurthermore, quantifies faithfulness from a crucial yet underexplored paradigm.\n","authors":["Supriya Manna","Niladri Sett"],"pdf_url":"https://arxiv.org/pdf/2409.17774v1.pdf","comment":"Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP"},{"id":"http://arxiv.org/abs/2409.13832v2","updated":"2024-09-26T12:07:20Z","published":"2024-09-20T18:18:14Z","title":"GTSinger: A Global Multi-Technique Singing Corpus with Realistic Music\n Scores for All Singing Tasks","summary":" The scarcity of high-quality and multi-task singing datasets significantly\nhinders the development of diverse controllable and personalized singing tasks,\nas existing singing datasets suffer from low quality, limited diversity of\nlanguages and singers, absence of multi-technique information and realistic\nmusic scores, and poor task suitability. To tackle these problems, we present\nGTSinger, a large global, multi-technique, free-to-use, high-quality singing\ncorpus with realistic music scores, designed for all singing tasks, along with\nits benchmarks. Particularly, (1) we collect 80.59 hours of high-quality\nsinging voices, forming the largest recorded singing dataset; (2) 20\nprofessional singers across nine widely spoken languages offer diverse timbres\nand styles; (3) we provide controlled comparison and phoneme-level annotations\nof six commonly used singing techniques, helping technique modeling and\ncontrol; (4) GTSinger offers realistic music scores, assisting real-world\nmusical composition; (5) singing voices are accompanied by manual\nphoneme-to-audio alignments, global style labels, and 16.16 hours of paired\nspeech for various singing tasks. Moreover, to facilitate the use of GTSinger,\nwe conduct four benchmark experiments: technique-controllable singing voice\nsynthesis, technique recognition, style transfer, and speech-to-singing\nconversion. The corpus and demos can be found at http://gtsinger.github.io. We\nprovide the dataset and the code for processing data and conducting benchmarks\nat https://huggingface.co/datasets/GTSinger/GTSinger and\nhttps://github.com/GTSinger/GTSinger.\n","authors":["Yu Zhang","Changhao Pan","Wenxiang Guo","Ruiqi Li","Zhiyuan Zhu","Jialei Wang","Wenhao Xu","Jingyu Lu","Zhiqing Hong","Chuxin Wang","LiChao Zhang","Jinzheng He","Ziyue Jiang","Yuxin Chen","Chen Yang","Jiecheng Zhou","Xinyu Cheng","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.13832v2.pdf","comment":"Accepted by NeurIPS 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2409.17757v1","updated":"2024-09-26T11:46:58Z","published":"2024-09-26T11:46:58Z","title":"Integrating Hierarchical Semantic into Iterative Generation Model for\n Entailment Tree Explanation","summary":" Manifestly and logically displaying the line of reasoning from evidence to\nanswer is significant to explainable question answering (QA). The entailment\ntree exhibits the lines structurally, which is different from the\nself-explanation principle in large-scale language models. Existing methods\nrarely consider the semantic association of sentences between and within\nhierarchies within the tree structure, which is prone to apparent mistakes in\ncombinations. In this work, we propose an architecture of integrating the\nHierarchical Semantics of sentences under the framework of Controller-Generator\n(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between\nhypotheses and facts, discriminates the facts involved in tree constructions,\nand optimizes single-step entailments. To the best of our knowledge, We are the\nfirst to notice hierarchical semantics of sentences between the same layer and\nadjacent layers to yield improvements. The proposed method achieves comparable\nperformance on all three settings of the EntailmentBank dataset. The\ngeneralization results on two out-of-domain datasets also demonstrate the\neffectiveness of our method.\n","authors":["Qin Wang","Jianzhou Feng","Yiming Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04259v2","updated":"2024-09-26T11:42:35Z","published":"2024-08-08T06:57:49Z","title":"EfficientRAG: Efficient Retriever for Multi-Hop Question Answering","summary":" Retrieval-augmented generation (RAG) methods encounter difficulties when\naddressing complex questions like multi-hop queries. While iterative retrieval\nmethods improve performance by gathering additional information, current\napproaches often rely on multiple calls of large language models (LLMs). In\nthis paper, we introduce EfficientRAG, an efficient retriever for multi-hop\nquestion answering. EfficientRAG iteratively generates new queries without the\nneed for LLM calls at each iteration and filters out irrelevant information.\nExperimental results demonstrate that EfficientRAG surpasses existing RAG\nmethods on three open-domain multi-hop question-answering datasets.\n","authors":["Ziyuan Zhuang","Zhiyang Zhang","Sitao Cheng","Fangkai Yang","Jia Liu","Shujian Huang","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.04259v2.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17755v1","updated":"2024-09-26T11:40:07Z","published":"2024-09-26T11:40:07Z","title":"SECURE: Semantics-aware Embodied Conversation under Unawareness for\n Lifelong Robot Learning","summary":" This paper addresses a challenging interactive task learning scenario we call\nrearrangement under unawareness: to manipulate a rigid-body environment in a\ncontext where the robot is unaware of a concept that's key to solving the\ninstructed task. We propose SECURE, an interactive task learning framework\ndesigned to solve such problems by fixing a deficient domain model using\nembodied conversation. Through dialogue, the robot discovers and then learns to\nexploit unforeseen possibilities. Using SECURE, the robot not only learns from\nthe user's corrective feedback when it makes a mistake, but it also learns to\nmake strategic dialogue decisions for revealing useful evidence about novel\nconcepts for solving the instructed task. Together, these abilities allow the\nrobot to generalise to subsequent tasks using newly acquired knowledge. We\ndemonstrate that a robot that is semantics-aware -- that is, it exploits the\nlogical consequences of both sentence and discourse semantics in the learning\nand inference process -- learns to solve rearrangement under unawareness more\neffectively than a robot that lacks such capabilities.\n","authors":["Rimvydas Rubavicius","Peter David Fagan","Alex Lascarides","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2409.17755v1.pdf","comment":"10 pages,4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.17750v1","updated":"2024-09-26T11:31:18Z","published":"2024-09-26T11:31:18Z","title":"Are Transformers in Pre-trained LM A Good ASR Encoder? An Empirical\n Study","summary":" In this study, we delve into the efficacy of transformers within pre-trained\nlanguage models (PLMs) when repurposed as encoders for Automatic Speech\nRecognition (ASR). Our underlying hypothesis posits that, despite being\ninitially trained on text-based corpora, these transformers possess a\nremarkable capacity to extract effective features from the input sequence. This\ninherent capability, we argue, is transferrable to speech data, thereby\naugmenting the acoustic modeling ability of ASR. Through rigorous empirical\nanalysis, our findings reveal a notable improvement in Character Error Rate\n(CER) and Word Error Rate (WER) across diverse ASR tasks when transformers from\npre-trained LMs are incorporated. Particularly, they serve as an advantageous\nstarting point for initializing ASR encoders. Furthermore, we uncover that\nthese transformers, when integrated into a well-established ASR encoder, can\nsignificantly boost performance, especially in scenarios where profound\nsemantic comprehension is pivotal. This underscores the potential of leveraging\nthe semantic prowess embedded within pre-trained transformers to advance ASR\nsystems' capabilities.\n","authors":["Keyu An","Shiliang Zhang","Zhijie Yan"],"pdf_url":"https://arxiv.org/pdf/2409.17750v1.pdf","comment":"8pages"},{"id":"http://arxiv.org/abs/2402.12844v2","updated":"2024-09-26T11:29:04Z","published":"2024-02-20T09:13:15Z","title":"ICON: Improving Inter-Report Consistency in Radiology Report Generation\n via Lesion-aware Mixup Augmentation","summary":" Previous research on radiology report generation has made significant\nprogress in terms of increasing the clinical accuracy of generated reports. In\nthis paper, we emphasize another crucial quality that it should possess, i.e.,\ninter-report consistency, which refers to the capability of generating\nconsistent reports for semantically equivalent radiographs. This quality is\neven of greater significance than the overall report accuracy in terms of\nensuring the system's credibility, as a system prone to providing conflicting\nresults would severely erode users' trust. Regrettably, existing approaches\nstruggle to maintain inter-report consistency, exhibiting biases towards common\npatterns and susceptibility to lesion variants. To address this issue, we\npropose ICON, which improves the inter-report consistency of radiology report\ngeneration. Aiming to enhance the system's ability to capture similarities in\nsemantically equivalent lesions, our approach first involves extracting lesions\nfrom input images and examining their characteristics. Then, we introduce a\nlesion-aware mixup technique to ensure that the representations of the\nsemantically equivalent lesions align with the same attributes, achieved\nthrough a linear combination during the training phase. Extensive experiments\non three publicly available chest X-ray datasets verify the effectiveness of\nour approach, both in terms of improving the consistency and accuracy of the\ngenerated reports.\n","authors":["Wenjun Hou","Yi Cheng","Kaishuai Xu","Yan Hu","Wenjie Li","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2402.10712v3","updated":"2024-09-26T11:15:14Z","published":"2024-02-16T14:15:15Z","title":"An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient\n Language Model Inference","summary":" The development of state-of-the-art generative large language models (LLMs)\ndisproportionately relies on English-centric tokenizers, vocabulary and\npre-training data. Despite the fact that some LLMs have multilingual\ncapabilities, recent studies have shown that their inference efficiency\ndeteriorates when generating text in languages other than English. This results\nin increased inference time and costs. Cross-lingual vocabulary adaptation\n(CVA) methods have been proposed for adapting models to a target language\naiming to improve downstream performance. However, the effectiveness of these\nmethods on increasing inference efficiency of generative LLMs has yet to be\nexplored. In this paper, we perform an empirical study of five CVA methods on\nfour generative LLMs (including monolingual and multilingual models) across\nfour typologically-diverse languages and four natural language understanding\ntasks. We find that CVA substantially contributes to LLM inference speedups of\nup to 271.5\\%. We also show that adapting LLMs that have been pre-trained on\nmore balanced multilingual data results in downstream performance comparable to\nthe original models.\n","authors":["Atsuki Yamaguchi","Aline Villavicencio","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2402.10712v3.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2406.12442v2","updated":"2024-09-26T11:15:14Z","published":"2024-06-18T09:46:44Z","title":"Abstraction-of-Thought Makes Language Models Better Reasoners","summary":" Abstract reasoning, the ability to reason from the abstract essence of a\nproblem, serves as a key to generalization in human reasoning. However,\neliciting language models to perform reasoning with abstraction remains\nunexplored. This paper seeks to bridge this gap by introducing a novel\nstructured reasoning format called Abstraction-of-Thought (AoT). The uniqueness\nof AoT lies in its explicit requirement for varying levels of abstraction\nwithin the reasoning process. This approach could elicit language models to\nfirst contemplate on the abstract level before incorporating concrete details,\nwhich is overlooked by the prevailing step-by-step Chain-of-Thought (CoT)\nmethod. To align models with the AoT format, we present AoT Collection, a\ngeneric finetuning dataset consisting of 348k high-quality samples with AoT\nreasoning processes, collected via an automated and scalable pipeline. We\nfinetune a wide range of language models with AoT Collection and conduct\nextensive evaluations on 23 unseen tasks from the challenging benchmark\nBig-Bench Hard. Experimental results indicate that models aligned to AoT\nreasoning format substantially outperform those aligned to CoT in many\nreasoning tasks.\n","authors":["Ruixin Hong","Hongming Zhang","Xiaoman Pan","Dong Yu","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.12442v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2405.14722v2","updated":"2024-09-26T10:23:33Z","published":"2024-05-23T15:51:24Z","title":"DAPE: Data-Adaptive Positional Encoding for Length Extrapolation","summary":" Positional encoding plays a crucial role in transformers, significantly\nimpacting model performance and length generalization. Prior research has\nintroduced absolute positional encoding (APE) and relative positional encoding\n(RPE) to distinguish token positions in given sequences. However, both APE and\nRPE remain fixed after model training regardless of input data, limiting their\nadaptability and flexibility. Hence, we expect that the desired positional\nencoding should be data-adaptive and can be dynamically adjusted with the given\nattention. In this paper, we propose a Data-Adaptive Positional Encoding (DAPE)\nmethod, which dynamically and semantically adjusts based on input context and\nlearned fixed priors. Experimental validation on real-world datasets (Arxiv,\nBooks3, and CHE) demonstrates that DAPE enhances model performances in terms of\ntrained length and length generalization, where the improvements are\nstatistically significant. The model visualization suggests that our model can\nkeep both local and anti-local information. Finally, we successfully train the\nmodel on sequence length 128 and achieve better performance at evaluation\nsequence length 8192, compared with other static positional encoding methods,\nrevealing the benefit of the adaptive positional encoding method.\n","authors":["Chuanyang Zheng","Yihang Gao","Han Shi","Minbin Huang","Jingyao Li","Jing Xiong","Xiaozhe Ren","Michael Ng","Xin Jiang","Zhenguo Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2405.14722v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.14788v2","updated":"2024-09-26T10:21:33Z","published":"2024-07-20T07:39:07Z","title":"On the Design and Analysis of LLM-Based Algorithms","summary":" We initiate a formal investigation into the design and analysis of LLM-based\nalgorithms, i.e. algorithms that contain one or multiple calls of large\nlanguage models (LLMs) as sub-routines and critically rely on the capabilities\nof LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt\nengineering to complicated LLM-powered agent systems and compound AI systems,\nhave achieved remarkable empirical success, the design and optimization of them\nhave mostly relied on heuristics and trial-and-errors, which is largely due to\na lack of formal and analytical study for these algorithms. To fill this gap,\nwe start by identifying the computational-graph representation of LLM-based\nalgorithms, the design principle of task decomposition, and some key\nabstractions, which then facilitate our formal analysis for the accuracy and\nefficiency of LLM-based algorithms, despite the black-box nature of LLMs.\nThrough extensive analytical and empirical investigation in a series of case\nstudies, we demonstrate that the proposed framework is broadly applicable to a\nwide range of scenarios and diverse patterns of LLM-based algorithms, such as\nparallel, hierarchical and recursive task decomposition. Our proposed framework\nholds promise for advancing LLM-based algorithms, by revealing the reasons\nbehind curious empirical phenomena, guiding the choices of hyperparameters,\npredicting the empirical performance of algorithms, and inspiring new algorithm\ndesign. To promote further study of LLM-based algorithms, we release our source\ncode at\nhttps://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm.\n","authors":["Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.14788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01432v3","updated":"2024-09-26T10:03:41Z","published":"2024-03-03T08:07:55Z","title":"Fine Tuning vs. Retrieval Augmented Generation for Less Popular\n Knowledge","summary":" Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting\nstrong performance across diverse tasks and domains. However, it has been\nobserved that the performance diminishes when dealing with less-popular or\nlow-frequency concepts and entities, for example in domain specific\napplications. The two prominent approaches to enhance the performance of LMs on\nlow-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning\n(FT) over synthetic data. This paper explores and evaluates the impact of RAG\nand FT on customizing LMs in handling low-frequency entities on question\nanswering tasks. We conduct extensive experiments on twelve LMs of varying size\nand type and different fine tuning, data augmentation, and retrieval models.\nOur findings indicate that while FT boosts the performance across entities of\nvarying popularity, RAG surpasses FT by a large margin particularly for least\npopular factual knowledge. Additionally, the success of both RAG and FT\napproaches is amplified by improving retrieval and data augmentation\ntechniques. Fine tuning, while beneficial for small LMs, requires extensive\nresources. To address this issue, we propose the new Stimulus RAG approach that\nsurpasses the effectiveness of fine tuning based approaches, thereby\neliminating the need for the costly data augmentation and fine tuning step for\nenriching LMs with less popular factual knowledge.\n","authors":["Heydar Soudani","Evangelos Kanoulas","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2403.01432v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14374v2","updated":"2024-09-26T10:01:17Z","published":"2024-09-22T09:33:54Z","title":"J2N -- Nominal Adjective Identification and its Application","summary":" This paper explores the challenges posed by nominal adjectives (NAs) in\nnatural language processing (NLP) tasks, particularly in part-of-speech (POS)\ntagging. We propose treating NAs as a distinct POS tag, \"JN,\" and investigate\nits impact on POS tagging, BIO chunking, and coreference resolution. Our study\nshows that reclassifying NAs can improve the accuracy of syntactic analysis and\nstructural understanding in NLP. We present experimental results using Hidden\nMarkov Models (HMMs), Maximum Entropy (MaxEnt) models, and Spacy, demonstrating\nthe feasibility and potential benefits of this approach. Additionally we\ntrained a bert model to identify the NA in untagged text.\n","authors":["Lemeng Qi","Yang Han","Zhuotong Xie"],"pdf_url":"https://arxiv.org/pdf/2409.14374v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17692v1","updated":"2024-09-26T09:57:16Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v1.pdf","comment":"Technical Report. Codes and models will be available soon"},{"id":"http://arxiv.org/abs/2404.00459v2","updated":"2024-09-26T09:54:57Z","published":"2024-03-30T19:46:59Z","title":"NumeroLogic: Number Encoding for Enhanced LLMs' Numerical Reasoning","summary":" Language models struggle with handling numerical data and performing\narithmetic operations. We hypothesize that this limitation can be partially\nattributed to non-intuitive textual numbers representation. When a digit is\nread or generated by a causal language model it does not know its place value\n(e.g. thousands vs. hundreds) until the entire number is processed. To address\nthis issue, we propose a simple adjustment to how numbers are represented by\nincluding the count of digits before each number. For instance, instead of\n\"42\", we suggest using \"{2:42}\" as the new format. This approach, which we term\nNumeroLogic, offers an added advantage in number generation by serving as a\nChain of Thought (CoT). By requiring the model to consider the number of digits\nfirst, it enhances the reasoning process before generating the actual number.\nWe use arithmetic tasks to demonstrate the effectiveness of the NumeroLogic\nformatting. We further demonstrate NumeroLogic applicability to general natural\nlanguage modeling, improving language understanding performance in the MMLU\nbenchmark.\n","authors":["Eli Schwartz","Leshem Choshen","Joseph Shtok","Sivan Doveh","Leonid Karlinsky","Assaf Arbelle"],"pdf_url":"https://arxiv.org/pdf/2404.00459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06802v2","updated":"2024-09-26T09:52:20Z","published":"2024-05-10T20:29:25Z","title":"Leveraging summary of radiology reports with transformers","summary":" Two fundamental problems in health-care stem from patient handoff and triage.\nDoctors are often required to perform complex findings summarization to\nfacilitate efficient communication with specialists and decision making on the\nurgency of each case. To address these challenges, we present a state of the\nart radiology report summarization model utilizing adjusted bidirectional\nencoder representation from transformers BERTtoBERT encoder and decoder\narchitecture. We also provide a data processing pipeline for future models\ndeveloped on the the MIMIC CXR dataset. Our approach includes a novel method\nfor augmenting medical data and a comprehensive performance analysis. Our best\nperforming model achieved a recall oriented understudy for gisting evaluation L\nF1 score of 58.75/100, outperforming specialized checkpoints with more\nsophisticated attention mechanisms. We also provide a data processing pipeline\nfor future models developed on the MIMIC chest X-ray dataset. The model\nintroduced in this paper demonstrates significantly improved capacity in\nradiology report summarization, highlighting the potential for ensuring better\nclinical workflows and enhanced patient care.\n","authors":["Raul Salles de Padua","Imran Qureshi"],"pdf_url":"https://arxiv.org/pdf/2405.06802v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17683v1","updated":"2024-09-26T09:49:27Z","published":"2024-09-26T09:49:27Z","title":"Zero- and Few-shot Named Entity Recognition and Text Expansion in\n Medication Prescriptions using ChatGPT","summary":" Introduction: Medication prescriptions are often in free text and include a\nmix of two languages, local brand names, and a wide range of idiosyncratic\nformats and abbreviations. Large language models (LLMs) have shown promising\nability to generate text in response to input prompts. We use ChatGPT 3.5 to\nautomatically structure and expand medication statements in discharge summaries\nand thus make them easier to interpret for people and machines. Methods:\nNamed-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and\nfew-shot setting with different prompt strategies. 100 medication statements\nwere manually annotated and curated. NER performance was measured by using\nstrict and partial matching. For the task EX, two experts interpreted the\nresults by assessing semantic equivalence between original and expanded\nstatements. The model performance was measured by precision, recall, and F1\nscore. Results: For NER, the best-performing prompt reached an average F1 score\nof 0.94 in the test set. For EX, the few-shot prompt showed superior\nperformance among other prompts, with an average F1 score of 0.87. Conclusion:\nOur study demonstrates good performance for NER and EX tasks in free-text\nmedication statements using ChatGPT. Compared to a zero-shot baseline, a\nfew-shot approach prevented the system from hallucinating, which would be\nunacceptable when processing safety-relevant medication data.\n","authors":["Natthanaphop Isaradech","Andrea Riedel","Wachiranun Sirikul","Markus Kreuzthaler","Stefan Schulz"],"pdf_url":"https://arxiv.org/pdf/2409.17683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13167v2","updated":"2024-09-26T09:42:48Z","published":"2024-06-19T02:46:18Z","title":"QRMeM: Unleash the Length Limitation through Question then Reflection\n Memory Mechanism","summary":" While large language models (LLMs) have made notable advancements in natural\nlanguage processing, they continue to struggle with processing extensive text.\nMemory mechanism offers a flexible solution for managing long contexts,\nutilizing techniques such as compression, summarization, and structuring to\nfacilitate nuanced and efficient handling of large volumes of text. However,\nexisting techniques face challenges with static knowledge integration, leading\nto insufficient adaptation to task-specific needs and missing\nmulti-segmentation relationships, which hinders the dynamic reorganization and\nlogical combination of relevant segments during the response process. To\naddress these issues, we introduce a novel strategy, Question then Reflection\nMemory Mechanism (QRMeM), incorporating a dual-structured memory pool. This\npool synergizes static textual content with structured graph guidance,\nfostering a reflective trial-and-error approach for navigating and identifying\nrelevant segments. Our evaluation across multiple-choice questions (MCQ) and\nmulti-document question answering (Multi-doc QA) benchmarks showcases QRMeM\nenhanced performance compared to existing approaches.\n","authors":["Bo Wang","Heyan Huang","Yixin Cao","Jiahao Ying","Wei Tang","Chong Feng"],"pdf_url":"https://arxiv.org/pdf/2406.13167v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17673v1","updated":"2024-09-26T09:32:12Z","published":"2024-09-26T09:32:12Z","title":"Cross-lingual Human-Preference Alignment for Neural Machine Translation\n with Direct Quality Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) and derivative techniques\nlike Direct Preference Optimization (DPO) are task-alignment algorithms used to\nrepurpose general, foundational models for specific tasks. We show that\napplying task-alignment to neural machine translation (NMT) addresses an\nexisting task--data mismatch in NMT, leading to improvements across all\nlanguages of a multilingual model, even when task-alignment is only applied to\na subset of those languages. We do so by introducing Direct Quality\nOptimization (DQO), a variant of DPO leveraging a pre-trained translation\nquality estimation model as a proxy for human preferences, and verify the\nimprovements with both automatic metrics and human evaluation.\n","authors":["Kaden Uhlig","Joern Wuebker","Raphael Reinauer","John DeNero"],"pdf_url":"https://arxiv.org/pdf/2409.17673v1.pdf","comment":"17 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.09486v2","updated":"2024-09-26T09:31:48Z","published":"2024-04-15T06:15:46Z","title":"MMCode: Benchmarking Multimodal Large Language Models for Code\n Generation with Visually Rich Programming Problems","summary":" Programming often involves converting detailed and complex specifications\ninto code, a process during which developers typically utilize visual aids to\nmore effectively convey concepts. While recent developments in Large Multimodal\nModels have demonstrated remarkable abilities in visual reasoning and\nmathematical tasks, there is little work on investigating whether these models\ncan effectively interpret visual elements for code generation. To this end, we\npresent MMCode, the first multi-modal coding dataset for evaluating algorithmic\nproblem-solving skills in visually rich contexts. MMCode contains 3,548\nquestions and 6,620 images collected from real-world programming challenges\nharvested from 10 code competition websites, presenting significant challenges\ndue to the extreme demand for reasoning abilities. Our experiment results show\nthat current state-of-the-art models struggle to solve these problems. The\nresults highlight the lack of powerful vision-code models, and we hope MMCode\ncan serve as an inspiration for future works in this domain. The data and code\nare publicly available at https://github.com/likaixin2000/MMCode.\n","authors":["Kaixin Li","Yuchen Tian","Qisheng Hu","Ziyang Luo","Zhiyong Huang","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09486v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2407.16693v2","updated":"2024-09-26T09:27:30Z","published":"2024-07-23T17:56:32Z","title":"Explanation Regularisation through the Lens of Attributions","summary":" Explanation regularisation (ER) has been introduced as a way to guide text\nclassifiers to form their predictions relying on input tokens that humans\nconsider plausible. This is achieved by introducing an auxiliary explanation\nloss that measures how well the output of an input attribution technique for\nthe model agrees with human-annotated rationales. The guidance appears to\nbenefit performance in out-of-domain (OOD) settings, presumably due to an\nincreased reliance on \"plausible\" tokens. However, previous work has\nunder-explored the impact of guidance on that reliance, particularly when\nreliance is measured using attribution techniques different from those used to\nguide the model. In this work, we seek to close this gap, and also explore the\nrelationship between reliance on plausible features and OOD performance. We\nfind that the connection between ER and the ability of a classifier to rely on\nplausible features has been overstated and that a stronger reliance on\nplausible tokens does not seem to be the cause for OOD improvements.\n","authors":["Pedro Ferreira","Ivan Titov","Wilker Aziz"],"pdf_url":"https://arxiv.org/pdf/2407.16693v2.pdf","comment":"22 pages, 14 figures, 9 tables"},{"id":"http://arxiv.org/abs/2404.12753v2","updated":"2024-09-26T09:17:10Z","published":"2024-04-19T09:59:44Z","title":"AutoScraper: A Progressive Understanding Web Agent for Web Scraper\n Generation","summary":" Web scraping is a powerful technique that extracts data from websites,\nenabling automated data collection, enhancing data analysis capabilities, and\nminimizing manual data entry efforts. Existing methods, wrappers-based methods\nsuffer from limited adaptability and scalability when faced with a new website,\nwhile language agents, empowered by large language models (LLMs), exhibit poor\nreusability in diverse web environments. In this work, we introduce the\nparadigm of generating web scrapers with LLMs and propose AutoScraper, a\ntwo-stage framework that can handle diverse and changing web environments more\nefficiently. AutoScraper leverages the hierarchical structure of HTML and\nsimilarity across different web pages for generating web scrapers. Besides, we\npropose a new executability metric for better measuring the performance of web\nscraper generation tasks. We conduct comprehensive experiments with multiple\nLLMs and demonstrate the effectiveness of our framework. Resources of this\npaper can be found at \\url{https://github.com/EZ-hwh/AutoScraper}\n","authors":["Wenhao Huang","Zhouhong Gu","Chenghao Peng","Zhixu Li","Jiaqing Liang","Yanghua Xiao","Liqian Wen","Zulong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.12753v2.pdf","comment":"19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17650v1","updated":"2024-09-26T08:56:54Z","published":"2024-09-26T08:56:54Z","title":"Digital Twin Ecosystem for Oncology Clinical Operations","summary":" Artificial Intelligence (AI) and Large Language Models (LLMs) hold\nsignificant promise in revolutionizing healthcare, especially in clinical\napplications. Simultaneously, Digital Twin technology, which models and\nsimulates complex systems, has gained traction in enhancing patient care.\nHowever, despite the advances in experimental clinical settings, the potential\nof AI and digital twins to streamline clinical operations remains largely\nuntapped. This paper introduces a novel digital twin framework specifically\ndesigned to enhance oncology clinical operations. We propose the integration of\nmultiple specialized digital twins, such as the Medical Necessity Twin, Care\nNavigator Twin, and Clinical History Twin, to enhance workflow efficiency and\npersonalize care for each patient based on their unique data. Furthermore, by\nsynthesizing multiple data sources and aligning them with the National\nComprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care\nPath, a continuously evolving knowledge base that enables these digital twins\nto provide precise, tailored clinical recommendations.\n","authors":["Himanshu Pandey","Akhil Amod"," Shivang","Kshitij Jaggi","Ruchi Garg","Abheet Jain","Vinayak Tantia"],"pdf_url":"https://arxiv.org/pdf/2409.17650v1.pdf","comment":"Pre Print"},{"id":"http://arxiv.org/abs/2409.17648v1","updated":"2024-09-26T08:55:21Z","published":"2024-09-26T08:55:21Z","title":"Efficient In-Domain Question Answering for Resource-Constrained\n Environments","summary":" Retrieval Augmented Generation (RAG) is a common method for integrating\nexternal knowledge into pretrained Large Language Models (LLMs) to enhance\naccuracy and relevancy in question answering (QA) tasks. However, prompt\nengineering and resource efficiency remain significant bottlenecks in\ndeveloping optimal and robust RAG solutions for real-world QA applications.\nRecent studies have shown success in using fine tuning to address these\nproblems; in particular, Retrieval Augmented Fine Tuning (RAFT) applied to\nsmaller 7B models has demonstrated superior performance compared to RAG setups\nwith much larger models such as GPT-3.5. The combination of RAFT with\nparameter-efficient fine tuning (PEFT) techniques, such as Low-Rank Adaptation\n(LoRA), promises an even more efficient solution, yet remains an unexplored\narea. In this work, we combine RAFT with LoRA to reduce fine tuning and storage\nrequirements and gain faster inference times while maintaining comparable RAG\nperformance. This results in a more compute-efficient RAFT, or CRAFT, which is\nparticularly useful for knowledge-intensive QA tasks in resource-constrained\nenvironments where internet access may be restricted and hardware resources\nlimited.\n","authors":["Isaac Chung","Phat Vo","Arman Kizilkale","Aaron Reite"],"pdf_url":"https://arxiv.org/pdf/2409.17648v1.pdf","comment":"6 pages, 2 tables"},{"id":"http://arxiv.org/abs/2405.16908v2","updated":"2024-09-26T08:53:01Z","published":"2024-05-27T07:56:23Z","title":"Can Large Language Models Faithfully Express Their Intrinsic Uncertainty\n in Words?","summary":" We posit that large language models (LLMs) should be capable of expressing\ntheir intrinsic uncertainty in natural language. For example, if the LLM is\nequally likely to output two contradicting answers to the same question, then\nits generated response should reflect this uncertainty by hedging its answer\n(e.g., \"I'm not sure, but I think...\"). We formalize faithful response\nuncertainty based on the gap between the model's intrinsic confidence in the\nassertions it makes and the decisiveness by which they are conveyed. This\nexample-level metric reliably indicates whether the model reflects its\nuncertainty, as it penalizes both excessive and insufficient hedging. We\nevaluate a variety of aligned LLMs at faithfully communicating uncertainty on\nseveral knowledge-intensive question answering tasks. Our results provide\nstrong evidence that modern LLMs are poor at faithfully conveying their\nuncertainty, and that better alignment is necessary to improve their\ntrustworthiness.\n","authors":["Gal Yona","Roee Aharoni","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2405.16908v2.pdf","comment":"To appear in EMNLP 2024 (main conference)"},{"id":"http://arxiv.org/abs/2408.10902v2","updated":"2024-09-26T08:47:36Z","published":"2024-08-20T14:45:23Z","title":"Soda-Eval: Open-Domain Dialogue Evaluation in the age of LLMs","summary":" Although human evaluation remains the gold standard for open-domain dialogue\nevaluation, the growing popularity of automated evaluation using Large Language\nModels (LLMs) has also extended to dialogue. However, most frameworks leverage\nbenchmarks that assess older chatbots on aspects such as fluency and relevance,\nwhich are not reflective of the challenges associated with contemporary models.\nIn fact, a qualitative analysis on Soda, a GPT-3.5 generated dialogue dataset,\nsuggests that current chatbots may exhibit several recurring issues related to\ncoherence and commonsense knowledge, but generally produce highly fluent and\nrelevant responses.\n Noting the aforementioned limitations, this paper introduces Soda-Eval, an\nannotated dataset based on Soda that covers over 120K turn-level assessments\nacross 10K dialogues, where the annotations were generated by GPT-4. Using\nSoda-Eval as a benchmark, we then study the performance of several open-access\ninstruction-tuned LLMs, finding that dialogue evaluation remains challenging.\nFine-tuning these models improves performance over few-shot inferences, both in\nterms of correlation and explanation.\n","authors":["John Mendonça","Isabel Trancoso","Alon Lavie"],"pdf_url":"https://arxiv.org/pdf/2408.10902v2.pdf","comment":"Accepted to EMNLP2024 (findings)"},{"id":"http://arxiv.org/abs/2409.17640v1","updated":"2024-09-26T08:44:38Z","published":"2024-09-26T08:44:38Z","title":"T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training\n on an Assistant Task for a Target Task","summary":" Long text summarization, gradually being essential for efficiently processing\nlarge volumes of information, stays challenging for Large Language Models\n(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced\ntraining datasets and the high requirement of contextual details dealing. To\naddress the issue, we design a novel zero-shot transfer learning framework,\nabbreviated as T3, to iteratively training a baseline LLM on an assistant task\nfor the target task, where the former should own richer data resources and\nshare structural or semantic similarity with the latter. In practice, T3 is\napproached to deal with the long text summarization task by utilizing question\nanswering as the assistant task, and further validated its effectiveness on the\nBBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14%\nimprovement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore\ncompared to three baseline LLMs, demonstrating its potential for more\nassistant-target task combinations.\n","authors":["Xindi Tong","Yujin Zhu","Shijian Fan","Liang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09802v2","updated":"2024-09-26T08:15:50Z","published":"2023-11-16T11:26:21Z","title":"Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs","summary":" Two lines of approaches are adopted for complex reasoning with LLMs. One line\nof work prompts LLMs with various reasoning structures, while the structural\noutputs can be naturally regarded as intermediate reasoning steps. Another line\nof work adopt LLM-free declarative solvers to do the reasoning task, rendering\nhigher reasoning accuracy but lacking interpretability due to the black-box\nnature of the solvers. Aiming to resolve the trade-off between answer accuracy\nand interpretability, we present a simple extension to the latter line of work.\nSpecifically, we showcase that the intermediate search logs generated by Prolog\ninterpreters can be accessed and interpreted into human-readable reasoning\nproofs. As long as LLMs correctly translate problem descriptions into Prolog\nrepresentations, the corresponding reasoning proofs are ensured to be causal\nand reliable. On two logical reasoning and one arithmetic reasoning datasets,\nour framework obtains significant improvements in terms of both answer accuracy\nand reasoning proof accuracy. Our code is released at\nhttps://github.com/DAMO-NLP-SG/CaRing\n","authors":["Sen Yang","Xin Li","Leyang Cui","Lidong Bing","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2311.09802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15254v3","updated":"2024-09-26T08:01:39Z","published":"2024-09-23T17:53:42Z","title":"Archon: An Architecture Search Framework for Inference-Time Techniques","summary":" Inference-time techniques are emerging as highly effective tools to increase\nlarge language model (LLM) capabilities. However, there is still limited\nunderstanding of the best practices for developing systems that combine\ninference-time techniques with one or more LLMs, with challenges including: (1)\neffectively allocating inference compute budget, (2) understanding the\ninteractions between different combinations of inference-time techniques and\ntheir impact on downstream performance, and 3) efficiently searching over the\nlarge space of model choices, inference-time techniques, and their\ncompositions. To address these challenges, we introduce Archon, an automated\nframework for designing inference-time architectures. Archon defines an\nextensible design space, encompassing methods such as generation ensembling,\nmulti-sampling, ranking, fusion, critiquing, verification, and unit testing. It\nthen transforms the problem of selecting and combining LLMs and inference-time\ntechniques into a hyperparameter optimization objective. To optimize this\nobjective, we introduce automated Inference-Time Architecture Search (ITAS)\nalgorithms. Given target benchmark(s), an inference compute budget, and\navailable LLMs, ITAS outputs optimized architectures. We evaluate Archon\narchitectures across a wide range of instruction-following and reasoning\nbenchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval,\nMixEval Hard, MATH, and CodeContests. We show that automatically designed\ninference-time architectures by Archon outperform strong models such as GPT-4o\nand Claude 3.5 Sonnet on these benchmarks, achieving an average increase of\n15.1 and 11.2 percentage points with all-source models and open-source models,\nrespectively. We make our code and datasets available publicly on Github:\nhttps://github.com/ScalingIntelligence/Archon.\n","authors":["Jon Saad-Falcon","Adrian Gamarra Lafuente","Shlok Natarajan","Nahum Maru","Hristo Todorov","Etash Guha","E. Kelly Buchanan","Mayee Chen","Neel Guha","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2409.15254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17610v1","updated":"2024-09-26T07:55:57Z","published":"2024-09-26T07:55:57Z","title":"ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context\n Information in Multi-Turn Multimodal Medical Dialogue","summary":" The rocketing prosperity of large language models (LLMs) in recent years has\nboosted the prevalence of vision-language models (VLMs) in the medical sector.\nIn our online medical consultation scenario, a doctor responds to the texts and\nimages provided by a patient in multiple rounds to diagnose her/his health\ncondition, forming a multi-turn multimodal medical dialogue format. Unlike\nhigh-quality images captured by professional equipment in traditional medical\nvisual question answering (Med-VQA), the images in our case are taken by\npatients' mobile phones. These images have poor quality control, with issues\nsuch as excessive background elements and the lesion area being significantly\noff-center, leading to degradation of vision-language alignment in the model\ntraining phase. In this paper, we propose ZALM3, a Zero-shot strategy to\nimprove vision-language ALignment in Multi-turn Multimodal Medical dialogue.\nSince we observe that the preceding text conversations before an image can\ninfer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to\nsummarize the keywords from the preceding context and a visual grounding model\nto extract the RoIs. The updated images eliminate unnecessary background noise\nand provide more effective vision-language alignment. To better evaluate our\nproposed method, we design a new subjective assessment metric for multi-turn\nunimodal/multimodal medical dialogue to provide a fine-grained performance\ncomparison. Our experiments across three different clinical departments\nremarkably demonstrate the efficacy of ZALM3 with statistical significance.\n","authors":["Zhangpu Li","Changhong Zou","Suxue Ma","Zhicheng Yang","Chen Du","Youbao Tang","Zhenjie Cao","Ning Zhang","Jui-Hsin Lai","Ruei-Sung Lin","Yuan Ni","Xingzhi Sun","Jing Xiao","Kai Zhang","Mei Han"],"pdf_url":"https://arxiv.org/pdf/2409.17610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16341v2","updated":"2024-09-26T07:54:10Z","published":"2024-09-24T17:20:02Z","title":"Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs","summary":" Training large language models (LLMs) for external tool usage is a rapidly\nexpanding field, with recent research focusing on generating synthetic data to\naddress the shortage of available data. However, the absence of systematic data\nquality checks poses complications for properly training and testing models. To\nthat end, we propose two approaches for assessing the reliability of data for\ntraining LLMs to use external tools. The first approach uses intuitive,\nhuman-defined correctness criteria. The second approach uses a model-driven\nassessment with in-context evaluation. We conduct a thorough evaluation of data\nquality on two popular benchmarks, followed by an extrinsic evaluation that\nshowcases the impact of data quality on model performance. Our results\ndemonstrate that models trained on high-quality data outperform those trained\non unvalidated data, even when trained with a smaller quantity of data. These\nfindings empirically support the significance of assessing and ensuring the\nreliability of training data for tool-using LLMs.\n","authors":["Shadi Iskander","Nachshon Cohen","Zohar Karnin","Ori Shapira","Sofia Tolmach"],"pdf_url":"https://arxiv.org/pdf/2409.16341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17603v1","updated":"2024-09-26T07:40:03Z","published":"2024-09-26T07:40:03Z","title":"Deep CLAS: Deep Contextual Listen, Attend and Spell","summary":" Contextual-LAS (CLAS) has been shown effective in improving Automatic Speech\nRecognition (ASR) of rare words. It relies on phrase-level contextual modeling\nand attention-based relevance scoring without explicit contextual constraint\nwhich lead to insufficient use of contextual information. In this work, we\npropose deep CLAS to use contextual information better. We introduce bias loss\nforcing model to focus on contextual information. The query of bias attention\nis also enriched to improve the accuracy of the bias attention score. To get\nfine-grained contextual information, we replace phrase-level encoding with\ncharacter-level encoding and encode contextual information with conformer\nrather than LSTM. Moreover, we directly use the bias attention score to correct\nthe output probability distribution of the model. Experiments using the public\nAISHELL-1 and AISHELL-NER. On AISHELL-1, compared to CLAS baselines, deep CLAS\nobtains a 65.78% relative recall and a 53.49% relative F1-score increase in the\nnamed entity recognition scene.\n","authors":["Shifu Xiong","Mengzhi Wang","Genshun Wan","Hang Chen","Jianqing Gao","Lirong Dai"],"pdf_url":"https://arxiv.org/pdf/2409.17603v1.pdf","comment":"Accepted by NCMMSC 2022"},{"id":"http://arxiv.org/abs/2409.17588v1","updated":"2024-09-26T07:07:14Z","published":"2024-09-26T07:07:14Z","title":"DualCoTs: Dual Chain-of-Thoughts Prompting for Sentiment Lexicon\n Expansion of Idioms","summary":" Idioms represent a ubiquitous vehicle for conveying sentiments in the realm\nof everyday discourse, rendering the nuanced analysis of idiom sentiment\ncrucial for a comprehensive understanding of emotional expression within\nreal-world texts. Nevertheless, the existing corpora dedicated to idiom\nsentiment analysis considerably limit research in text sentiment analysis. In\nthis paper, we propose an innovative approach to automatically expand the\nsentiment lexicon for idioms, leveraging the capabilities of large language\nmodels through the application of Chain-of-Thought prompting. To demonstrate\nthe effectiveness of this approach, we integrate multiple existing resources\nand construct an emotional idiom lexicon expansion dataset (called EmoIdiomE),\nwhich encompasses a comprehensive repository of Chinese and English idioms.\nThen we designed the Dual Chain-of-Thoughts (DualCoTs) method, which combines\ninsights from linguistics and psycholinguistics, to demonstrate the\neffectiveness of using large models to automatically expand the sentiment\nlexicon for idioms. Experiments show that DualCoTs is effective in idioms\nsentiment lexicon expansion in both Chinese and English. For reproducibility,\nwe will release the data and code upon acceptance.\n","authors":["Fuqiang Niu","Minghuan Tan","Bowen Zhang","Min Yang","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10267v2","updated":"2024-09-26T06:57:27Z","published":"2024-06-11T09:24:18Z","title":"Unused information in token probability distribution of generative LLM:\n improving LLM reading comprehension through calculation of expected values","summary":" LLM text decoding is key component for perceived LLM quality. We demonstrate\ntwo experiments showing that decoding methods could be improved by manipulation\nof token probabilities. First, we test few LLM on SummEval summary scoring\ndataset, to measure reading comprehension. We compare scores from greedy\ndecoding to expected values over the next token distribution. We scale logits\nby large temperature to increase the entropy of scores. This allows strong\nimprovement of performance on SummEval (in terms of correlations to human\njudgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from\n20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part\nof the gain seems related to positional bias. Secondly, we use\nprobability-based tree sampling algorithm, to examine all most probable\ngenerations for given prompt.\n","authors":["Krystian Zawistowski"],"pdf_url":"https://arxiv.org/pdf/2406.10267v2.pdf","comment":"7 pages, 1 figure, presented at FEDCSIS 2024 conference,"},{"id":"http://arxiv.org/abs/2409.17577v1","updated":"2024-09-26T06:46:53Z","published":"2024-09-26T06:46:53Z","title":"Leveraging Annotator Disagreement for Text Classification","summary":" It is common practice in text classification to only use one majority label\nfor model training even if a dataset has been annotated by multiple annotators.\nDoing so can remove valuable nuances and diverse perspectives inherent in the\nannotators' assessments. This paper proposes and compares three different\nstrategies to leverage annotator disagreement for text classification: a\nprobability-based multi-label method, an ensemble system, and instruction\ntuning. All three approaches are evaluated on the tasks of hate speech and\nabusive conversation detection, which inherently entail a high degree of\nsubjectivity. Moreover, to evaluate the effectiveness of embracing annotation\ndisagreements for model training, we conduct an online survey that compares the\nperformance of the multi-label model against a baseline model, which is trained\nwith the majority label.\n The results show that in hate speech detection, the multi-label method\noutperforms the other two approaches, while in abusive conversation detection,\ninstruction tuning achieves the best performance. The results of the survey\nalso show that the outputs from the multi-label models are considered a better\nrepresentation of the texts than the single-label model.\n","authors":["Jin Xu","Mariët Theune","Daniel Braun"],"pdf_url":"https://arxiv.org/pdf/2409.17577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02436v2","updated":"2024-09-26T06:27:08Z","published":"2024-03-04T19:33:39Z","title":"How does Architecture Influence the Base Capabilities of Pre-trained\n Language Models? A Case Study Based on FFN-Wider and MoE Transformers","summary":" Pre-trained language models have been proven to possess strong base\ncapabilities, which not only excel in in-distribution language modeling but\nalso show powerful abilities in out-of-distribution language modeling, transfer\nlearning and few-shot learning. Unlike existing work focusing on the influence\nof scale on base capabilities, our work examines the influence of architecture\non those. Specifically, our concern is: How does architecture influence the\nbase capabilities of pre-trained language models? In this work, we attempt to\nexplain and reverse the decline in base capabilities caused by the architecture\nof FFN-Wider Transformers, seeking to provide some insights. Through analysis,\nwe found the contribution ratio of Multi-Head Attention (a combination\nfunction) to pre-trained language modeling is a key factor affecting base\ncapabilities. FFN-Wider Transformers reduce the contribution ratio of this\ncombination function, leading to a decline in base capabilities. We confirmed\nthis by experiments and proposed Combination Enhanced Architecture (CEA) to\naddress the decline in base capabilities of such models. Significantly, we\nextended our explanation and CEA to Mixture of Experts (MoE) Transformers. We\nsuccessfully achieved significant improvements in base capabilities on a 14B\nparameter MoE model, demonstrating the practical application value of our work.\nThis also indicates that our analysis has a certain guiding significance for\narchitecture analysis, architecture improvement and architecture design.\n","authors":["Xin Lu","Yanyan Zhao","Bing Qin","Liangyu Huo","Qing Yang","Dongliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.02436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05013v2","updated":"2024-09-26T06:19:34Z","published":"2024-06-07T15:23:53Z","title":"CHIQ: Contextual History Enhancement for Improving Query Rewriting in\n Conversational Search","summary":" In this paper, we study how open-source large language models (LLMs) can be\neffectively deployed for improving query rewriting in conversational search,\nespecially for ambiguous queries. We introduce CHIQ, a two-step method that\nleverages the capabilities of LLMs to resolve ambiguities in the conversation\nhistory before query rewriting. This approach contrasts with prior studies that\npredominantly use closed-source LLMs to directly generate search queries from\nconversation history. We demonstrate on five well-established benchmarks that\nCHIQ leads to state-of-the-art results across most settings, showing highly\ncompetitive performances with systems leveraging closed-source LLMs. Our study\nprovides a first step towards leveraging open-source LLMs in conversational\nsearch, as a competitive alternative to the prevailing reliance on commercial\nLLMs. Data, models, and source code will be publicly available upon acceptance\nat https://github.com/fengranMark/CHIQ.\n","authors":["Fengran Mo","Abbas Ghaddar","Kelong Mao","Mehdi Rezagholizadeh","Boxing Chen","Qun Liu","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2406.05013v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2406.17255v2","updated":"2024-09-26T06:18:44Z","published":"2024-06-25T03:45:28Z","title":"MPCODER: Multi-user Personalized Code Generator with Explicit and\n Implicit Style Representation Learning","summary":" Large Language Models (LLMs) have demonstrated great potential for assisting\ndevelopers in their daily development. However, most research focuses on\ngenerating correct code, how to use LLMs to generate personalized code has\nseldom been investigated. To bridge this gap, we proposed MPCoder (Multi-user\nPersonalized Code Generator) to generate personalized code for multiple users.\nTo better learn coding style features, we utilize explicit coding style\nresidual learning to capture the syntax code style standards and implicit style\nlearning to capture the semantic code style conventions. We train a multi-user\nstyle adapter to better differentiate the implicit feature representations of\ndifferent users through contrastive learning, ultimately enabling personalized\ncode generation for multiple users. We further propose a novel evaluation\nmetric for estimating similarities between codes of different coding styles.\nThe experimental results show the effectiveness of our approach for this novel\ntask.\n","authors":["Zhenlong Dai","Chang Yao","WenKang Han","Ying Yuan","Zhipeng Gao","Jingyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2406.17255v2.pdf","comment":"Accepted by ACL 2024, Main Conference"},{"id":"http://arxiv.org/abs/2409.15977v2","updated":"2024-09-26T05:26:50Z","published":"2024-09-24T11:18:09Z","title":"TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and\n Multi-Level Style Control","summary":" Zero-shot singing voice synthesis (SVS) with style transfer and style control\naims to generate high-quality singing voices with unseen timbres and styles\n(including singing method, emotion, rhythm, technique, and pronunciation) from\naudio and text prompts. However, the multifaceted nature of singing styles\nposes a significant challenge for effective modeling, transfer, and control.\nFurthermore, current SVS models often fail to generate singing voices rich in\nstylistic nuances for unseen singers. To address these challenges, we introduce\nTCSinger, the first zero-shot SVS model for style transfer across cross-lingual\nspeech and singing styles, along with multi-level style control. Specifically,\nTCSinger proposes three primary modules: 1) the clustering style encoder\nemploys a clustering vector quantization model to stably condense style\ninformation into a compact latent space; 2) the Style and Duration Language\nModel (S\\&D-LM) concurrently predicts style information and phoneme duration,\nwhich benefits both; 3) the style adaptive decoder uses a novel mel-style\nadaptive normalization method to generate singing voices with enhanced details.\nExperimental results show that TCSinger outperforms all baseline models in\nsynthesis quality, singer similarity, and style controllability across various\ntasks, including zero-shot style transfer, multi-level style control,\ncross-lingual style transfer, and speech-to-singing style transfer. Singing\nvoice samples can be accessed at https://tcsinger.github.io/.\n","authors":["Yu Zhang","Ziyue Jiang","Ruiqi Li","Changhao Pan","Jinzheng He","Rongjie Huang","Chuxin Wang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.15977v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17545v1","updated":"2024-09-26T05:24:14Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keey the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v1.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17539v1","updated":"2024-09-26T04:59:45Z","published":"2024-09-26T04:59:45Z","title":"Logic-of-Thought: Injecting Logic into Contexts for Full Reasoning in\n Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities across\nvarious tasks but their performance in complex logical reasoning tasks remains\nunsatisfactory. Although some prompting methods, such as Chain-of-Thought, can\nimprove the reasoning ability of LLMs to some extent, they suffer from an\nunfaithful issue where derived conclusions may not align with the generated\nreasoning chain. To address this issue, some studies employ the approach of\npropositional logic to further enhance logical reasoning abilities of LLMs.\nHowever, the potential omissions in the extraction of logical expressions in\nthese methods can cause information loss in the logical reasoning process,\nthereby generating incorrect results. To this end, we propose Logic-of-Thought\n(LoT) prompting which employs propositional logic to generate expanded logical\ninformation from input context, and utilizes the generated logical information\nas an additional augmentation to the input prompts, thereby enhancing the\ncapability of logical reasoning. The LoT is orthogonal to existing prompting\nmethods and can be seamlessly integrated with them. Extensive experiments\ndemonstrate that LoT boosts the performance of various prompting methods with a\nstriking margin across five logical reasoning tasks. In particular, the LoT\nenhances Chain-of-Thought's performance on the ReClor dataset by +4.35%;\nmoreover, it improves Chain-of-Thought with Self-Consistency's performance on\nLogiQA by +5%; additionally, it boosts performance of Tree-of-Thoughts on\nProofWriter dataset by +8%.\n","authors":["Tongxuan Liu","Wenjiang Xu","Weizhe Huang","Xingyu Wang","Jiaxing Wang","Hailong Yang","Jing Li"],"pdf_url":"https://arxiv.org/pdf/2409.17539v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2409.17538v1","updated":"2024-09-26T04:56:49Z","published":"2024-09-26T04:56:49Z","title":"On the Implicit Relation Between Low-Rank Adaptation and Differential\n Privacy","summary":" A significant approach in natural language processing involves large-scale\npre-training on general domain data followed by adaptation to specific tasks or\ndomains. As models grow in size, full fine-tuning all parameters becomes\nincreasingly impractical. To address this, some methods for low-rank task\nadaptation of language models have been proposed, e.g. LoRA and FLoRA. These\nmethods keep the pre-trained model weights fixed and incorporate trainable\nlow-rank decomposition matrices into some layers of the transformer\narchitecture, called adapters. This approach significantly reduces the number\nof trainable parameters required for downstream tasks compared to full\nfine-tuning all parameters. In this work, we look at low-rank adaptation from\nthe lens of data privacy. We show theoretically that the low-rank adaptation\nused in LoRA and FLoRA is equivalent to injecting some random noise into the\nbatch gradients w.r.t the adapter parameters coming from their full\nfine-tuning, and we quantify the variance of the injected noise. By\nestablishing a Berry-Esseen type bound on the total variation distance between\nthe noise distribution and a Gaussian distribution with the same variance, we\nshow that the dynamics of LoRA and FLoRA are very close to differentially\nprivate full fine-tuning the adapters, which suggests that low-rank adaptation\nimplicitly provides privacy w.r.t the fine-tuning data. Finally, using\nJohnson-Lindenstrauss lemma, we show that when augmented with gradient\nclipping, low-rank adaptation is almost equivalent to differentially private\nfull fine-tuning adapters with a fixed noise scale.\n","authors":["Saber Malekmohammadi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2409.17538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17536v1","updated":"2024-09-26T04:48:20Z","published":"2024-09-26T04:48:20Z","title":"MUSE: Integrating Multi-Knowledge for Knowledge Graph Completion","summary":" Knowledge Graph Completion (KGC) aims to predict the missing [relation] part\nof (head entity)--[relation]->(tail entity) triplet. Most existing KGC methods\nfocus on single features (e.g., relation types) or sub-graph aggregation.\nHowever, they do not fully explore the Knowledge Graph (KG) features and\nneglect the guidance of external semantic knowledge. To address these\nshortcomings, we propose a knowledge-aware reasoning model (MUSE), which\ndesigns a novel multi-knowledge representation learning mechanism for missing\nrelation prediction. Our model develops a tailored embedding space through\nthree parallel components: 1) Prior Knowledge Learning for enhancing the\ntriplets' semantic representation by fine-tuning BERT; 2) Context Message\nPassing for enhancing the context messages of KG; 3) Relational Path\nAggregation for enhancing the path representation from the head entity to the\ntail entity. The experimental results show that MUSE significantly outperforms\nother baselines on four public datasets, achieving over 5.50% H@1 improvement\nand 4.20% MRR improvement on the NELL995 dataset. The code and datasets will be\nreleased via https://github.com/SUSTech-TP/ADMA2024-MUSE.git.\n","authors":["Pengjie Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17536v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2408.05283"},{"id":"http://arxiv.org/abs/2409.17527v1","updated":"2024-09-26T04:30:32Z","published":"2024-09-26T04:30:32Z","title":"Data Proportion Detection for Optimized Data Management for Large\n Language Models","summary":" Large language models (LLMs) have demonstrated exceptional performance across\na wide range of tasks and domains, with data preparation playing a critical\nrole in achieving these results. Pre-training data typically combines\ninformation from multiple domains. To maximize performance when integrating\ndata from various domains, determining the optimal data proportion is\nessential. However, state-of-the-art (SOTA) LLMs rarely disclose details about\ntheir pre-training data, making it difficult for researchers to identify ideal\ndata proportions. In this paper, we introduce a new topic, \\textit{data\nproportion detection}, which enables the automatic estimation of pre-training\ndata proportions by analyzing the generated outputs of LLMs. We provide\nrigorous theoretical proofs, practical algorithms, and preliminary experimental\nresults for data proportion detection. Based on these findings, we offer\nvaluable insights into the challenges and future directions for effective data\nproportion detection and data management.\n","authors":["Hao Liang","Keshi Zhao","Yajie Yang","Bin Cui","Guosheng Dong","Zenan Zhou","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17525v1","updated":"2024-09-26T04:24:52Z","published":"2024-09-26T04:24:52Z","title":"When A Man Says He Is Pregnant: ERP Evidence for A Rational Account of\n Speaker-contextualized Language Comprehension","summary":" Spoken language is often, if not always, understood in a context that\nincludes the identities of speakers. For instance, we can easily make sense of\nan utterance such as \"I'm going to have a manicure this weekend\" or \"The first\ntime I got pregnant I had a hard time\" when the utterance is spoken by a woman,\nbut it would be harder to understand when it is spoken by a man. Previous\nevent-related potential (ERP) studies have shown mixed results regarding the\nneurophysiological responses to such speaker-mismatched utterances, with some\nreporting an N400 effect and others a P600 effect. In an experiment involving\n64 participants, we showed that these different ERP effects reflect distinct\ncognitive processes employed to resolve the speaker-message mismatch. When\npossible, the message is integrated with the speaker context to arrive at an\ninterpretation, as in the case of violations of social stereotypes (e.g., men\ngetting a manicure), resulting in an N400 effect. However, when such\nintegration is impossible due to violations of biological knowledge (e.g., men\ngetting pregnant), listeners engage in an error correction process to revise\neither the perceived utterance or the speaker context, resulting in a P600\neffect. Additionally, we found that the social N400 effect decreased as a\nfunction of the listener's personality trait of openness, while the biological\nP600 effect remained robust. Our findings help to reconcile the empirical\ninconsistencies in the literature and provide a rational account of\nspeaker-contextualized language comprehension.\n","authors":["Hanlin Wu","Zhenguang G. Cai"],"pdf_url":"https://arxiv.org/pdf/2409.17525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17513v1","updated":"2024-09-26T03:48:47Z","published":"2024-09-26T03:48:47Z","title":"Comparing Unidirectional, Bidirectional, and Word2vec Models for\n Discovering Vulnerabilities in Compiled Lifted Code","summary":" Ransomware and other forms of malware cause significant financial and\noperational damage to organizations by exploiting long-standing and often\ndifficult-to-detect software vulnerabilities. To detect vulnerabilities such as\nbuffer overflows in compiled code, this research investigates the application\nof unidirectional transformer-based embeddings, specifically GPT-2. Using a\ndataset of LLVM functions, we trained a GPT-2 model to generate embeddings,\nwhich were subsequently used to build LSTM neural networks to differentiate\nbetween vulnerable and non-vulnerable code. Our study reveals that embeddings\nfrom the GPT-2 model significantly outperform those from bidirectional models\nof BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%.\nLSTM neural networks were developed with both frozen and unfrozen embedding\nmodel layers. The model with the highest performance was achieved when the\nembedding layers were unfrozen. Further, the research finds that, in exploring\nthe impact of different optimizers within this domain, the SGD optimizer\ndemonstrates superior performance over Adam. Overall, these findings reveal\nimportant insights into the potential of unidirectional transformer-based\napproaches in enhancing cybersecurity defenses.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2409.17513v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17504v1","updated":"2024-09-26T03:22:09Z","published":"2024-09-26T03:22:09Z","title":"HaloScope: Harnessing Unlabeled LLM Generations for Hallucination\n Detection","summary":" The surge in applications of large language models (LLMs) has prompted\nconcerns about the generation of misleading or fabricated information, known as\nhallucinations. Therefore, detecting hallucinations has become critical to\nmaintaining trust in LLM-generated content. A primary challenge in learning a\ntruthfulness classifier is the lack of a large amount of labeled truthful and\nhallucinated data. To address the challenge, we introduce HaloScope, a novel\nlearning framework that leverages the unlabeled LLM generations in the wild for\nhallucination detection. Such unlabeled data arises freely upon deploying LLMs\nin the open world, and consists of both truthful and hallucinated information.\nTo harness the unlabeled data, we present an automated membership estimation\nscore for distinguishing between truthful and untruthful generations within\nunlabeled mixture data, thereby enabling the training of a binary truthfulness\nclassifier on top. Importantly, our framework does not require extra data\ncollection and human annotations, offering strong flexibility and practicality\nfor real-world applications. Extensive experiments show that HaloScope can\nachieve superior hallucination detection performance, outperforming the\ncompetitive rivals by a significant margin. Code is available at\nhttps://github.com/deeplearningwisc/haloscope.\n","authors":["Xuefeng Du","Chaowei Xiao","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.17504v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2402.10669v5","updated":"2024-09-26T03:16:52Z","published":"2024-02-16T13:21:06Z","title":"Humans or LLMs as the Judge? A Study on Judgement Biases","summary":" Adopting human and large language models (LLM) as judges (a.k.a human- and\nLLM-as-a-judge) for evaluating the performance of LLMs has recently gained\nattention. Nonetheless, this approach concurrently introduces potential biases\nfrom human and LLMs, questioning the reliability of the evaluation results. In\nthis paper, we propose a novel framework that is free from referencing\ngroundtruth annotations for investigating Misinformation Oversight Bias, Gender\nBias, Authority Bias and Beauty Bias on LLM and human judges. We curate a\ndataset referring to the revised Bloom's Taxonomy and conduct thousands of\nevaluations. Results show that human and LLM judges are vulnerable to\nperturbations to various degrees, and that even the cutting-edge judges possess\nconsiderable biases. We further exploit these biases to conduct attacks on LLM\njudges. We hope that our work can notify the community of the bias and\nvulnerability of human- and LLM-as-a-judge, as well as the urgency of\ndeveloping robust evaluation systems.\n","authors":["Guiming Hardy Chen","Shunian Chen","Ziche Liu","Feng Jiang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2402.10669v5.pdf","comment":"EMNLP2024"},{"id":"http://arxiv.org/abs/2409.14509v3","updated":"2024-09-26T03:15:53Z","published":"2024-09-22T16:13:00Z","title":"Can AI writing be salvaged? Mitigating Idiosyncrasies and Improving\n Human-AI Alignment in the Writing Process through Edits","summary":" LLM-based applications are helping people write, and LLM-generated text is\nmaking its way into social media, journalism, and our classrooms. However, the\ndifferences between LLM-generated and human-written text remain unclear. To\nexplore this, we hired professional writers to edit paragraphs in several\ncreative domains. We first found these writers agree on undesirable\nidiosyncrasies in LLM-generated text, formalizing it into a seven-category\ntaxonomy (e.g. cliches, unnecessary exposition). Second, we curated the LAMP\ncorpus: 1,057 LLM-generated paragraphs edited by professional writers according\nto our taxonomy. Analysis of LAMP reveals that none of the LLMs used in our\nstudy (GPT4o, Claude-3.5-Sonnet, Llama-3.1-70b) outperform each other in terms\nof writing quality, revealing common limitations across model families. Third,\nwe explored automatic editing methods to improve LLM-generated text. A\nlarge-scale preference annotation confirms that although experts largely prefer\ntext edited by other experts, automatic editing methods show promise in\nimproving alignment between LLM-generated and human-written text.\n","authors":["Tuhin Chakrabarty","Philippe Laban","Chien-Sheng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.14509v3.pdf","comment":"NLP+HCI, Behavioral Science"},{"id":"http://arxiv.org/abs/2409.17481v1","updated":"2024-09-26T02:37:41Z","published":"2024-09-26T02:37:41Z","title":"MaskLLM: Learnable Semi-Structured Sparsity for Large Language Models","summary":" Large Language Models (LLMs) are distinguished by their massive parameter\ncounts, which typically result in significant redundancy. This work introduces\nMaskLLM, a learnable pruning method that establishes Semi-structured (or\n``N:M'') Sparsity in LLMs, aimed at reducing computational overhead during\ninference. Instead of developing a new importance criterion, MaskLLM explicitly\nmodels N:M patterns as a learnable distribution through Gumbel Softmax\nsampling. This approach facilitates end-to-end training on large-scale datasets\nand offers two notable advantages: 1) High-quality Masks - our method\neffectively scales to large datasets and learns accurate masks; 2)\nTransferability - the probabilistic modeling of mask distribution enables the\ntransfer learning of sparsity across domains or tasks. We assessed MaskLLM\nusing 2:4 sparsity on various LLMs, including LLaMA-2, Nemotron-4, and GPT-3,\nwith sizes ranging from 843M to 15B parameters, and our empirical results show\nsubstantial improvements over state-of-the-art methods. For instance, leading\napproaches achieve a perplexity (PPL) of 10 or greater on Wikitext compared to\nthe dense model's 5.12 PPL, but MaskLLM achieves a significantly lower 6.72 PPL\nsolely by learning the masks with frozen weights. Furthermore, MaskLLM's\nlearnable nature allows customized masks for lossless application of 2:4\nsparsity to downstream tasks or domains. Code is available at\n\\url{https://github.com/NVlabs/MaskLLM}.\n","authors":["Gongfan Fang","Hongxu Yin","Saurav Muralidharan","Greg Heinrich","Jeff Pool","Jan Kautz","Pavlo Molchanov","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17481v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17474v1","updated":"2024-09-26T02:19:13Z","published":"2024-09-26T02:19:13Z","title":"Reducing and Exploiting Data Augmentation Noise through Meta Reweighting\n Contrastive Learning for Text Classification","summary":" Data augmentation has shown its effectiveness in resolving the data-hungry\nproblem and improving model's generalization ability. However, the quality of\naugmented data can be varied, especially compared with the raw/original data.\nTo boost deep learning models' performance given augmented data/samples in text\nclassification tasks, we propose a novel framework, which leverages both meta\nlearning and contrastive learning techniques as parts of our design for\nreweighting the augmented samples and refining their feature representations\nbased on their quality. As part of the framework, we propose novel\nweight-dependent enqueue and dequeue algorithms to utilize augmented samples'\nweight/quality information effectively. Through experiments, we show that our\nframework can reasonably cooperate with existing deep learning models (e.g.,\nRoBERTa-base and Text-CNN) and augmentation techniques (e.g., Wordnet and\nEasydata) for specific supervised learning tasks. Experiment results show that\nour framework achieves an average of 1.6%, up to 4.3% absolute improvement on\nText-CNN encoders and an average of 1.4%, up to 4.4% absolute improvement on\nRoBERTa-base encoders on seven GLUE benchmark datasets compared with the best\nbaseline. We present an indepth analysis of our framework design, revealing the\nnon-trivial contributions of our network components. Our code is publicly\navailable for better reproducibility.\n","authors":["Guanyi Mou","Yichuan Li","Kyumin Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17474v1.pdf","comment":"IEEE BigData 2021"},{"id":"http://arxiv.org/abs/2409.17472v1","updated":"2024-09-26T02:16:48Z","published":"2024-09-26T02:16:48Z","title":"Autoregressive Multi-trait Essay Scoring via Reinforcement Learning with\n Scoring-aware Multiple Rewards","summary":" Recent advances in automated essay scoring (AES) have shifted towards\nevaluating multiple traits to provide enriched feedback. Like typical AES\nsystems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure\nagreement with human raters, aligning closely with the rating schema; however,\nits non-differentiable nature prevents its direct use in neural network\ntraining. In this paper, we propose Scoring-aware Multi-reward Reinforcement\nLearning (SaMRL), which integrates actual evaluation schemes into the training\nprocess by designing QWK-based rewards with a mean-squared error penalty for\nmulti-trait AES. Existing reinforcement learning (RL) applications in AES are\nlimited to classification models despite associated performance degradation, as\nRL requires probability distributions; instead, we adopt an autoregressive\nscore generation framework to leverage token generation probabilities for\nrobust multi-trait score predictions. Empirical analyses demonstrate that SaMRL\nfacilitates model training, notably enhancing scoring of previously inferior\nprompts.\n","authors":["Heejin Do","Sangwon Ryu","Gary Geunbae Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17472v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.14552v2","updated":"2024-09-26T02:02:13Z","published":"2024-09-22T18:29:10Z","title":"Unleashing the Power of Emojis in Texts via Self-supervised Graph\n Pre-Training","summary":" Emojis have gained immense popularity on social platforms, serving as a\ncommon means to supplement or replace text. However, existing data mining\napproaches generally either completely ignore or simply treat emojis as\nordinary Unicode characters, which may limit the model's ability to grasp the\nrich semantic information in emojis and the interaction between emojis and\ntexts. Thus, it is necessary to release the emoji's power in social media data\nmining. To this end, we first construct a heterogeneous graph consisting of\nthree types of nodes, i.e. post, word and emoji nodes to improve the\nrepresentation of different elements in posts. The edges are also well-defined\nto model how these three elements interact with each other. To facilitate the\nsharing of information among post, word and emoji nodes, we propose a graph\npre-train framework for text and emoji co-modeling, which contains two graph\npre-training tasks: node-level graph contrastive learning and edge-level link\nreconstruction learning. Extensive experiments on the Xiaohongshu and Twitter\ndatasets with two types of downstream tasks demonstrate that our approach\nproves significant improvement over previous strong baseline methods.\n","authors":["Zhou Zhang","Dongzeng Tan","Jiaan Wang","Yilong Chen","Jiarong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.14552v2.pdf","comment":"Accepted by EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2409.17467v1","updated":"2024-09-26T01:57:27Z","published":"2024-09-26T01:57:27Z","title":"What is the social benefit of hate speech detection research? A\n Systematic Review","summary":" While NLP research into hate speech detection has grown exponentially in the\nlast three decades, there has been minimal uptake or engagement from policy\nmakers and non-profit organisations. We argue the absence of ethical frameworks\nhave contributed to this rift between current practice and best practice. By\nadopting appropriate ethical frameworks, NLP researchers may enable the social\nimpact potential of hate speech research. This position paper is informed by\nreviewing forty-eight hate speech detection systems associated with\nthirty-seven publications from different venues.\n","authors":["Sidney Gig-Jan Wong"],"pdf_url":"https://arxiv.org/pdf/2409.17467v1.pdf","comment":"Accepted to the 3rd Workshop on NLP for Positive Impact"},{"id":"http://arxiv.org/abs/2409.17458v1","updated":"2024-09-26T01:24:17Z","published":"2024-09-26T01:24:17Z","title":"RED QUEEN: Safeguarding Large Language Models against Concealed\n Multi-Turn Jailbreaking","summary":" The rapid progress of Large Language Models (LLMs) has opened up new\nopportunities across various domains and applications; yet it also presents\nchallenges related to potential misuse. To mitigate such risks, red teaming has\nbeen employed as a proactive security measure to probe language models for\nharmful outputs via jailbreak attacks. However, current jailbreak attack\napproaches are single-turn with explicit malicious queries that do not fully\ncapture the complexity of real-world interactions. In reality, users can engage\nin multi-turn interactions with LLM-based chat assistants, allowing them to\nconceal their true intentions in a more covert manner. To bridge this gap, we,\nfirst, propose a new jailbreak approach, RED QUEEN ATTACK. This method\nconstructs a multi-turn scenario, concealing the malicious intent under the\nguise of preventing harm. We craft 40 scenarios that vary in turns and select\n14 harmful categories to generate 56k multi-turn attack data points. We conduct\ncomprehensive experiments on the RED QUEEN ATTACK with four representative LLM\nfamilies of different sizes. Our experiments reveal that all LLMs are\nvulnerable to RED QUEEN ATTACK, reaching 87.62% attack success rate on GPT-4o\nand 75.4% on Llama3-70B. Further analysis reveals that larger models are more\nsusceptible to the RED QUEEN ATTACK, with multi-turn structures and concealment\nstrategies contributing to its success. To prioritize safety, we introduce a\nstraightforward mitigation strategy called RED QUEEN GUARD, which aligns LLMs\nto effectively counter adversarial attacks. This approach reduces the attack\nsuccess rate to below 1% while maintaining the model's performance across\nstandard benchmarks. Full implementation and dataset are publicly accessible at\nhttps://github.com/kriti-hippo/red_queen.\n","authors":["Yifan Jiang","Kriti Aggarwal","Tanmay Laud","Kashif Munir","Jay Pujara","Subhabrata Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2409.17458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17455v1","updated":"2024-09-26T01:17:42Z","published":"2024-09-26T01:17:42Z","title":"Navigating the Shortcut Maze: A Comprehensive Analysis of Shortcut\n Learning in Text Classification by Language Models","summary":" Language models (LMs), despite their advances, often depend on spurious\ncorrelations, undermining their accuracy and generalizability. This study\naddresses the overlooked impact of subtler, more complex shortcuts that\ncompromise model reliability beyond oversimplified shortcuts. We introduce a\ncomprehensive benchmark that categorizes shortcuts into occurrence, style, and\nconcept, aiming to explore the nuanced ways in which these shortcuts influence\nthe performance of LMs. Through extensive experiments across traditional LMs,\nlarge language models, and state-of-the-art robust models, our research\nsystematically investigates models' resilience and susceptibilities to\nsophisticated shortcuts. Our benchmark and code can be found at:\nhttps://github.com/yuqing-zhou/shortcut-learning-in-text-classification.\n","authors":["Yuqing Zhou","Ruixiang Tang","Ziyu Yao","Ziwei Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.17455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17452v1","updated":"2024-09-26T01:08:09Z","published":"2024-09-26T01:08:09Z","title":"Description-based Controllable Text-to-Speech with Cross-Lingual Voice\n Control","summary":" We propose a novel description-based controllable text-to-speech (TTS) method\nwith cross-lingual control capability. To address the lack of audio-description\npaired data in the target language, we combine a TTS model trained on the\ntarget language with a description control model trained on another language,\nwhich maps input text descriptions to the conditional features of the TTS\nmodel. These two models share disentangled timbre and style representations\nbased on self-supervised learning (SSL), allowing for disentangled voice\ncontrol, such as controlling speaking styles while retaining the original\ntimbre. Furthermore, because the SSL-based timbre and style representations are\nlanguage-agnostic, combining the TTS and description control models while\nsharing the same embedding space effectively enables cross-lingual control of\nvoice characteristics. Experiments on English and Japanese TTS demonstrate that\nour method achieves high naturalness and controllability for both languages,\neven though no Japanese audio-description pairs are used.\n","authors":["Ryuichi Yamamoto","Yuma Shirahata","Masaya Kawamura","Kentaro Tachibana"],"pdf_url":"https://arxiv.org/pdf/2409.17452v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2109.04993v3","updated":"2024-09-26T00:58:15Z","published":"2021-09-04T22:48:46Z","title":"LAViTeR: Learning Aligned Visual and Textual Representations Assisted by\n Image and Caption Generation","summary":" Pre-training visual and textual representations from large-scale image-text\npairs is becoming a standard approach for many downstream vision-language\ntasks. The transformer-based models learn inter and intra-modal attention\nthrough a list of self-supervised learning tasks. This paper proposes LAViTeR,\na novel architecture for visual and textual representation learning. The main\nmodule, Visual Textual Alignment (VTA) will be assisted by two auxiliary tasks,\nGAN-based image synthesis and Image Captioning. We also propose a new\nevaluation metric measuring the similarity between the learnt visual and\ntextual embedding. The experimental results on two public datasets, CUB and\nMS-COCO, demonstrate superior visual and textual representation alignment in\nthe joint feature embedding space\n","authors":["Mohammad Abuzar Hashemi","Zhanghexuan Li","Mihir Chauhan","Yan Shen","Abhishek Satbhai","Mir Basheer Ali","Mingchen Gao","Sargur Srihari"],"pdf_url":"https://arxiv.org/pdf/2109.04993v3.pdf","comment":"15 pages, 10 Figures, 5 Tables. Oral Presentation at Irish Machine\n Vision and Image Processing Conference Proceedings, 2024"},{"id":"http://arxiv.org/abs/2409.17448v1","updated":"2024-09-26T00:54:17Z","published":"2024-09-26T00:54:17Z","title":"Enhancing Financial Sentiment Analysis with Expert-Designed Hint","summary":" This paper investigates the role of expert-designed hint in enhancing\nsentiment analysis on financial social media posts. We explore the capability\nof large language models (LLMs) to empathize with writer perspectives and\nanalyze sentiments. Our findings reveal that expert-designed hint, i.e.,\npointing out the importance of numbers, significantly improve performances\nacross various LLMs, particularly in cases requiring perspective-taking skills.\nFurther analysis on tweets containing different types of numerical data\ndemonstrates that the inclusion of expert-designed hint leads to notable\nimprovements in sentiment analysis performance, especially for tweets with\nmonetary-related numbers. Our findings contribute to the ongoing discussion on\nthe applicability of Theory of Mind in NLP and open new avenues for improving\nsentiment analysis in financial domains through the strategic use of expert\nknowledge.\n","authors":["Chung-Chi Chen","Hiroya Takamura","Ichiro Kobayashi","Yusuke Miyao"],"pdf_url":"https://arxiv.org/pdf/2409.17448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00948v2","updated":"2024-09-26T00:24:25Z","published":"2024-07-01T04:07:49Z","title":"View From Above: A Framework for Evaluating Distribution Shifts in Model\n Behavior","summary":" When large language models (LLMs) are asked to perform certain tasks, how can\nwe be sure that their learned representations align with reality? We propose a\ndomain-agnostic framework for systematically evaluating distribution shifts in\nLLMs decision-making processes, where they are given control of mechanisms\ngoverned by pre-defined rules. While individual LLM actions may appear\nconsistent with expected behavior, across a large number of trials,\nstatistically significant distribution shifts can emerge. To test this, we\nconstruct a well-defined environment with known outcome logic: blackjack. In\nmore than 1,000 trials, we uncover statistically significant evidence\nsuggesting behavioral misalignment in the learned representations of LLM.\n","authors":["Tanush Chopra","Michael Li","Jacob Haimes"],"pdf_url":"https://arxiv.org/pdf/2407.00948v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.18128v1","updated":"2024-09-26T17:59:51Z","published":"2024-09-26T17:59:51Z","title":"FlowTurbo: Towards Real-time Flow-Based Image Generation with Velocity\n Refiner","summary":" Building on the success of diffusion models in visual generation, flow-based\nmodels reemerge as another prominent family of generative models that have\nachieved competitive or better performance in terms of both visual quality and\ninference speed. By learning the velocity field through flow-matching,\nflow-based models tend to produce a straighter sampling trajectory, which is\nadvantageous during the sampling process. However, unlike diffusion models for\nwhich fast samplers are well-developed, efficient sampling of flow-based\ngenerative models has been rarely explored. In this paper, we propose a\nframework called FlowTurbo to accelerate the sampling of flow-based models\nwhile still enhancing the sampling quality. Our primary observation is that the\nvelocity predictor's outputs in the flow-based models will become stable during\nthe sampling, enabling the estimation of velocity via a lightweight velocity\nrefiner. Additionally, we introduce several techniques including a pseudo\ncorrector and sample-aware compilation to further reduce inference time. Since\nFlowTurbo does not change the multi-step sampling paradigm, it can be\neffectively applied for various tasks such as image editing, inpainting, etc.\nBy integrating FlowTurbo into different flow-based models, we obtain an\nacceleration ratio of 53.1%$\\sim$58.3% on class-conditional generation and\n29.8%$\\sim$38.5% on text-to-image generation. Notably, FlowTurbo reaches an FID\nof 2.12 on ImageNet with 100 (ms / img) and FID of 3.93 with 38 (ms / img),\nachieving the real-time image generation and establishing the new\nstate-of-the-art. Code is available at https://github.com/shiml20/FlowTurbo.\n","authors":["Wenliang Zhao","Minglei Shi","Xumin Yu","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18128v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18127v1","updated":"2024-09-26T17:59:31Z","published":"2024-09-26T17:59:31Z","title":"EgoLM: Multi-Modal Language Model of Egocentric Motions","summary":" As the prevalence of wearable devices, learning egocentric motions becomes\nessential to develop contextual AI. In this work, we present EgoLM, a versatile\nframework that tracks and understands egocentric motions from multi-modal\ninputs, e.g., egocentric videos and motion sensors. EgoLM exploits rich\ncontexts for the disambiguation of egomotion tracking and understanding, which\nare ill-posed under single modality conditions. To facilitate the versatile and\nmulti-modal framework, our key insight is to model the joint distribution of\negocentric motions and natural languages using large language models (LLM).\nMulti-modal sensor inputs are encoded and projected to the joint latent space\nof language models, and used to prompt motion generation or text generation for\negomotion tracking or understanding, respectively. Extensive experiments on\nlarge-scale multi-modal human motion dataset validate the effectiveness of\nEgoLM as a generalist model for universal egocentric learning.\n","authors":["Fangzhou Hong","Vladimir Guzov","Hyo Jin Kim","Yuting Ye","Richard Newcombe","Ziwei Liu","Lingni Ma"],"pdf_url":"https://arxiv.org/pdf/2409.18127v1.pdf","comment":"Project Page: https://hongfz16.github.io/projects/EgoLM"},{"id":"http://arxiv.org/abs/2409.18125v1","updated":"2024-09-26T17:59:11Z","published":"2024-09-26T17:59:11Z","title":"LLaVA-3D: A Simple yet Effective Pathway to Empowering LMMs with\n 3D-awareness","summary":" Recent advancements in Large Multimodal Models (LMMs) have greatly enhanced\ntheir proficiency in 2D visual understanding tasks, enabling them to\neffectively process and understand images and videos. However, the development\nof LMMs with 3D-awareness for 3D scene understanding has been hindered by the\nlack of large-scale 3D vision-language datasets and powerful 3D encoders. In\nthis paper, we introduce a simple yet effective framework called LLaVA-3D.\nLeveraging the strong 2D understanding priors from LLaVA, our LLaVA-3D\nefficiently adapts LLaVA for 3D scene understanding without compromising 2D\nunderstanding capabilities. To achieve this, we employ a simple yet effective\nrepresentation, 3D Patch, which connects 2D CLIP patch features with their\ncorresponding positions in 3D space. By integrating the 3D Patches into 2D LMMs\nand employing joint 2D and 3D vision-language instruction tuning, we establish\na unified architecture for both 2D image understanding and 3D scene\nunderstanding. Experimental results show that LLaVA-3D converges 3.5x faster\nthan existing 3D LMMs when trained on 3D vision-language datasets. Moreover,\nLLaVA-3D not only achieves state-of-the-art performance across various 3D tasks\nbut also maintains comparable 2D image understanding and vision-language\nconversation capabilities with LLaVA.\n","authors":["Chenming Zhu","Tai Wang","Wenwei Zhang","Jiangmiao Pang","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18125v1.pdf","comment":"Project page: https://zcmax.github.io/projects/LLaVA-3D/"},{"id":"http://arxiv.org/abs/2409.18124v1","updated":"2024-09-26T17:58:55Z","published":"2024-09-26T17:58:55Z","title":"Lotus: Diffusion-based Visual Foundation Model for High-quality Dense\n Prediction","summary":" Leveraging the visual priors of pre-trained text-to-image diffusion models\noffers a promising solution to enhance zero-shot generalization in dense\nprediction tasks. However, existing methods often uncritically use the original\ndiffusion formulation, which may not be optimal due to the fundamental\ndifferences between dense prediction and image generation. In this paper, we\nprovide a systemic analysis of the diffusion formulation for the dense\nprediction, focusing on both quality and efficiency. And we find that the\noriginal parameterization type for image generation, which learns to predict\nnoise, is harmful for dense prediction; the multi-step noising/denoising\ndiffusion process is also unnecessary and challenging to optimize. Based on\nthese insights, we introduce Lotus, a diffusion-based visual foundation model\nwith a simple yet effective adaptation protocol for dense prediction.\nSpecifically, Lotus is trained to directly predict annotations instead of\nnoise, thereby avoiding harmful variance. We also reformulate the diffusion\nprocess into a single-step procedure, simplifying optimization and\nsignificantly boosting inference speed. Additionally, we introduce a novel\ntuning strategy called detail preserver, which achieves more accurate and\nfine-grained predictions. Without scaling up the training data or model\ncapacity, Lotus achieves SoTA performance in zero-shot depth and normal\nestimation across various datasets. It also significantly enhances efficiency,\nbeing hundreds of times faster than most existing diffusion-based methods.\n","authors":["Jing He","Haodong Li","Wei Yin","Yixun Liang","Leheng Li","Kaiqiang Zhou","Hongbo Liu","Bingbing Liu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18124v1.pdf","comment":"Project page: https://lotus3d.github.io/"},{"id":"http://arxiv.org/abs/2409.18121v1","updated":"2024-09-26T17:57:16Z","published":"2024-09-26T17:57:16Z","title":"Robot See Robot Do: Imitating Articulated Object Manipulation with\n Monocular 4D Reconstruction","summary":" Humans can learn to manipulate new objects by simply watching others;\nproviding robots with the ability to learn from such demonstrations would\nenable a natural interface specifying new behaviors. This work develops Robot\nSee Robot Do (RSRD), a method for imitating articulated object manipulation\nfrom a single monocular RGB human demonstration given a single static\nmulti-view object scan. We first propose 4D Differentiable Part Models\n(4D-DPM), a method for recovering 3D part motion from a monocular video with\ndifferentiable rendering. This analysis-by-synthesis approach uses part-centric\nfeature fields in an iterative optimization which enables the use of geometric\nregularizers to recover 3D motions from only a single video. Given this 4D\nreconstruction, the robot replicates object trajectories by planning bimanual\narm motions that induce the demonstrated object part motion. By representing\ndemonstrations as part-centric trajectories, RSRD focuses on replicating the\ndemonstration's intended behavior while considering the robot's own\nmorphological limits, rather than attempting to reproduce the hand's motion. We\nevaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part\ntrajectories and RSRD's physical execution performance on 9 objects across 10\ntrials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of\n87% success rate, for a total end-to-end success rate of 60% across 90 trials.\nNotably, this is accomplished using only feature fields distilled from large\npretrained vision models -- without any task-specific training, fine-tuning,\ndataset collection, or annotation. Project page:\nhttps://robot-see-robot-do.github.io\n","authors":["Justin Kerr","Chung Min Kim","Mingxuan Wu","Brent Yi","Qianqian Wang","Ken Goldberg","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2409.18121v1.pdf","comment":"CoRL 2024, Project page: https://robot-see-robot-do.github.io"},{"id":"http://arxiv.org/abs/2409.18120v1","updated":"2024-09-26T17:57:15Z","published":"2024-09-26T17:57:15Z","title":"EvMAPPER: High Altitude Orthomapping with Event Cameras","summary":" Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to\ncollect images about the world below. One of the most successful applications\nof UAVs is to generate orthomosaics or orthomaps, in which a series of images\nare integrated together to develop a larger map. However, the use of CMOS-based\ncameras with global or rolling shutters mean that orthomaps are vulnerable to\nchallenging light conditions, motion blur, and high-speed motion of\nindependently moving objects under the camera. Event cameras are less sensitive\nto these issues, as their pixels are able to trigger asynchronously on\nbrightness changes. This work introduces the first orthomosaic approach using\nevent cameras. In contrast to existing methods relying only on CMOS cameras,\nour approach enables map generation even in challenging light conditions,\nincluding direct sunlight and after sunset.\n","authors":["Fernando Cladera","Kenneth Chaney","M. Ani Hsieh","Camillo J. Taylor","Vijay Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.18120v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.18119v1","updated":"2024-09-26T17:56:59Z","published":"2024-09-26T17:56:59Z","title":"Multi-View and Multi-Scale Alignment for Contrastive Language-Image\n Pre-training in Mammography","summary":" Contrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to\nthese restrictions, existing CLIP applications in medical imaging focus mainly\non modalities like chest X-rays that have abundant image-report data available,\nleaving many other important modalities under-explored. Here, we propose the\nfirst adaptation of the full CLIP model to mammography, which presents\nsignificant challenges due to labeled data scarcity, high-resolution images\nwith small regions of interest, and data imbalance. We first develop a\nspecialized supervision framework for mammography that leverages its multi-view\nnature. Furthermore, we design a symmetric local alignment module to better\nfocus on detailed features in high-resolution images. Lastly, we incorporate a\nparameter-efficient fine-tuning approach for large language models pre-trained\nwith medical knowledge to address data limitations. Our multi-view and\nmulti-scale alignment (MaMA) method outperforms state-of-the-art baselines for\nthree different tasks on two large real-world mammography datasets, EMBED and\nRSNA-Mammo, with only 52% model size compared with the largest baseline.\n","authors":["Yuexi Du","John Onofrey","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2409.18119v1.pdf","comment":"This work is also the basis of the overall best solution for the\n MICCAI 2024 CXR-LT Challenge"},{"id":"http://arxiv.org/abs/2409.18114v1","updated":"2024-09-26T17:55:02Z","published":"2024-09-26T17:55:02Z","title":"EdgeRunner: Auto-regressive Auto-encoder for Artistic Mesh Generation","summary":" Current auto-regressive mesh generation methods suffer from issues such as\nincompleteness, insufficient detail, and poor generalization. In this paper, we\npropose an Auto-regressive Auto-encoder (ArAE) model capable of generating\nhigh-quality 3D meshes with up to 4,000 faces at a spatial resolution of\n$512^3$. We introduce a novel mesh tokenization algorithm that efficiently\ncompresses triangular meshes into 1D token sequences, significantly enhancing\ntraining efficiency. Furthermore, our model compresses variable-length\ntriangular meshes into a fixed-length latent space, enabling training latent\ndiffusion models for better generalization. Extensive experiments demonstrate\nthe superior quality, diversity, and generalization capabilities of our model\nin both point cloud and image-conditioned mesh generation tasks.\n","authors":["Jiaxiang Tang","Zhaoshuo Li","Zekun Hao","Xian Liu","Gang Zeng","Ming-Yu Liu","Qinsheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18114v1.pdf","comment":"Project Page: https://research.nvidia.com/labs/dir/edgerunner/"},{"id":"http://arxiv.org/abs/2409.18111v1","updated":"2024-09-26T17:53:04Z","published":"2024-09-26T17:53:04Z","title":"E.T. Bench: Towards Open-Ended Event-Level Video-Language Understanding","summary":" Recent advances in Video Large Language Models (Video-LLMs) have demonstrated\ntheir great potential in general-purpose video understanding. To verify the\nsignificance of these models, a number of benchmarks have been proposed to\ndiagnose their capabilities in different scenarios. However, existing\nbenchmarks merely evaluate models through video-level question-answering,\nlacking fine-grained event-level assessment and task diversity. To fill this\ngap, we introduce E.T. Bench (Event-Level & Time-Sensitive Video Understanding\nBenchmark), a large-scale and high-quality benchmark for open-ended event-level\nvideo understanding. Categorized within a 3-level task taxonomy, E.T. Bench\nencompasses 7.3K samples under 12 tasks with 7K videos (251.4h total length)\nunder 8 domains, providing comprehensive evaluations. We extensively evaluated\n8 Image-LLMs and 12 Video-LLMs on our benchmark, and the results reveal that\nstate-of-the-art models for coarse-level (video-level) understanding struggle\nto solve our fine-grained tasks, e.g., grounding event-of-interests within\nvideos, largely due to the short video context length, improper time\nrepresentations, and lack of multi-event training data. Focusing on these\nissues, we further propose a strong baseline model, E.T. Chat, together with an\ninstruction-tuning dataset E.T. Instruct 164K tailored for fine-grained\nevent-level understanding. Our simple but effective solution demonstrates\nsuperior performance in multiple scenarios.\n","authors":["Ye Liu","Zongyang Ma","Zhongang Qi","Yang Wu","Ying Shan","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18111v1.pdf","comment":"Accepted to NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2409.18104v1","updated":"2024-09-26T17:49:20Z","published":"2024-09-26T17:49:20Z","title":"Find Rhinos without Finding Rhinos: Active Learning with Multimodal\n Imagery of South African Rhino Habitats","summary":" Much of Earth's charismatic megafauna is endangered by human activities,\nparticularly the rhino, which is at risk of extinction due to the poaching\ncrisis in Africa. Monitoring rhinos' movement is crucial to their protection\nbut has unfortunately proven difficult because rhinos are elusive. Therefore,\ninstead of tracking rhinos, we propose the novel approach of mapping communal\ndefecation sites, called middens, which give information about rhinos' spatial\nbehavior valuable to anti-poaching, management, and reintroduction efforts.\nThis paper provides the first-ever mapping of rhino midden locations by\nbuilding classifiers to detect them using remotely sensed thermal, RGB, and\nLiDAR imagery in passive and active learning settings. As existing active\nlearning methods perform poorly due to the extreme class imbalance in our\ndataset, we design MultimodAL, an active learning system employing a ranking\ntechnique and multimodality to achieve competitive performance with passive\nlearning models with 94% fewer labels. Our methods could therefore save over 76\nhours in labeling time when used on a similarly-sized dataset. Unexpectedly,\nour midden map reveals that rhino middens are not randomly distributed\nthroughout the landscape; rather, they are clustered. Consequently, rangers\nshould be targeted at areas with high midden densities to strengthen\nanti-poaching efforts, in line with UN Target 15.7.\n","authors":["Lucia Gordon","Nikhil Behari","Samuel Collier","Elizabeth Bondi-Kelly","Jackson A. Killian","Catherine Ressijac","Peter Boucher","Andrew Davies","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2409.18104v1.pdf","comment":"9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good"},{"id":"http://arxiv.org/abs/2409.18102v1","updated":"2024-09-26T17:45:10Z","published":"2024-09-26T17:45:10Z","title":"MALPOLON: A Framework for Deep Species Distribution Modeling","summary":" This paper describes a deep-SDM framework, MALPOLON. Written in Python and\nbuilt upon the PyTorch library, this framework aims to facilitate training and\ninferences of deep species distribution models (deep-SDM) and sharing for users\nwith only general Python language skills (e.g., modeling ecologists) who are\ninterested in testing deep learning approaches to build new SDMs. More advanced\nusers can also benefit from the framework's modularity to run more specific\nexperiments by overriding existing classes while taking advantage of\npress-button examples to train neural networks on multiple classification tasks\nusing custom or provided raw and pre-processed datasets. The framework is\nopen-sourced on GitHub and PyPi along with extensive documentation and examples\nof use in various scenarios. MALPOLON offers straightforward installation,\nYAML-based configuration, parallel computing, multi-GPU utilization, baseline\nand foundational models for benchmarking, and extensive\ntutorials/documentation, aiming to enhance accessibility and performance\nscalability for ecologists and researchers.\n","authors":["Theo Larcher","Lukas Picek","Benjamin Deneu","Titouan Lorieul","Maximilien Servajean","Alexis Joly"],"pdf_url":"https://arxiv.org/pdf/2409.18102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18101v1","updated":"2024-09-26T17:44:52Z","published":"2024-09-26T17:44:52Z","title":"AI-Powered Augmented Reality for Satellite Assembly, Integration and\n Test","summary":" The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is\nset to transform satellite Assembly, Integration, and Testing (AIT) processes\nby enhancing precision, minimizing human error, and improving operational\nefficiency in cleanroom environments. This paper presents a technical\ndescription of the European Space Agency's (ESA) project \"AI for AR in\nSatellite AIT,\" which combines real-time computer vision and AR systems to\nassist technicians during satellite assembly. Leveraging Microsoft HoloLens 2\nas the AR interface, the system delivers context-aware instructions and\nreal-time feedback, tackling the complexities of object recognition and 6D pose\nestimation in AIT workflows. All AI models demonstrated over 70% accuracy, with\nthe detection model exceeding 95% accuracy, indicating a high level of\nperformance and reliability. A key contribution of this work lies in the\neffective use of synthetic data for training AI models in AR applications,\naddressing the significant challenges of obtaining real-world datasets in\nhighly dynamic satellite environments, as well as the creation of the Segmented\nAnything Model for Automatic Labelling (SAMAL), which facilitates the automatic\nannotation of real data, achieving speeds up to 20 times faster than manual\nhuman annotation. The findings demonstrate the efficacy of AI-driven AR systems\nin automating critical satellite assembly tasks, setting a foundation for\nfuture innovations in the space industry.\n","authors":["Alvaro Patricio","Joao Valente","Atabak Dehban","Ines Cadilha","Daniel Reis","Rodrigo Ventura"],"pdf_url":"https://arxiv.org/pdf/2409.18101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18100v1","updated":"2024-09-26T17:44:29Z","published":"2024-09-26T17:44:29Z","title":"Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine\n Segmentation","summary":" Self-supervised pretraining (SSP) has shown promising results in learning\nfrom large unlabeled datasets and, thus, could be useful for automated\ncardiovascular magnetic resonance (CMR) short-axis cine segmentation. However,\ninconsistent reports of the benefits of SSP for segmentation have made it\ndifficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP\nmethods for CMR cine segmentation.\n To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were\nused for unlabeled pretraining with four SSP methods; SimCLR, positional\ncontrastive learning, DINO, and masked image modeling (MIM). Subsets of varying\nnumbers of subjects were used for supervised fine-tuning of 2D models for each\nSSP method, as well as to train a 2D baseline model from scratch. The\nfine-tuned models were compared to the baseline using the 3D Dice similarity\ncoefficient (DSC) in a test dataset of 140 subjects.\n The SSP methods showed no performance gains with the largest supervised\nfine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects\n(231 2D slices) are available for supervised training, SSP using MIM (DSC =\n0.86) improves over training from scratch (DSC = 0.82).\n This study found that SSP is valuable for CMR cine segmentation when labeled\ntraining data is scarce, but does not aid state-of-the-art deep learning\nmethods when ample labeled data is available. Moreover, the choice of SSP\nmethod is important. The code is publicly available at:\nhttps://github.com/q-cardIA/ssp-cmr-cine-segmentation\n","authors":["Rob A. J. de Mooij","Josien P. W. Pluim","Cian M. Scannell"],"pdf_url":"https://arxiv.org/pdf/2409.18100v1.pdf","comment":"Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at\n MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.18099v1","updated":"2024-09-26T17:44:20Z","published":"2024-09-26T17:44:20Z","title":"EfficientCrackNet: A Lightweight Model for Crack Segmentation","summary":" Crack detection, particularly from pavement images, presents a formidable\nchallenge in the domain of computer vision due to several inherent complexities\nsuch as intensity inhomogeneity, intricate topologies, low contrast, and noisy\nbackgrounds. Automated crack detection is crucial for maintaining the\nstructural integrity of essential infrastructures, including buildings,\npavements, and bridges. Existing lightweight methods often face challenges\nincluding computational inefficiency, complex crack patterns, and difficult\nbackgrounds, leading to inaccurate detection and impracticality for real-world\napplications. To address these limitations, we propose EfficientCrackNet, a\nlightweight hybrid model combining Convolutional Neural Networks (CNNs) and\ntransformers for precise crack segmentation. EfficientCrackNet integrates\ndepthwise separable convolutions (DSC) layers and MobileViT block to capture\nboth global and local features. The model employs an Edge Extraction Method\n(EEM) and for efficient crack edge detection without pretraining, and\nUltra-Lightweight Subspace Attention Module (ULSAM) to enhance feature\nextraction. Extensive experiments on three benchmark datasets Crack500,\nDeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior\nperformance compared to existing lightweight models, while requiring only 0.26M\nparameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance\nbetween accuracy and computational efficiency, outperforming state-of-the-art\nlightweight models, and providing a robust and adaptable solution for\nreal-world crack segmentation.\n","authors":["Abid Hasan Zim","Aquib Iqbal","Zaid Al-Huda","Asad Malik","Minoru Kuribayash"],"pdf_url":"https://arxiv.org/pdf/2409.18099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18092v1","updated":"2024-09-26T17:39:05Z","published":"2024-09-26T17:39:05Z","title":"DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion\n Probabilistic Models","summary":" Perception systems play a crucial role in autonomous driving, incorporating\nmultiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors\nare widely used to capture sparse point clouds of the vehicle's surroundings.\nHowever, such systems struggle to perceive occluded areas and gaps in the scene\ndue to the sparsity of these point clouds and their lack of semantics. To\naddress these challenges, Semantic Scene Completion (SSC) jointly predicts\nunobserved geometry and semantics in the scene given raw LiDAR measurements,\naiming for a more complete scene representation. Building on promising results\nof diffusion models in image generation and super-resolution tasks, we propose\ntheir extension to SSC by implementing the noising and denoising diffusion\nprocesses in the point and semantic spaces individually. To control the\ngeneration, we employ semantic LiDAR point clouds as conditional input and\ndesign local and global regularization losses to stabilize the denoising\nprocess. We evaluate our approach on autonomous driving datasets and our\napproach outperforms the state-of-the-art for SSC.\n","authors":["Helin Cao","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2409.18092v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.16147v2","updated":"2024-09-26T17:31:35Z","published":"2024-09-23T00:11:30Z","title":"Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with\n Enhanced Generalization and Personalization Abilities","summary":" Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant\npotential for modeling 3D head avatars, providing greater flexibility than\nmesh-based methods and more efficient rendering compared to NeRF-based\napproaches. Despite these advancements, the creation of controllable 3DGS-based\nhead avatars remains time-intensive, often requiring tens of minutes to hours.\nTo expedite this process, we here introduce the ``Gaussian D\\'ej\\`a-vu\"\nframework, which first obtains a generalized model of the head avatar and then\npersonalizes the result. The generalized model is trained on large 2D\n(synthetic and real) image datasets. This model provides a well-initialized 3D\nGaussian head that is further refined using a monocular video to achieve the\npersonalized head avatar. For personalizing, we propose learnable\nexpression-aware rectification blendmaps to correct the initial 3D Gaussians,\nensuring rapid convergence without the reliance on neural networks. Experiments\ndemonstrate that the proposed method meets its objectives. It outperforms\nstate-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as\nwell as reduces training time consumption to at least a quarter of the existing\nmethods, producing the avatar in minutes.\n","authors":["Peizhi Yan","Rabab Ward","Qiang Tang","Shan Du"],"pdf_url":"https://arxiv.org/pdf/2409.16147v2.pdf","comment":"11 pages, Accepted by WACV 2025 in Round 1"},{"id":"http://arxiv.org/abs/2409.18083v1","updated":"2024-09-26T17:26:18Z","published":"2024-09-26T17:26:18Z","title":"Stable Video Portraits","summary":" Rapid advances in the field of generative AI and text-to-image methods in\nparticular have transformed the way we interact with and perceive\ncomputer-generated imagery today. In parallel, much progress has been made in\n3D face reconstruction, using 3D Morphable Models (3DMM). In this paper, we\npresent SVP, a novel hybrid 2D/3D generation method that outputs photorealistic\nvideos of talking faces leveraging a large pre-trained text-to-image prior\n(2D), controlled via a 3DMM (3D). Specifically, we introduce a person-specific\nfine-tuning of a general 2D stable diffusion model which we lift to a video\nmodel by providing temporal 3DMM sequences as conditioning and by introducing a\ntemporal denoising procedure. As an output, this model generates temporally\nsmooth imagery of a person with 3DMM-based controls, i.e., a person-specific\navatar. The facial appearance of this person-specific avatar can be edited and\nmorphed to text-defined celebrities, without any fine-tuning at test time. The\nmethod is analyzed quantitatively and qualitatively, and we show that our\nmethod outperforms state-of-the-art monocular head avatar methods.\n","authors":["Mirela Ostrek","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2409.18083v1.pdf","comment":"Accepted at ECCV 2024, Project: https://svp.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2409.18082v1","updated":"2024-09-26T17:26:16Z","published":"2024-09-26T17:26:16Z","title":"SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language\n Models for Robotic Garment Manipulation","summary":" Automating garment manipulation poses a significant challenge for assistive\nrobotics due to the diverse and deformable nature of garments. Traditional\napproaches typically require separate models for each garment type, which\nlimits scalability and adaptability. In contrast, this paper presents a unified\napproach using vision-language models (VLMs) to improve keypoint prediction\nacross various garment categories. By interpreting both visual and semantic\ninformation, our model enables robots to manage different garment states with a\nsingle model. We created a large-scale synthetic dataset using advanced\nsimulation techniques, allowing scalable training without extensive real-world\ndata. Experimental results indicate that the VLM-based method significantly\nenhances keypoint detection accuracy and task success rates, providing a more\nflexible and general solution for robotic garment manipulation. In addition,\nthis research also underscores the potential of VLMs to unify various garment\nmanipulation tasks within a single framework, paving the way for broader\napplications in home automation and assistive robotics for future.\n","authors":["Xin Li","Siyuan Huang","Qiaojun Yu","Zhengkai Jiang","Ce Hao","Yimeng Zhu","Hongsheng Li","Peng Gao","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.18082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18071v1","updated":"2024-09-26T17:18:39Z","published":"2024-09-26T17:18:39Z","title":"FreeEdit: Mask-free Reference-based Image Editing with Multi-modal\n Instruction","summary":" Introducing user-specified visual concepts in image editing is highly\npractical as these concepts convey the user's intent more precisely than\ntext-based descriptions. We propose FreeEdit, a novel approach for achieving\nsuch reference-based image editing, which can accurately reproduce the visual\nconcept from the reference image based on user-friendly language instructions.\nOur approach leverages the multi-modal instruction encoder to encode language\ninstructions to guide the editing process. This implicit way of locating the\nediting area eliminates the need for manual editing masks. To enhance the\nreconstruction of reference details, we introduce the Decoupled Residual\nReferAttention (DRRA) module. This module is designed to integrate fine-grained\nreference features extracted by a detail extractor into the image editing\nprocess in a residual way without interfering with the original self-attention.\nGiven that existing datasets are unsuitable for reference-based image editing\ntasks, particularly due to the difficulty in constructing image triplets that\ninclude a reference image, we curate a high-quality dataset, FreeBench, using a\nnewly developed twice-repainting scheme. FreeBench comprises the images before\nand after editing, detailed editing instructions, as well as a reference image\nthat maintains the identity of the edited object, encompassing tasks such as\nobject addition, replacement, and deletion. By conducting phased training on\nFreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot\nediting through convenient language instructions. We conduct extensive\nexperiments to evaluate the effectiveness of FreeEdit across multiple task\ntypes, demonstrating its superiority over existing methods. The code will be\navailable at: https://freeedit.github.io/.\n","authors":["Runze He","Kai Ma","Linjiang Huang","Shaofei Huang","Jialin Gao","Xiaoming Wei","Jiao Dai","Jizhong Han","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18071v1.pdf","comment":"14 pages, 14 figures, project website: https://freeedit.github.io/"},{"id":"http://arxiv.org/abs/2409.18057v1","updated":"2024-09-26T17:00:02Z","published":"2024-09-26T17:00:02Z","title":"LightAvatar: Efficient Head Avatar as Dynamic Neural Light Field","summary":" Recent works have shown that neural radiance fields (NeRFs) on top of\nparametric models have reached SOTA quality to build photorealistic head\navatars from a monocular video. However, one major limitation of the NeRF-based\navatars is the slow rendering speed due to the dense point sampling of NeRF,\npreventing them from broader utility on resource-constrained devices. We\nintroduce LightAvatar, the first head avatar model based on neural light fields\n(NeLFs). LightAvatar renders an image from 3DMM parameters and a camera pose\nvia a single network forward pass, without using mesh or volume rendering. The\nproposed approach, while being conceptually appealing, poses a significant\nchallenge towards real-time efficiency and training stability. To resolve them,\nwe introduce dedicated network designs to obtain proper representations for the\nNeLF model and maintain a low FLOPs budget. Meanwhile, we tap into a\ndistillation-based training strategy that uses a pretrained avatar model as\nteacher to synthesize abundant pseudo data for training. A warping field\nnetwork is introduced to correct the fitting error in the real data so that the\nmodel can learn better. Extensive experiments suggest that our method can\nachieve new SOTA image quality quantitatively or qualitatively, while being\nsignificantly faster than the counterparts, reporting 174.1 FPS (512x512\nresolution) on a consumer-grade GPU (RTX3090) with no customized optimization.\n","authors":["Huan Wang","Feitong Tan","Ziqian Bai","Yinda Zhang","Shichen Liu","Qiangeng Xu","Menglei Chai","Anish Prabhu","Rohit Pandey","Sean Fanello","Zeng Huang","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2409.18057v1.pdf","comment":"Appear in ECCV'24 CADL Workshop. Code:\n https://github.com/MingSun-Tse/LightAvatar-TensorFlow"},{"id":"http://arxiv.org/abs/2409.18055v1","updated":"2024-09-26T16:59:01Z","published":"2024-09-26T16:59:01Z","title":"Visual Data Diagnosis and Debiasing with Concept Graphs","summary":" The widespread success of deep learning models today is owed to the curation\nof extensive datasets significant in size and complexity. However, such models\nfrequently pick up inherent biases in the data during the training process,\nleading to unreliable predictions. Diagnosing and debiasing datasets is thus a\nnecessity to ensure reliable model performance. In this paper, we present\nCONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence\nBiases in visual datasets. CONBIAS represents visual datasets as knowledge\ngraphs of concepts, enabling meticulous analysis of spurious concept\nco-occurrences to uncover concept imbalances across the whole dataset.\nMoreover, we show that by employing a novel clique-based concept balancing\nstrategy, we can mitigate these imbalances, leading to enhanced performance on\ndownstream tasks. Extensive experiments show that data augmentation based on a\nbalanced concept distribution augmented by CONBIAS improves generalization\nperformance across multiple datasets compared to state-of-the-art methods. We\nwill make our code and data publicly available.\n","authors":["Rwiddhi Chakraborty","Yinong Wang","Jialu Gao","Runkai Zheng","Cheng Zhang","Fernando De la Torre"],"pdf_url":"https://arxiv.org/pdf/2409.18055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08168v3","updated":"2024-09-26T16:51:37Z","published":"2023-12-13T14:27:45Z","title":"Chat-Scene: Bridging 3D Scene and Large Language Models with Object\n Identifiers","summary":" Recent advancements in 3D Large Language Models (LLMs) have demonstrated\npromising capabilities for 3D scene understanding. However, previous methods\nexhibit deficiencies in general referencing and grounding capabilities for\nintricate scene comprehension. In this paper, we introduce the use of object\nidentifiers and object-centric representations to interact with scenes at the\nobject level. Specifically, we decompose the input 3D scene into a set of\nobject proposals, each assigned a unique identifier token, which enables\nefficient object referencing and grounding during user-assistant interactions.\nGiven the scarcity of scene-language data, we model the scene embeddings as a\nsequence of explicit object-level embeddings, derived from semantic-rich 2D or\n3D representations. By employing object identifiers, we transform diverse 3D\nscene-language tasks into a unified question-answering format, facilitating\njoint training without the need for additional task-specific heads. With\nminimal fine-tuning on all downstream tasks, our model significantly\noutperforms existing methods on benchmarks including ScanRefer, Multi3DRefer,\nScan2Cap, ScanQA, and SQA3D.\n","authors":["Haifeng Huang","Yilun Chen","Zehan Wang","Rongjie Huang","Runsen Xu","Tai Wang","Luping Liu","Xize Cheng","Yang Zhao","Jiangmiao Pang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.08168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.18042v1","updated":"2024-09-26T16:44:02Z","published":"2024-09-26T16:44:02Z","title":"EMOVA: Empowering Language Models to See, Hear and Speak with Vivid\n Emotions","summary":" GPT-4o, an omni-modal model that enables vocal conversations with diverse\nemotions and tones, marks a milestone for omni-modal foundation models.\nHowever, empowering Large Language Models to perceive and generate images,\ntexts, and speeches end-to-end with publicly available data remains challenging\nin the open-source community. Existing vision-language models rely on external\ntools for the speech processing, while speech-language models still suffer from\nlimited or even without vision-understanding abilities. To address this gap, we\npropose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large\nLanguage Models with end-to-end speech capabilities while maintaining the\nleading vision-language performance. With a semantic-acoustic disentangled\nspeech tokenizer, we notice surprisingly that omni-modal alignment can further\nenhance vision-language and speech abilities compared with the corresponding\nbi-modal aligned counterparts. Moreover, a lightweight style module is proposed\nfor flexible speech style controls (e.g., emotions and pitches). For the first\ntime, EMOVA achieves state-of-the-art performance on both the vision-language\nand speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue\nwith vivid emotions.\n","authors":["Kai Chen","Yunhao Gou","Runhui Huang","Zhili Liu","Daxin Tan","Jing Xu","Chunwei Wang","Yi Zhu","Yihan Zeng","Kuo Yang","Dingdong Wang","Kun Xiang","Haoyuan Li","Haoli Bai","Jianhua Han","Xiaohui Li","Weike Jin","Nian Xie","Yu Zhang","James T. Kwok","Hengshuang Zhao","Xiaodan Liang","Dit-Yan Yeung","Xiao Chen","Zhenguo Li","Wei Zhang","Qun Liu","Lanqing Hong","Lu Hou","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.18042v1.pdf","comment":"Project Page: https://emova-ollm.github.io/"},{"id":"http://arxiv.org/abs/2311.04591v4","updated":"2024-09-26T16:35:58Z","published":"2023-11-08T10:45:09Z","title":"Exploring Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nAlthough traditional cameras are commonly applied, their reliability decreases\nin scenarios under high dynamic range or heavy motion blur, where event cameras\noffer a robust solution. Predominant event-based methods accumulate events into\nframes, ignoring the asynchronous and high temporal resolution that is crucial\nfor distinguishing distinct actions. To address this issue and to unlock the 3D\npotential of event information, we introduce two 3D event representations: the\nRasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The\nRasEPC aggregates events within concise temporal slices at identical positions,\npreserving their 3D attributes along with statistical information, thereby\nsignificantly reducing memory and computational demands. Meanwhile, the DEV\nrepresentation discretizes events into voxels and projects them across three\northogonal planes, utilizing decoupled event attention to retrieve 3D cues from\nthe 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic\nevent-based dataset crafted to facilitate training and quantitative analysis in\noutdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD\ndataset, and our EV-3DPW dataset, with further qualitative validation via a\nderived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our\ncode and dataset have been made publicly available at\nhttps://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v4.pdf","comment":"Accepted to Computer Vision and Image Understanding (CVPU). Extended\n version of arXiv:2206.04511. The code and dataset are available at\n https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2409.18026v1","updated":"2024-09-26T16:33:16Z","published":"2024-09-26T16:33:16Z","title":"ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty\n Learning","summary":" Vision-centric semantic occupancy prediction plays a crucial role in\nautonomous driving, which requires accurate and reliable predictions from\nlow-cost sensors. Although having notably narrowed the accuracy gap with LiDAR,\nthere is still few research effort to explore the reliability in predicting\nsemantic occupancy from camera. In this paper, we conduct a comprehensive\nevaluation of existing semantic occupancy prediction models from a reliability\nperspective for the first time. Despite the gradual alignment of camera-based\nmodels with LiDAR in term of accuracy, a significant reliability gap persists.\nTo addresses this concern, we propose ReliOcc, a method designed to enhance the\nreliability of camera-based occupancy networks. ReliOcc provides a\nplug-and-play scheme for existing models, which integrates hybrid uncertainty\nfrom individual voxels with sampling-based noise and relative voxels through\nmix-up learning. Besides, an uncertainty-aware calibration strategy is devised\nto further enhance model reliability in offline mode. Extensive experiments\nunder various settings demonstrate that ReliOcc significantly enhances model\nreliability while maintaining the accuracy of both geometric and semantic\npredictions. Importantly, our proposed approach exhibits robustness to sensor\nfailures and out of domain noises during inference.\n","authors":["Song Wang","Zhongdao Wang","Jiawei Yu","Wentong Li","Bailan Feng","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.18026v1.pdf","comment":"Technical report. Work in progress"},{"id":"http://arxiv.org/abs/2409.18017v1","updated":"2024-09-26T16:25:48Z","published":"2024-09-26T16:25:48Z","title":"Transferring disentangled representations: bridging the gap between\n synthetic and real images","summary":" Developing meaningful and efficient representations that separate the\nfundamental structure of the data generation mechanism is crucial in\nrepresentation learning. However, Disentangled Representation Learning has not\nfully shown its potential on real images, because of correlated generative\nfactors, their resolution and limited access to ground truth labels.\nSpecifically on the latter, we investigate the possibility of leveraging\nsynthetic data to learn general-purpose disentangled representations applicable\nto real data, discussing the effect of fine-tuning and what properties of\ndisentanglement are preserved after the transfer. We provide an extensive\nempirical study to address these issues. In addition, we propose a new\ninterpretable intervention-based metric, to measure the quality of factors\nencoding in the representation. Our results indicate that some level of\ndisentanglement, transferring a representation from synthetic to real data, is\npossible and effective.\n","authors":["Jacopo Dapueto","Nicoletta Noceti","Francesca Odone"],"pdf_url":"https://arxiv.org/pdf/2409.18017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14579v2","updated":"2024-09-26T16:25:33Z","published":"2023-12-22T10:15:15Z","title":"Synthesizing Environment-Specific People in Photographs","summary":" We present ESP, a novel method for context-aware full-body generation, that\nenables photo-realistic synthesis and inpainting of people wearing clothing\nthat is semantically appropriate for the scene depicted in an input photograph.\nESP is conditioned on a 2D pose and contextual cues that are extracted from the\nphotograph of the scene and integrated into the generation process, where the\nclothing is modeled explicitly with human parsing masks (HPM). Generated HPMs\nare used as tight guiding masks for inpainting, such that no changes are made\nto the original background. Our models are trained on a dataset containing a\nset of in-the-wild photographs of people covering a wide range of different\nenvironments. The method is analyzed quantitatively and qualitatively, and we\nshow that ESP outperforms the state-of-the-art on the task of contextual\nfull-body generation.\n","authors":["Mirela Ostrek","Carol O'Sullivan","Michael J. Black","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2312.14579v2.pdf","comment":"Accepted at ECCV 2024, Project: https://esp.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2406.08113v3","updated":"2024-09-26T16:14:54Z","published":"2024-06-12T11:50:51Z","title":"Valeo4Cast: A Modular Approach to End-to-End Forecasting","summary":" Motion forecasting is crucial in autonomous driving systems to anticipate the\nfuture trajectories of surrounding agents such as pedestrians, vehicles, and\ntraffic signals. In end-to-end forecasting, the model must jointly detect and\ntrack from sensor data (cameras or LiDARs) the past trajectories of the\ndifferent elements of the scene and predict their future locations. We depart\nfrom the current trend of tackling this task via end-to-end training from\nperception to forecasting, and instead use a modular approach. We individually\nbuild and train detection, tracking and forecasting modules. We then only use\nconsecutive finetuning steps to integrate the modules better and alleviate\ncompounding errors. We conduct an in-depth study on the finetuning strategies\nand it reveals that our simple yet effective approach significantly improves\nperformance on the end-to-end forecasting benchmark. Consequently, our solution\nranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82\nmAPf. We surpass forecasting results by +17.1 points over last year's winner\nand by +13.3 points over this year's runner-up. This remarkable performance in\nforecasting can be explained by our modular paradigm, which integrates\nfinetuning strategies and significantly outperforms the end-to-end-trained\ncounterparts. The code, model weights and results are made available\nhttps://github.com/valeoai/valeo4cast.\n","authors":["Yihong Xu","Éloi Zablocki","Alexandre Boulch","Gilles Puy","Mickael Chen","Florent Bartoccioni","Nermin Samet","Oriane Siméoni","Spyros Gidaris","Tuan-Hung Vu","Andrei Bursuc","Eduardo Valle","Renaud Marlet","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2406.08113v3.pdf","comment":"Winning solution of the Argoverse 2 \"Unified Detection, Tracking, and\n Forecasting\" challenge; work accepted at Road++ ECCVW 2024"},{"id":"http://arxiv.org/abs/2312.05295v2","updated":"2024-09-26T16:11:37Z","published":"2023-12-08T18:43:12Z","title":"Disentangled Clothed Avatar Generation from Text Descriptions","summary":" In this paper, we introduce a novel text-to-avatar generation method that\nseparately generates the human body and the clothes and allows high-quality\nanimation on the generated avatar. While recent advancements in text-to-avatar\ngeneration have yielded diverse human avatars from text prompts, these methods\ntypically combine all elements-clothes, hair, and body-into a single 3D\nrepresentation. Such an entangled approach poses challenges for downstream\ntasks like editing or animation. To overcome these limitations, we propose a\nnovel disentangled 3D avatar representation named Sequentially Offset-SMPL\n(SO-SMPL), building upon the SMPL model. SO-SMPL represents the human body and\nclothes with two separate meshes but associates them with offsets to ensure the\nphysical alignment between the body and the clothes. Then, we design a Score\nDistillation Sampling (SDS)-based distillation framework to generate the\nproposed SO-SMPL representation from text prompts. Our approach not only\nachieves higher texture and geometry quality and better semantic alignment with\ntext prompts, but also significantly improves the visual quality of character\nanimation, virtual try-on, and avatar editing. Project page:\nhttps://shanemankiw.github.io/SO-SMPL/.\n","authors":["Jionghao Wang","Yuan Liu","Zhiyang Dou","Zhengming Yu","Yongqing Liang","Cheng Lin","Xin Li","Wenping Wang","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2312.05295v2.pdf","comment":"Project page: https://shanemankiw.github.io/SO-SMPL/"},{"id":"http://arxiv.org/abs/2409.17996v1","updated":"2024-09-26T16:07:24Z","published":"2024-09-26T16:07:24Z","title":"PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless\n Imaging","summary":" Lensless cameras offer significant advantages in size, weight, and cost\ncompared to traditional lens-based systems. Without a focusing lens, lensless\ncameras rely on computational algorithms to recover the scenes from multiplexed\nmeasurements. However, current algorithms struggle with inaccurate forward\nimaging models and insufficient priors to reconstruct high-quality images. To\novercome these limitations, we introduce a novel two-stage approach for\nconsistent and photorealistic lensless image reconstruction. The first stage of\nour approach ensures data consistency by focusing on accurately reconstructing\nthe low-frequency content with a spatially varying deconvolution method that\nadjusts to changes in the Point Spread Function (PSF) across the camera's field\nof view. The second stage enhances photorealism by incorporating a generative\nprior from pre-trained diffusion models. By conditioning on the low-frequency\ncontent retrieved in the first stage, the diffusion model effectively\nreconstructs the high-frequency details that are typically lost in the lensless\nimaging process, while also maintaining image fidelity. Our method achieves a\nsuperior balance between data fidelity and visual quality compared to existing\nmethods, as demonstrated with two popular lensless systems, PhlatCam and\nDiffuserCam. Project website: https://phocolens.github.io/.\n","authors":["Xin Cai","Zhiyuan You","Hailong Zhang","Wentao Liu","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2409.17996v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17993v1","updated":"2024-09-26T16:04:31Z","published":"2024-09-26T16:04:31Z","title":"InterNet: Unsupervised Cross-modal Homography Estimation Based on\n Interleaved Modality Transfer and Self-supervised Homography Prediction","summary":" We propose a novel unsupervised cross-modal homography estimation framework,\nbased on interleaved modality transfer and self-supervised homography\nprediction, named InterNet. InterNet integrates modality transfer and\nself-supervised homography estimation, introducing an innovative interleaved\noptimization framework to alternately promote both components. The modality\ntransfer gradually narrows the modality gaps, facilitating the self-supervised\nhomography estimation to fully leverage the synthetic intra-modal data. The\nself-supervised homography estimation progressively achieves reliable\npredictions, thereby providing robust cross-modal supervision for the modality\ntransfer. To further boost the estimation accuracy, we also formulate a\nfine-grained homography feature loss to improve the connection between two\ncomponents. Furthermore, we employ a simple yet effective distillation training\ntechnique to reduce model parameters and improve cross-domain generalization\nability while maintaining comparable performance. Experiments reveal that\nInterNet achieves the state-of-the-art (SOTA) performance among unsupervised\nmethods, and even outperforms many supervised methods such as MHN and\nLocalTrans.\n","authors":["Junchen Yu","Si-Yuan Cao","Runmin Zhang","Chenghao Zhang","Jianxin Hu","Zhu Yu","Hui-liang Shen"],"pdf_url":"https://arxiv.org/pdf/2409.17993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17988v1","updated":"2024-09-26T15:57:20Z","published":"2024-09-26T15:57:20Z","title":"Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or\n Low-light Conditions","summary":" The stark contrast in the design philosophy of an event camera makes it\nparticularly ideal for operating under high-speed, high dynamic range and\nlow-light conditions, where standard cameras underperform. Nonetheless, event\ncameras still suffer from some amount of motion blur, especially under these\nchallenging conditions, in contrary to what most think. This is attributed to\nthe limited bandwidth of the event sensor pixel, which is mostly proportional\nto the light intensity. Thus, to ensure that event cameras can truly excel in\nsuch conditions where it has an edge over standard cameras, it is crucial to\naccount for event motion blur in downstream applications, especially\nreconstruction. However, none of the recent works on reconstructing Neural\nRadiance Fields (NeRFs) from events, nor event simulators, have considered the\nfull effects of event motion blur. To this end, we propose, Deblur e-NeRF, a\nnovel method to directly and effectively reconstruct blur-minimal NeRFs from\nmotion-blurred events generated under high-speed motion or low-light\nconditions. The core component of this work is a physically-accurate pixel\nbandwidth model proposed to account for event motion blur under arbitrary speed\nand lighting conditions. We also introduce a novel threshold-normalized total\nvariation loss to improve the regularization of large textureless patches.\nExperiments on real and novel realistically simulated sequences verify our\neffectiveness. Our code, event simulator and synthetic event dataset will be\nopen-sourced.\n","authors":["Weng Fei Low","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2409.17988v1.pdf","comment":"Accepted to ECCV 2024. Project website is accessible at\n https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with\n arXiv:2006.07722 by other authors"},{"id":"http://arxiv.org/abs/2409.17987v1","updated":"2024-09-26T15:57:08Z","published":"2024-09-26T15:57:08Z","title":"LLM4Brain: Training a Large Language Model for Brain Video Understanding","summary":" Decoding visual-semantic information from brain signals, such as functional\nMRI (fMRI), across different subjects poses significant challenges, including\nlow signal-to-noise ratio, limited data availability, and cross-subject\nvariability. Recent advancements in large language models (LLMs) show\nremarkable effectiveness in processing multimodal information. In this study,\nwe introduce an LLM-based approach for reconstructing visual-semantic\ninformation from fMRI signals elicited by video stimuli. Specifically, we\nemploy fine-tuning techniques on an fMRI encoder equipped with adaptors to\ntransform brain responses into latent representations aligned with the video\nstimuli. Subsequently, these representations are mapped to textual modality by\nLLM. In particular, we integrate self-supervised domain adaptation methods to\nenhance the alignment between visual-semantic information and brain responses.\nOur proposed method achieves good results using various quantitative semantic\nmetrics, while yielding similarity with ground-truth information.\n","authors":["Ruizhe Zheng","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2409.17987v1.pdf","comment":"ECCV2024 Workshop"},{"id":"http://arxiv.org/abs/2409.17981v1","updated":"2024-09-26T15:54:18Z","published":"2024-09-26T15:54:18Z","title":"BlinkTrack: Feature Tracking over 100 FPS via Events and Images","summary":" Feature tracking is crucial for, structure from motion (SFM), simultaneous\nlocalization and mapping (SLAM), object tracking and various computer vision\ntasks. Event cameras, known for their high temporal resolution and ability to\ncapture asynchronous changes, have gained significant attention for their\npotential in feature tracking, especially in challenging conditions. However,\nevent cameras lack the fine-grained texture information that conventional\ncameras provide, leading to error accumulation in tracking. To address this, we\npropose a novel framework, BlinkTrack, which integrates event data with RGB\nimages for high-frequency feature tracking. Our method extends the traditional\nKalman filter into a learning-based framework, utilizing differentiable Kalman\nfilters in both event and image branches. This approach improves\nsingle-modality tracking, resolves ambiguities, and supports asynchronous data\nfusion. We also introduce new synthetic and augmented datasets to better\nevaluate our model. Experimental results indicate that BlinkTrack significantly\noutperforms existing event-based methods, exceeding 100 FPS with preprocessed\nevent data and 80 FPS with multi-modality data.\n","authors":["Yichen Shen","Yijin Li","Shuo Chen","Guanglin Li","Zhaoyang Huang","Hujun Bao","Zhaopeng Cui","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17978v1","updated":"2024-09-26T15:52:36Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17977v1","updated":"2024-09-26T15:52:34Z","published":"2024-09-26T15:52:34Z","title":"Cross-Modality Attack Boosted by Gradient-Evolutionary Multiform\n Optimization","summary":" In recent years, despite significant advancements in adversarial attack\nresearch, the security challenges in cross-modal scenarios, such as the\ntransferability of adversarial attacks between infrared, thermal, and RGB\nimages, have been overlooked. These heterogeneous image modalities collected by\ndifferent hardware devices are widely prevalent in practical applications, and\nthe substantial differences between modalities pose significant challenges to\nattack transferability. In this work, we explore a novel cross-modal\nadversarial attack strategy, termed multiform attack. We propose a dual-layer\noptimization framework based on gradient-evolution, facilitating efficient\nperturbation transfer between modalities. In the first layer of optimization,\nthe framework utilizes image gradients to learn universal perturbations within\neach modality and employs evolutionary algorithms to search for shared\nperturbations with transferability across different modalities through\nsecondary optimization. Through extensive testing on multiple heterogeneous\ndatasets, we demonstrate the superiority and robustness of Multiform Attack\ncompared to existing techniques. This work not only enhances the\ntransferability of cross-modal adversarial attacks but also provides a new\nperspective for understanding security vulnerabilities in cross-modal systems.\n","authors":["Yunpeng Gong","Qingyuan Zeng","Dejun Xu","Zhenzhong Wang","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.17977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17963v1","updated":"2024-09-26T15:41:18Z","published":"2024-09-26T15:41:18Z","title":"CNCA: Toward Customizable and Natural Generation of Adversarial\n Camouflage for Vehicle Detectors","summary":" Prior works on physical adversarial camouflage against vehicle detectors\nmainly focus on the effectiveness and robustness of the attack. The current\nmost successful methods optimize 3D vehicle texture at a pixel level. However,\nthis results in conspicuous and attention-grabbing patterns in the generated\ncamouflage, which humans can easily identify. To address this issue, we propose\na Customizable and Natural Camouflage Attack (CNCA) method by leveraging an\noff-the-shelf pre-trained diffusion model. By sampling the optimal texture\nimage from the diffusion model with a user-specific text prompt, our method can\ngenerate natural and customizable adversarial camouflage while maintaining high\nattack performance. With extensive experiments on the digital and physical\nworlds and user studies, the results demonstrate that our proposed method can\ngenerate significantly more natural-looking camouflage than the\nstate-of-the-art baselines while achieving competitive attack performance. Our\ncode is available at\n\\href{https://anonymous.4open.science/r/CNCA-1D54}{https://anonymous.4open.science/r/CNCA-1D54}\n","authors":["Linye Lyu","Jiawei Zhou","Daojing He","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2409.17963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10814v3","updated":"2024-09-26T15:37:58Z","published":"2023-08-21T16:03:35Z","title":"Jumping through Local Minima: Quantization in the Loss Landscape of\n Vision Transformers","summary":" Quantization scale and bit-width are the most important parameters when\nconsidering how to quantize a neural network. Prior work focuses on optimizing\nquantization scales in a global manner through gradient methods (gradient\ndescent \\& Hessian analysis). Yet, when applying perturbations to quantization\nscales, we observe a very jagged, highly non-smooth test loss landscape. In\nfact, small perturbations in quantization scale can greatly affect accuracy,\nyielding a $0.5-0.8\\%$ accuracy boost in 4-bit quantized vision transformers\n(ViTs). In this regime, gradient methods break down, since they cannot reliably\nreach local minima. In our work, dubbed Evol-Q, we use evolutionary search to\neffectively traverse the non-smooth landscape. Additionally, we propose using\nan infoNCE loss, which not only helps combat overfitting on the small\ncalibration dataset ($1,000$ images) but also makes traversing such a highly\nnon-smooth surface easier. Evol-Q improves the top-1 accuracy of a fully\nquantized ViT-Base by $10.30\\%$, $0.78\\%$, and $0.15\\%$ for $3$-bit, $4$-bit,\nand $8$-bit weight quantization levels. Extensive experiments on a variety of\nCNN and ViT architectures further demonstrate its robustness in extreme\nquantization scenarios. Our code is available at\nhttps://github.com/enyac-group/evol-q\n","authors":["Natalia Frumkin","Dibakar Gope","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2308.10814v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2211.09643"},{"id":"http://arxiv.org/abs/2409.17958v1","updated":"2024-09-26T15:36:10Z","published":"2024-09-26T15:36:10Z","title":"The Hard Positive Truth about Vision-Language Compositionality","summary":" Several benchmarks have concluded that our best vision-language models (e.g.,\nCLIP) are lacking in compositionality. Given an image, these benchmarks probe a\nmodel's ability to identify its associated caption amongst a set of\ncompositional distractors. In response, a surge of recent proposals show\nimprovements by finetuning CLIP with distractors as hard negatives. Our\ninvestigations reveal that these improvements have, in fact, been significantly\noverstated -- because existing benchmarks do not probe whether finetuned\nvision-language models remain invariant to hard positives. By curating an\nevaluation dataset with 112,382 hard negatives and hard positives, we uncover\nthat including hard positives decreases CLIP's performance by 12.9%, while\nhumans perform effortlessly at 99%. CLIP finetuned with hard negatives results\nin an even larger decrease, up to 38.7%. With this finding, we then produce a\n1,775,259 image-text training set with both hard negative and hard positive\ncaptions. By training with both, we see improvements on existing benchmarks\nwhile simultaneously improving performance on hard positives, indicating a more\nrobust improvement in compositionality. Our work suggests the need for future\nresearch to rigorously test and improve CLIP's understanding of semantic\nrelationships between related \"positive\" concepts.\n","authors":["Amita Kamath","Cheng-Yu Hsieh","Kai-Wei Chang","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2409.17958v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2312.14115v4","updated":"2024-09-26T15:30:00Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Visual Question Answering for Autonomous Driving","summary":" We introduce LingoQA, a novel dataset and benchmark for visual question\nanswering in autonomous driving. The dataset contains 28K unique short video\nscenarios, and 419K annotations. Evaluating state-of-the-art vision-language\nmodels on our benchmark shows that their performance is below human\ncapabilities, with GPT-4V responding truthfully to 59.6% of the questions\ncompared to 96.6% for humans. For evaluation, we propose a truthfulness\nclassifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation\ncoefficient to human evaluations, surpassing existing techniques like METEOR,\nBLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run\nextensive ablation studies to understand its performance. We release our\ndataset and benchmark as an evaluation platform for vision-language models in\nautonomous driving.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Elahe Arani","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v4.pdf","comment":"Accepted to ECCV 2024. Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2409.17951v1","updated":"2024-09-26T15:28:25Z","published":"2024-09-26T15:28:25Z","title":"Spatial Hierarchy and Temporal Attention Guided Cross Masking for\n Self-supervised Skeleton-based Action Recognition","summary":" In self-supervised skeleton-based action recognition, the mask reconstruction\nparadigm is gaining interest in enhancing model refinement and robustness\nthrough effective masking. However, previous works primarily relied on a single\nmasking criterion, resulting in the model overfitting specific features and\noverlooking other effective information. In this paper, we introduce a\nhierarchy and attention guided cross-masking framework (HA-CM) that applies\nmasking to skeleton sequences from both spatial and temporal perspectives.\nSpecifically, in spatial graphs, we utilize hyperbolic space to maintain joint\ndistinctions and effectively preserve the hierarchical structure of\nhigh-dimensional skeletons, employing joint hierarchy as the masking criterion.\nIn temporal flows, we substitute traditional distance metrics with the global\nattention of joints for masking, addressing the convergence of distances in\nhigh-dimensional space and the lack of a global perspective. Additionally, we\nincorporate cross-contrast loss based on the cross-masking framework into the\nloss function to enhance the model's learning of instance-level features. HA-CM\nshows efficiency and universality on three public large-scale datasets, NTU-60,\nNTU-120, and PKU-MMD. The source code of our HA-CM is available at\nhttps://github.com/YinxPeng/HA-CM-main.\n","authors":["Xinpeng Yin","Wenming Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17951v1.pdf","comment":"12 pages,6 figures,IEEE Trans"},{"id":"http://arxiv.org/abs/2409.17941v1","updated":"2024-09-26T15:16:32Z","published":"2024-09-26T15:16:32Z","title":"Perturb, Attend, Detect and Localize (PADL): Robust Proactive Image\n Defense","summary":" Image manipulation detection and localization have received considerable\nattention from the research community given the blooming of Generative Models\n(GMs). Detection methods that follow a passive approach may overfit to specific\nGMs, limiting their application in real-world scenarios, due to the growing\ndiversity of generative models. Recently, approaches based on a proactive\nframework have shown the possibility of dealing with this limitation. However,\nthese methods suffer from two main limitations, which raises concerns about\npotential vulnerabilities: i) the manipulation detector is not robust to noise\nand hence can be easily fooled; ii) the fact that they rely on fixed\nperturbations for image protection offers a predictable exploit for malicious\nattackers, enabling them to reverse-engineer and evade detection. To overcome\nthis issue we propose PADL, a new solution able to generate image-specific\nperturbations using a symmetric scheme of encoding and decoding based on\ncross-attention, which drastically reduces the possibility of reverse\nengineering, even when evaluated with adaptive attack [31]. Additionally, PADL\nis able to pinpoint manipulated areas, facilitating the identification of\nspecific regions that have undergone alterations, and has more generalization\npower than prior art on held-out generative models. Indeed, although being\ntrained only on an attribute manipulation GAN model [15], our method\ngeneralizes to a range of unseen models with diverse architectural designs,\nsuch as StarGANv2, BlendGAN, DiffAE, StableDiffusion and StableDiffusionXL.\nAdditionally, we introduce a novel evaluation protocol, which offers a fair\nevaluation of localisation performance in function of detection accuracy and\nbetter captures real-world scenarios.\n","authors":["Filippo Bartolucci","Iacopo Masi","Giuseppe Lisanti"],"pdf_url":"https://arxiv.org/pdf/2409.17941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15268v4","updated":"2024-09-26T15:10:58Z","published":"2023-12-23T14:36:27Z","title":"Manydepth2: Motion-Aware Self-Supervised Monocular Depth Estimation in\n Dynamic Scenes","summary":" Despite advancements in self-supervised monocular depth estimation,\nchallenges persist in dynamic scenarios due to the dependence on assumptions\nabout a static world. In this paper, we present Manydepth2, a Motion-Guided\nCost Volume Depth Net, to achieve precise depth estimation for both dynamic\nobjects and static backgrounds, all while maintaining computational efficiency.\nTo tackle the challenges posed by dynamic content, we incorporate optical flow\nand coarse monocular depth to create a novel static reference frame. This frame\nis then utilized to build a motion-guided cost volume in collaboration with the\ntarget frame. Additionally, to enhance the accuracy and resilience of the\nnetwork structure, we introduce an attention-based depth net architecture to\neffectively integrate information from feature maps with varying resolutions.\nCompared to methods with similar computational costs, Manydepth2 achieves a\nsignificant reduction of approximately five percent in root-mean-square error\nfor self-supervised monocular depth estimation on the KITTI-2015 dataset. The\ncode could be found: https://github.com/kaichen-z/Manydepth2\n","authors":["Kaichen Zhou","Jia-Wang Bian","Qian Xie","Jian-Qing Zheng","Niki Trigoni","Andrew Markham"],"pdf_url":"https://arxiv.org/pdf/2312.15268v4.pdf","comment":"Monocular Depth Estimation, Self-Supervised, Optical Flow"},{"id":"http://arxiv.org/abs/2409.07714v2","updated":"2024-09-26T15:05:43Z","published":"2024-09-12T02:50:04Z","title":"CollaMamba: Efficient Collaborative Perception with Cross-Agent\n Spatial-Temporal State Space Model","summary":" By sharing complementary perceptual information, multi-agent collaborative\nperception fosters a deeper understanding of the environment. Recent studies on\ncollaborative perception mostly utilize CNNs or Transformers to learn feature\nrepresentation and fusion in the spatial dimension, which struggle to handle\nlong-range spatial-temporal features under limited computing and communication\nresources. Holistically modeling the dependencies over extensive spatial areas\nand extended temporal frames is crucial to enhancing feature quality. To this\nend, we propose a resource efficient cross-agent spatial-temporal collaborative\nstate space model (SSM), named CollaMamba. Initially, we construct a\nfoundational backbone network based on spatial SSM. This backbone adeptly\ncaptures positional causal dependencies from both single-agent and cross-agent\nviews, yielding compact and comprehensive intermediate features while\nmaintaining linear complexity. Furthermore, we devise a history-aware feature\nboosting module based on temporal SSM, extracting contextual cues from extended\nhistorical frames to refine vague features while preserving low overhead.\nExtensive experiments across several datasets demonstrate that CollaMamba\noutperforms state-of-the-art methods, achieving higher model accuracy while\nreducing computational and communication overhead by up to 71.9% and 1/64,\nrespectively. This work pioneers the exploration of the Mamba's potential in\ncollaborative perception. The source code will be made available.\n","authors":["Yang Li","Quan Yuan","Guiyang Luo","Xiaoyuan Fu","Xuanhan Zhu","Yujia Yang","Rui Pan","Jinglin Li"],"pdf_url":"https://arxiv.org/pdf/2409.07714v2.pdf","comment":"Submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17924v1","updated":"2024-09-26T15:05:29Z","published":"2024-09-26T15:05:29Z","title":"Neural Light Spheres for Implicit Image Stitching and View Synthesis","summary":" Challenging to capture, and challenging to display on a cellphone screen, the\npanorama paradoxically remains both a staple and underused feature of modern\nmobile camera applications. In this work we address both of these challenges\nwith a spherical neural light field model for implicit panoramic image\nstitching and re-rendering; able to accommodate for depth parallax,\nview-dependent lighting, and local scene motion and color changes during\ncapture. Fit during test-time to an arbitrary path panoramic video capture --\nvertical, horizontal, random-walk -- these neural light spheres jointly\nestimate the camera path and a high-resolution scene reconstruction to produce\nnovel wide field-of-view projections of the environment. Our single-layer model\navoids expensive volumetric sampling, and decomposes the scene into compact\nview-dependent ray offset and color components, with a total model size of 80\nMB per scene, and real-time (50 FPS) rendering at 1080p resolution. We\ndemonstrate improved reconstruction quality over traditional image stitching\nand radiance field methods, with significantly higher tolerance to scene motion\nand non-ideal capture settings.\n","authors":["Ilya Chugunov","Amogh Joshi","Kiran Murthy","Francois Bleibel","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2409.17924v1.pdf","comment":"Project site: https://light.princeton.edu/publication/neuls/"},{"id":"http://arxiv.org/abs/2409.17920v1","updated":"2024-09-26T15:04:13Z","published":"2024-09-26T15:04:13Z","title":"Resolving Multi-Condition Confusion for Finetuning-Free Personalized\n Image Generation","summary":" Personalized text-to-image generation methods can generate customized images\nbased on the reference images, which have garnered wide research interest.\nRecent methods propose a finetuning-free approach with a decoupled\ncross-attention mechanism to generate personalized images requiring no\ntest-time finetuning. However, when multiple reference images are provided, the\ncurrent decoupled cross-attention mechanism encounters the object confusion\nproblem and fails to map each reference image to its corresponding object,\nthereby seriously limiting its scope of application. To address the object\nconfusion problem, in this work we investigate the relevance of different\npositions of the latent image features to the target object in diffusion model,\nand accordingly propose a weighted-merge method to merge multiple reference\nimage features into the corresponding objects. Next, we integrate this\nweighted-merge method into existing pre-trained models and continue to train\nthe model on a multi-object dataset constructed from the open-sourced SA-1B\ndataset. To mitigate object confusion and reduce training costs, we propose an\nobject quality score to estimate the image quality for the selection of\nhigh-quality training samples. Furthermore, our weighted-merge training\nframework can be employed on single-object generation when a single object has\nmultiple reference images. The experiments verify that our method achieves\nsuperior performance to the state-of-the-arts on the Concept101 dataset and\nDreamBooth dataset of multi-object personalized image generation, and\nremarkably improves the performance on single-object personalized image\ngeneration. Our code is available at https://github.com/hqhQAQ/MIP-Adapter.\n","authors":["Qihan Huang","Siming Fu","Jinlong Liu","Hao Jiang","Yipeng Yu","Jie Song"],"pdf_url":"https://arxiv.org/pdf/2409.17920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17917v1","updated":"2024-09-26T15:02:50Z","published":"2024-09-26T15:02:50Z","title":"WaSt-3D: Wasserstein-2 Distance for Scene-to-Scene Stylization on 3D\n Gaussians","summary":" While style transfer techniques have been well-developed for 2D image\nstylization, the extension of these methods to 3D scenes remains relatively\nunexplored. Existing approaches demonstrate proficiency in transferring colors\nand textures but often struggle with replicating the geometry of the scenes. In\nour work, we leverage an explicit Gaussian Splatting (GS) representation and\ndirectly match the distributions of Gaussians between style and content scenes\nusing the Earth Mover's Distance (EMD). By employing the entropy-regularized\nWasserstein-2 distance, we ensure that the transformation maintains spatial\nsmoothness. Additionally, we decompose the scene stylization problem into\nsmaller chunks to enhance efficiency. This paradigm shift reframes stylization\nfrom a pure generative process driven by latent space losses to an explicit\nmatching of distributions between two Gaussian representations. Our method\nachieves high-resolution 3D stylization by faithfully transferring details from\n3D style scenes onto the content scene. Furthermore, WaSt-3D consistently\ndelivers results across diverse content and style scenes without necessitating\nany training, as it relies solely on optimization-based techniques. See our\nproject page for additional results and source code:\n$\\href{https://compvis.github.io/wast3d/}{https://compvis.github.io/wast3d/}$.\n","authors":["Dmytro Kotovenko","Olga Grebenkova","Nikolaos Sarafianos","Avinash Paliwal","Pingchuan Ma","Omid Poursaeed","Sreyas Mohan","Yuchen Fan","Yilei Li","Rakesh Ranjan","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2409.17917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01895v2","updated":"2024-09-26T14:57:13Z","published":"2024-08-04T01:34:22Z","title":"Computational Trichromacy Reconstruction: Empowering the Color-Vision\n Deficient to Recognize Colors Using Augmented Reality","summary":" We propose an assistive technology that helps individuals with Color Vision\nDeficiencies (CVD) to recognize/name colors. A dichromat's color perception is\na reduced two-dimensional (2D) subset of a normal trichromat's three\ndimensional color (3D) perception, leading to confusion when visual stimuli\nthat appear identical to the dichromat are referred to by different color\nnames. Using our proposed system, CVD individuals can interactively induce\ndistinct perceptual changes to originally confusing colors via a computational\ncolor space transformation. By combining their original 2D precepts for colors\nwith the discriminative changes, a three dimensional color space is\nreconstructed, where the dichromat can learn to resolve color name confusions\nand accurately recognize colors. Our system is implemented as an Augmented\nReality (AR) interface on smartphones, where users interactively control the\nrotation through swipe gestures and observe the induced color shifts in the\ncamera view or in a displayed image. Through psychophysical experiments and a\nlongitudinal user study, we demonstrate that such rotational color shifts have\ndiscriminative power (initially confusing colors become distinct under\nrotation) and exhibit structured perceptual shifts dichromats can learn with\nmodest training. The AR App is also evaluated in two real-world scenarios\n(building with lego blocks and interpreting artistic works); users all report\npositive experience in using the App to recognize object colors that they\notherwise could not.\n","authors":["Yuhao Zhu","Ethan Chen","Colin Hascup","Yukang Yan","Gaurav Sharma"],"pdf_url":"https://arxiv.org/pdf/2408.01895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17908v1","updated":"2024-09-26T14:52:55Z","published":"2024-09-26T14:52:55Z","title":"LKA-ReID:Vehicle Re-Identification with Large Kernel Attention","summary":" With the rapid development of intelligent transportation systems and the\npopularity of smart city infrastructure, Vehicle Re-ID technology has become an\nimportant research field. The vehicle Re-ID task faces an important challenge,\nwhich is the high similarity between different vehicles. Existing methods use\nadditional detection or segmentation models to extract differentiated local\nfeatures. However, these methods either rely on additional annotations or\ngreatly increase the computational cost. Using attention mechanism to capture\nglobal and local features is crucial to solve the challenge of high similarity\nbetween classes in vehicle Re-ID tasks. In this paper, we propose LKA-ReID with\nlarge kernel attention. Specifically, the large kernel attention (LKA) utilizes\nthe advantages of self-attention and also benefits from the advantages of\nconvolution, which can extract the global and local features of the vehicle\nmore comprehensively. We also introduce hybrid channel attention (HCA) combines\nchannel attention with spatial information, so that the model can better focus\non channels and feature regions, and ignore background and other disturbing\ninformation. Experiments on VeRi-776 dataset demonstrated the effectiveness of\nLKA-ReID, with mAP reaches 86.65% and Rank-1 reaches 98.03%.\n","authors":["Xuezhi Xiang","Zhushan Ma","Lei Zhang","Denis Ombati","Himaloy Himu","Xiantong Zhen"],"pdf_url":"https://arxiv.org/pdf/2409.17908v1.pdf","comment":"The paper is under consideration at 2025 IEEE International\n Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2409.17895v1","updated":"2024-09-26T14:44:41Z","published":"2024-09-26T14:44:41Z","title":"Self-supervised Monocular Depth Estimation with Large Kernel Attention","summary":" Self-supervised monocular depth estimation has emerged as a promising\napproach since it does not rely on labeled training data. Most methods combine\nconvolution and Transformer to model long-distance dependencies to estimate\ndepth accurately. However, Transformer treats 2D image features as 1D\nsequences, and positional encoding somewhat mitigates the loss of spatial\ninformation between different feature blocks, tending to overlook channel\nfeatures, which limit the performance of depth estimation. In this paper, we\npropose a self-supervised monocular depth estimation network to get finer\ndetails. Specifically, we propose a decoder based on large kernel attention,\nwhich can model long-distance dependencies without compromising the\ntwo-dimension structure of features while maintaining feature channel\nadaptivity. In addition, we introduce a up-sampling module to accurately\nrecover the fine details in the depth map. Our method achieves competitive\nresults on the KITTI dataset.\n","authors":["Xuezhi Xiang","Yao Wang","Lei Zhang","Denis Ombati","Himaloy Himu","Xiantong Zhen"],"pdf_url":"https://arxiv.org/pdf/2409.17895v1.pdf","comment":"The paper is under consideration at 2025 IEEE International\n Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2409.17886v1","updated":"2024-09-26T14:35:06Z","published":"2024-09-26T14:35:06Z","title":"Upper-Body Pose-based Gaze Estimation for Privacy-Preserving 3D Gaze\n Target Detection","summary":" Gaze Target Detection (GTD), i.e., determining where a person is looking\nwithin a scene from an external viewpoint, is a challenging task, particularly\nin 3D space. Existing approaches heavily rely on analyzing the person's\nappearance, primarily focusing on their face to predict the gaze target. This\npaper presents a novel approach to tackle this problem by utilizing the\nperson's upper-body pose and available depth maps to extract a 3D gaze\ndirection and employing a multi-stage or an end-to-end pipeline to predict the\ngazed target. When predicted accurately, the human body pose can provide\nvaluable information about the head pose, which is a good approximation of the\ngaze direction, as well as the position of the arms and hands, which are linked\nto the activity the person is performing and the objects they are likely\nfocusing on. Consequently, in addition to performing gaze estimation in 3D, we\nare also able to perform GTD simultaneously. We demonstrate state-of-the-art\nresults on the most comprehensive publicly accessible 3D gaze target detection\ndataset without requiring images of the person's face, thus promoting privacy\npreservation in various application contexts. The code is available at\nhttps://github.com/intelligolabs/privacy-gtd-3D.\n","authors":["Andrea Toaiari","Vittorio Murino","Marco Cristani","Cigdem Beyan"],"pdf_url":"https://arxiv.org/pdf/2409.17886v1.pdf","comment":"Accepted in the T-CAP workshop at ECCV 2024"},{"id":"http://arxiv.org/abs/2312.04564v3","updated":"2024-09-26T14:33:24Z","published":"2023-12-07T18:59:55Z","title":"EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS","summary":" Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view\nscene synthesis. It addresses the challenges of lengthy training times and slow\nrendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid,\ndifferentiable rasterization of 3D Gaussians, 3D-GS achieves real-time\nrendering and accelerated training. They, however, demand substantial memory\nresources for both training and storage, as they require millions of Gaussians\nin their point cloud representation for each scene. We present a technique\nutilizing quantized embeddings to significantly reduce per-point memory storage\nrequirements and a coarse-to-fine training strategy for a faster and more\nstable optimization of the Gaussian point clouds. Our approach develops a\npruning stage which results in scene representations with fewer Gaussians,\nleading to faster training times and rendering speeds for real-time rendering\nof high resolution scenes. We reduce storage memory by more than an order of\nmagnitude all while preserving the reconstruction quality. We validate the\neffectiveness of our approach on a variety of datasets and scenes preserving\nthe visual quality while consuming 10-20x lesser memory and faster\ntraining/inference speed. Project page and code is available\nhttps://efficientgaussian.github.io\n","authors":["Sharath Girish","Kamal Gupta","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2312.04564v3.pdf","comment":"Website: https://efficientgaussian.github.io Code:\n https://github.com/Sharath-girish/efficientgaussian"},{"id":"http://arxiv.org/abs/2409.17880v1","updated":"2024-09-26T14:27:55Z","published":"2024-09-26T14:27:55Z","title":"Self-Distilled Depth Refinement with Noisy Poisson Fusion","summary":" Depth refinement aims to infer high-resolution depth with fine-grained edges\nand details, refining low-resolution results of depth estimation models. The\nprevailing methods adopt tile-based manners by merging numerous patches, which\nlacks efficiency and produces inconsistency. Besides, prior arts suffer from\nfuzzy depth boundaries and limited generalizability. Analyzing the fundamental\nreasons for these limitations, we model depth refinement as a noisy Poisson\nfusion problem with local inconsistency and edge deformation noises. We propose\nthe Self-distilled Depth Refinement (SDDR) framework to enforce robustness\nagainst the noises, which mainly consists of depth edge representation and\nedge-based guidance. With noisy depth predictions as input, SDDR generates\nlow-noise depth edge representations as pseudo-labels by coarse-to-fine\nself-distillation. Edge-based guidance with edge-guided gradient loss and\nedge-based fusion loss serves as the optimization objective equivalent to\nPoisson fusion. When depth maps are better refined, the labels also become more\nnoise-free. Our model can acquire strong robustness to the noises, achieving\nsignificant improvements in accuracy, edge quality, efficiency, and\ngeneralizability on five different benchmarks. Moreover, directly training\nanother model with edge labels produced by SDDR brings improvements, suggesting\nthat our method could help with training robust refinement models in future\nworks.\n","authors":["Jiaqi Li","Yiran Wang","Jinghong Zheng","Zihao Huang","Ke Xian","Zhiguo Cao","Jianming Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17880v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.09946v2","updated":"2024-09-26T14:27:23Z","published":"2024-07-13T17:03:16Z","title":"Low-Rank Interconnected Adaptation across Layers","summary":" Low-rank adaptation (LoRA) is a powerful parameter-efficient fine-tuning\nmethod that utilizes low-rank projectors $A$ and $B$ to learn weight updates\n$\\Delta W$ for adaptation targets $W$. Previous research has shown that LoRA is\nessentially a gradient compressor, performing random projections on the\ngradient using a fixed projection matrix $A_0$. However, this setup restricts\nthe overall weight update to be low-rank, which limits the adaptation\nperformance. In this paper, we propose low-rank interconnected adaptation\nacross layers (Lily). Specifically, we employ a hierarchical framework where\nlow-dimensional projectors (LPs) retained for downward projection at a\nparticular level, while globally-shared high-dimensional projector (HP) experts\nperform upward projection across all levels of layers. Lily uniquely connects\neach LP to all HP experts, therefore the gradient projections are no longer\ndominated by fixed projection matrices, but rather by selective combinations of\nall the projectors, thereby breaking the low-rank constraint of LoRA.\nFurthermore, Lily's cross-layer connections facilitate the capture of intricate\ninformation and dependencies across different layers, thereby enhancing the\nmodel's representational capabilities. Experiments across various modalities,\narchitectures, and model sizes underscore Lily's great performance and\nefficiency. Code is available on github https://github.com/yibozhong/lily.\n","authors":["Yibo Zhong","Yao Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.09946v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2409.17854v1","updated":"2024-09-26T14:00:00Z","published":"2024-09-26T14:00:00Z","title":"Visualization of Age Distributions as Elements of Medical Data-Stories","summary":" In various fields, including medicine, age distributions are crucial. Despite\nwidespread media coverage of health topics, there remains a need to enhance\nhealth communication. Narrative medical visualization is promising for\nimproving information comprehension and retention. This study explores the most\neffective ways to present age distributions of diseases through narrative\nvisualizations. We conducted a thorough analysis of existing visualizations,\nheld workshops with a broad audience, and reviewed relevant literature. From\nthis, we identified design choices focusing on comprehension, aesthetics,\nengagement, and memorability. We specifically tested three pictogram variants:\npictograms as bars, stacked pictograms, and annotations. After evaluating 18\nvisualizations with 72 participants and three expert reviews, we determined\nthat annotations were most effective for comprehension and aesthetics. However,\ntraditional bar charts were preferred for engagement, and other variants were\nmore memorable. The study provides a set of design recommendations based on\nthese insights.\n","authors":["Sophia Dowlatabadi","Bernhard Preim","Monique Meuschke"],"pdf_url":"https://arxiv.org/pdf/2409.17854v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17851v1","updated":"2024-09-26T13:57:05Z","published":"2024-09-26T13:57:05Z","title":"A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts","summary":" Monocular depth estimation is a critical task for autonomous driving and many\nother computer vision applications. While significant progress has been made in\nthis field, the effects of viewpoint shifts on depth estimation models remain\nlargely underexplored. This paper introduces a novel dataset and evaluation\nmethodology to quantify the impact of different camera positions and\norientations on monocular depth estimation performance. We propose a ground\ntruth strategy based on homography estimation and object detection, eliminating\nthe need for expensive lidar sensors. We collect a diverse dataset of road\nscenes from multiple viewpoints and use it to assess the robustness of a modern\ndepth estimation model to geometric shifts. After assessing the validity of our\nstrategy on a public dataset, we provide valuable insights into the limitations\nof current models and highlight the importance of considering viewpoint\nvariations in real-world applications.\n","authors":["Aurel Pjetri","Stefano Caprasecca","Leonardo Taccari","Matteo Simoncini","Henrique Piñeiro Monteagudo","Walter Wallace","Douglas Coimbra de Andrade","Francesco Sambo","Andrew David Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2409.17851v1.pdf","comment":"17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on\n Vision-Centric Autonomous Driving (VCAD)"},{"id":"http://arxiv.org/abs/2404.04693v2","updated":"2024-09-26T13:53:33Z","published":"2024-04-06T17:41:36Z","title":"OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera\n Fusion for Colorizing Point Clouds","summary":" A Colored point cloud, as a simple and efficient 3D representation, has many\nadvantages in various fields, including robotic navigation and scene\nreconstruction. This representation is now commonly used in 3D reconstruction\ntasks relying on cameras and LiDARs. However, fusing data from these two types\nof sensors is poorly performed in many existing frameworks, leading to\nunsatisfactory mapping results, mainly due to inaccurate camera poses. This\npaper presents OmniColor, a novel and efficient algorithm to colorize point\nclouds using an independent 360-degree camera. Given a LiDAR-based point cloud\nand a sequence of panorama images with initial coarse camera poses, our\nobjective is to jointly optimize the poses of all frames for mapping images\nonto geometric reconstructions. Our pipeline works in an off-the-shelf manner\nthat does not require any feature extraction or matching process. Instead, we\nfind optimal poses by directly maximizing the photometric consistency of LiDAR\nmaps. In experiments, we show that our method can overcome the severe visual\ndistortion of omnidirectional images and greatly benefit from the wide field of\nview (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy\nand stability. The code will be released at\nhttps://github.com/liubonan123/OmniColor/.\n","authors":["Bonan Liu","Guoyang Zhao","Jianhao Jiao","Guang Cai","Chengyang Li","Handi Yin","Yuyang Wang","Ming Liu","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2404.04693v2.pdf","comment":"2024 IEEE International Conference on Robotics and Automation (ICRA)"},{"id":"http://arxiv.org/abs/2403.10542v2","updated":"2024-09-26T13:38:48Z","published":"2024-03-08T23:04:14Z","title":"SF-MMCN: Low-Power Sever Flow Multi-Mode Diffusion Model Accelerator","summary":" Generative Artificial Intelligence (AI) has become incredibly popular in\nrecent years, and the significance of traditional accelerators in dealing with\nlarge-scale parameters is urgent. With the diffusion model's parallel\nstructure, the hardware design challenge has skyrocketed because of the\nmultiple layers operating simultaneously. Convolution Neural Network (CNN)\naccelerators have been designed and developed rapidly, especially for\nhigh-speed inference. Often, CNN models with parallel structures are deployed.\nIn these CNN accelerators, many Processing Elements (PE) are required to\nperform parallel computations, mainly the multiply and accumulation (MAC)\noperation, resulting in high power consumption and a large silicon area. In\nthis work, a Server Flow Multi-Mode CNN Unit (SF-MMCN) is proposed to reduce\nthe number of PE while improving the operation efficiency of the CNN\naccelerator. The pipelining technique is introduced into Server Flow to process\nparallel computations. The proposed SF-MMCN is implemented with TSMC 90-nm CMOS\ntechnology. It is evaluated with VGG-16, ResNet-18, and U-net. The evaluation\nresults show that the proposed SF-MMCN can reduce the power consumption by 92%,\nand the silicon area by 70%, while improving the efficiency of operation by\nnearly 81 times. A new FoM, area efficiency (GOPs/mm^2) is also introduced to\nevaluate the performance of the accelerator in terms of the ratio throughput\n(GOPs) and silicon area (mm^2). In this FoM, SF-MMCN improves area efficiency\nby 18 times (18.42).\n","authors":["Huan-Ke Hsu","I-Chyn Wey","T. Hui Teo"],"pdf_url":"https://arxiv.org/pdf/2403.10542v2.pdf","comment":"16 pages, 16 figures; extend the CNN to process Diffusion Model\n (possible this is the first reported hardware Diffusion Model implementation)"},{"id":"http://arxiv.org/abs/2407.17380v2","updated":"2024-09-26T13:37:04Z","published":"2024-07-24T16:04:18Z","title":"2D and 3D Deep Learning Models for MRI-based Parkinson's Disease\n Classification: A Comparative Analysis of Convolutional Kolmogorov-Arnold\n Networks, Convolutional Neural Networks, and Graph Convolutional Networks","summary":" Parkinson's Disease (PD) diagnosis remains challenging. This study applies\nConvolutional Kolmogorov-Arnold Networks (ConvKANs), integrating learnable\nspline-based activation functions into convolutional layers, for PD\nclassification using structural MRI. The first 3D implementation of ConvKANs\nfor medical imaging is presented, comparing their performance to Convolutional\nNeural Networks (CNNs) and Graph Convolutional Networks (GCNs) across three\nopen-source datasets. Isolated analyses assessed performance within individual\ndatasets, using cross-validation techniques. Holdout analyses evaluated\ncross-dataset generalizability by training models on two datasets and testing\non the third, mirroring real-world clinical scenarios. In isolated analyses, 2D\nConvKANs achieved the highest AUC of 0.99 (95% CI: 0.98-0.99) on the PPMI\ndataset, outperforming 2D CNNs (AUC: 0.97, p = 0.0092). 3D models showed\npromise, with 3D CNN and 3D ConvKAN reaching an AUC of 0.85 on PPMI. In holdout\nanalyses, 3D ConvKAN demonstrated superior generalization, achieving an AUC of\n0.85 on early-stage PD data. GCNs underperformed in 2D but improved in 3D\nimplementations. These findings highlight ConvKANs' potential for PD detection,\nemphasize the importance of 3D analysis in capturing subtle brain changes, and\nunderscore cross-dataset generalization challenges. This study advances\nAI-assisted PD diagnosis using structural MRI and emphasizes the need for\nlarger-scale validation.\n","authors":["Salil B Patel","Vicky Goh","James F FitzGerald","Chrystalina A Antoniades"],"pdf_url":"https://arxiv.org/pdf/2407.17380v2.pdf","comment":"7 figures"},{"id":"http://arxiv.org/abs/2406.04769v2","updated":"2024-09-26T13:31:40Z","published":"2024-06-07T09:15:29Z","title":"Diffusion-based Generative Image Outpainting for Recovery of\n FOV-Truncated CT Images","summary":" Field-of-view (FOV) recovery of truncated chest CT scans is crucial for\naccurate body composition analysis, which involves quantifying skeletal muscle\nand subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables\ndisease prognostication. Here, we present a method for recovering truncated CT\nslices using generative image outpainting. We train a diffusion model and apply\nit to truncated CT slices generated by simulating a small FOV. Our model\nreliably recovers the truncated anatomy and outperforms the previous\nstate-of-the-art despite being trained on 87% less data.\n","authors":["Michelle Espranita Liman","Daniel Rueckert","Florian J. Fintelmann","Philip Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04769v2.pdf","comment":"Shared last authorship: Florian J. Fintelmann and Philip M\\\"uller"},{"id":"http://arxiv.org/abs/2409.17830v1","updated":"2024-09-26T13:29:40Z","published":"2024-09-26T13:29:40Z","title":"Unsupervised Learning Based Multi-Scale Exposure Fusion","summary":" Unsupervised learning based multi-scale exposure fusion (ULMEF) is efficient\nfor fusing differently exposed low dynamic range (LDR) images into a higher\nquality LDR image for a high dynamic range (HDR) scene. Unlike supervised\nlearning, loss functions play a crucial role in the ULMEF. In this paper, novel\nloss functions are proposed for the ULMEF and they are defined by using all the\nimages to be fused and other differently exposed images from the same HDR\nscene. The proposed loss functions can guide the proposed ULMEF to learn more\nreliable information from the HDR scene than existing loss functions which are\ndefined by only using the set of images to be fused. As such, the quality of\nthe fused image is significantly improved. The proposed ULMEF also adopts a\nmulti-scale strategy that includes a multi-scale attention module to\neffectively preserve the scene depth and local contrast in the fused image.\nMeanwhile, the proposed ULMEF can be adopted to achieve exposure interpolation\nand exposure extrapolation. Extensive experiments show that the proposed ULMEF\nalgorithm outperforms state-of-the-art exposure fusion algorithms.\n","authors":["Chaobing Zheng","Shiqian Wu","Zhenggguo Li"],"pdf_url":"https://arxiv.org/pdf/2409.17830v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2409.17823v1","updated":"2024-09-26T13:21:02Z","published":"2024-09-26T13:21:02Z","title":"Kendall's $τ$ Coefficient for Logits Distillation","summary":" Knowledge distillation typically employs the Kullback-Leibler (KL) divergence\nto constrain the student model's output to match the soft labels provided by\nthe teacher model exactly. However, sometimes the optimization direction of the\nKL divergence loss is not always aligned with the task loss, where a smaller KL\ndivergence could lead to erroneous predictions that diverge from the soft\nlabels. This limitation often results in suboptimal optimization for the\nstudent. Moreover, even under temperature scaling, the KL divergence loss\nfunction tends to overly focus on the larger-valued channels in the logits,\ndisregarding the rich inter-class information provided by the multitude of\nsmaller-valued channels. This hard constraint proves too challenging for\nlightweight students, hindering further knowledge distillation. To address this\nissue, we propose a plug-and-play ranking loss based on Kendall's $\\tau$\ncoefficient, called Rank-Kendall Knowledge Distillation (RKKD). RKKD balances\nthe attention to smaller-valued channels by constraining the order of channel\nvalues in student logits, providing more inter-class relational information.\nThe rank constraint on the top-valued channels helps avoid suboptimal traps\nduring optimization. We also discuss different differentiable forms of\nKendall's $\\tau$ coefficient and demonstrate that the proposed ranking loss\nfunction shares a consistent optimization objective with the KL divergence.\nExtensive experiments on the CIFAR-100 and ImageNet datasets show that our RKKD\ncan enhance the performance of various knowledge distillation baselines and\noffer broad improvements across multiple teacher-student architecture\ncombinations.\n","authors":["Yuchen Guan","Runxi Cheng","Kang Liu","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2409.17823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16042v2","updated":"2024-09-26T13:18:24Z","published":"2024-09-24T12:44:27Z","title":"Enhanced Unsupervised Image-to-Image Translation Using Contrastive\n Learning and Histogram of Oriented Gradients","summary":" Image-to-Image Translation is a vital area of computer vision that focuses on\ntransforming images from one visual domain to another while preserving their\ncore content and structure. However, this field faces two major challenges:\nfirst, the data from the two domains are often unpaired, making it difficult to\ntrain generative adversarial networks effectively; second, existing methods\ntend to produce artifacts or hallucinations during image generation, leading to\na decline in image quality. To address these issues, this paper proposes an\nenhanced unsupervised image-to-image translation method based on the\nContrastive Unpaired Translation (CUT) model, incorporating Histogram of\nOriented Gradients (HOG) features. This novel approach ensures the preservation\nof the semantic structure of images, even without semantic labels, by\nminimizing the loss between the HOG features of input and generated images. The\nmethod was tested on translating synthetic game environments from GTA5 dataset\nto realistic urban scenes in cityscapes dataset, demonstrating significant\nimprovements in reducing hallucinations and enhancing image quality.\n","authors":["Wanchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.16042v2.pdf","comment":"Critical Errors in Data or Analysis"},{"id":"http://arxiv.org/abs/2409.17805v1","updated":"2024-09-26T12:58:01Z","published":"2024-09-26T12:58:01Z","title":"Cascade Prompt Learning for Vision-Language Model Adaptation","summary":" Prompt learning has surfaced as an effective approach to enhance the\nperformance of Vision-Language Models (VLMs) like CLIP when applied to\ndownstream tasks. However, current learnable prompt tokens are primarily used\nfor the single phase of adapting to tasks (i.e., adapting prompt), easily\nleading to overfitting risks. In this work, we propose a novel Cascade Prompt\nLearning CasPL framework to enable prompt learning to serve both generic and\nspecific expertise (i.e., boosting and adapting prompt) simultaneously.\nSpecifically, CasPL is a new learning paradigm comprising two distinct phases\nof learnable prompts: the first boosting prompt is crafted to extract\ndomain-general knowledge from a senior larger CLIP teacher model by aligning\ntheir predicted logits using extensive unlabeled domain images. The second\nadapting prompt is then cascaded with the frozen first set to fine-tune the\ndownstream tasks, following the approaches employed in prior research. In this\nmanner, CasPL can effectively capture both domain-general and task-specific\nrepresentations into explicitly different gradual groups of prompts, thus\npotentially alleviating overfitting issues in the target domain. It's worth\nnoting that CasPL serves as a plug-and-play module that can seamlessly\nintegrate into any existing prompt learning approach. CasPL achieves a\nsignificantly better balance between performance and inference speed, which is\nespecially beneficial for deploying smaller VLM models in resource-constrained\nenvironments. Compared to the previous state-of-the-art method PromptSRC, CasPL\nshows an average improvement of 1.85% for base classes, 3.44% for novel\nclasses, and 2.72% for the harmonic mean over 11 image classification datasets.\nCode is publicly available at: https://github.com/megvii-research/CasPL.\n","authors":["Ge Wu","Xin Zhang","Zheng Li","Zhaowei Chen","Jiajun Liang","Jian Yang","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.17805v1.pdf","comment":"ECCV2024"},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17792v1","updated":"2024-09-26T12:37:50Z","published":"2024-09-26T12:37:50Z","title":"Reblurring-Guided Single Image Defocus Deblurring: A Learning Framework\n with Misaligned Training Pairs","summary":" For single image defocus deblurring, acquiring well-aligned training pairs\n(or training triplets), i.e., a defocus blurry image, an all-in-focus sharp\nimage (and a defocus blur map), is an intricate task for the development of\ndeblurring models. Existing image defocus deblurring methods typically rely on\ntraining data collected by specialized imaging equipment, presupposing that\nthese pairs or triplets are perfectly aligned. However, in practical scenarios\ninvolving the collection of real-world data, direct acquisition of training\ntriplets is infeasible, and training pairs inevitably encounter spatial\nmisalignment issues. In this work, we introduce a reblurring-guided learning\nframework for single image defocus deblurring, enabling the learning of a\ndeblurring network even with misaligned training pairs. Specifically, we first\npropose a baseline defocus deblurring network that utilizes spatially varying\ndefocus blur map as degradation prior to enhance the deblurring performance.\nThen, to effectively learn the baseline defocus deblurring network with\nmisaligned training pairs, our reblurring module ensures spatial consistency\nbetween the deblurred image, the reblurred image and the input blurry image by\nreconstructing spatially variant isotropic blur kernels. Moreover, the\nspatially variant blur derived from the reblurring module can serve as pseudo\nsupervision for defocus blur map during training, interestingly transforming\ntraining pairs into training triplets. Additionally, we have collected a new\ndataset specifically for single image defocus deblurring (SDD) with typical\nmisalignments, which not only substantiates our proposed method but also serves\nas a benchmark for future research.\n","authors":["Xinya Shu","Yu Li","Dongwei Ren","Xiaohe Wu","Jin Li","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2409.17792v1.pdf","comment":"The source code and dataset are available at\n https://github.com/ssscrystal/Reblurring-guided-JDRL"},{"id":"http://arxiv.org/abs/2409.17790v1","updated":"2024-09-26T12:37:22Z","published":"2024-09-26T12:37:22Z","title":"CASPFormer: Trajectory Prediction from BEV Images with Deformable\n Attention","summary":" Motion prediction is an important aspect for Autonomous Driving (AD) and\nAdvance Driver Assistance Systems (ADAS). Current state-of-the-art motion\nprediction methods rely on High Definition (HD) maps for capturing the\nsurrounding context of the ego vehicle. Such systems lack scalability in\nreal-world deployment as HD maps are expensive to produce and update in\nreal-time. To overcome this issue, we propose Context Aware Scene Prediction\nTransformer (CASPFormer), which can perform multi-modal motion prediction from\nrasterized Bird-Eye-View (BEV) images. Our system can be integrated with any\nupstream perception module that is capable of generating BEV images. Moreover,\nCASPFormer directly decodes vectorized trajectories without any postprocessing.\nTrajectories are decoded recurrently using deformable attention, as it is\ncomputationally efficient and provides the network with the ability to focus\nits attention on the important spatial locations of the BEV images. In\naddition, we also address the issue of mode collapse for generating multiple\nscene-consistent trajectories by incorporating learnable mode queries. We\nevaluate our model on the nuScenes dataset and show that it reaches\nstate-of-the-art across multiple metrics\n","authors":["Harsh Yadav","Maximilian Schaefer","Kun Zhao","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.17790v1.pdf","comment":"Under Review at ICPR 2024, Kolkata"},{"id":"http://arxiv.org/abs/2405.07865v4","updated":"2024-09-26T12:18:49Z","published":"2024-05-13T15:53:18Z","title":"AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous\n Driving","summary":" The scale-up of autonomous vehicles depends heavily on their ability to deal\nwith anomalies, such as rare objects on the road. In order to handle such\nsituations, it is necessary to detect anomalies in the first place. Anomaly\ndetection for autonomous driving has made great progress in the past years but\nsuffers from poorly designed benchmarks with a strong focus on camera data. In\nthis work, we propose AnoVox, the largest benchmark for ANOmaly detection in\nautonomous driving to date. AnoVox incorporates large-scale multimodal sensor\ndata and spatial VOXel ground truth, allowing for the comparison of methods\nindependent of their used sensor. We propose a formal definition of normality\nand provide a compliant training dataset. AnoVox is the first benchmark to\ncontain both content and temporal anomalies.\n","authors":["Daniel Bogdoll","Iramm Hamdard","Lukas Namgyu Rößler","Felix Geisler","Muhammed Bayram","Felix Wang","Jan Imhof","Miguel de Campos","Anushervon Tabarov","Yitian Yang","Hanno Gottschalk","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2405.07865v4.pdf","comment":"Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\\\"o{\\ss}ler\n contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop"},{"id":"http://arxiv.org/abs/2409.17778v1","updated":"2024-09-26T12:16:11Z","published":"2024-09-26T12:16:11Z","title":"Taming Diffusion Prior for Image Super-Resolution with Domain Shift SDEs","summary":" Diffusion-based image super-resolution (SR) models have attracted substantial\ninterest due to their powerful image restoration capabilities. However,\nprevailing diffusion models often struggle to strike an optimal balance between\nefficiency and performance. Typically, they either neglect to exploit the\npotential of existing extensive pretrained models, limiting their generative\ncapacity, or they necessitate a dozens of forward passes starting from random\nnoises, compromising inference efficiency. In this paper, we present DoSSR, a\nDomain Shift diffusion-based SR model that capitalizes on the generative powers\nof pretrained diffusion models while significantly enhancing efficiency by\ninitiating the diffusion process with low-resolution (LR) images. At the core\nof our approach is a domain shift equation that integrates seamlessly with\nexisting diffusion models. This integration not only improves the use of\ndiffusion prior but also boosts inference efficiency. Moreover, we advance our\nmethod by transitioning the discrete shift process to a continuous formulation,\ntermed as DoS-SDEs. This advancement leads to the fast and customized solvers\nthat further enhance sampling efficiency. Empirical results demonstrate that\nour proposed method achieves state-of-the-art performance on synthetic and\nreal-world datasets, while notably requiring only 5 sampling steps. Compared to\nprevious diffusion prior based methods, our approach achieves a remarkable\nspeedup of 5-7 times, demonstrating its superior efficiency. Code:\nhttps://github.com/QinpengCui/DoSSR.\n","authors":["Qinpeng Cui","Yixuan Liu","Xinyi Zhang","Qiqi Bao","Zhongdao Wang","Qingmin Liao","Li Wang","Tian Lu","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2409.17778v1.pdf","comment":"This paper is accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17777v1","updated":"2024-09-26T12:15:13Z","published":"2024-09-26T12:15:13Z","title":"Harnessing Shared Relations via Multimodal Mixup Contrastive Learning\n for Multimodal Classification","summary":" Deep multimodal learning has shown remarkable success by leveraging\ncontrastive learning to capture explicit one-to-one relations across\nmodalities. However, real-world data often exhibits shared relations beyond\nsimple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive\nLearning approach to capture nuanced shared relations inherent in multimodal\ndata. Our key contribution is a Mixup-based contrastive loss that learns robust\nrepresentations by aligning mixed samples from one modality with their\ncorresponding samples from other modalities thereby capturing shared relations\nbetween them. For multimodal classification tasks, we introduce a framework\nthat integrates a fusion module with unimodal prediction modules for auxiliary\nsupervision during training, complemented by our proposed Mixup-based\ncontrastive loss. Through extensive experiments on diverse datasets (N24News,\nROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures\nshared multimodal relations and generalizes across domains. It outperforms\nstate-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving\ncomparable performance on Food-101. Our work highlights the significance of\nlearning shared relations for robust multimodal learning, opening up promising\navenues for future research.\n","authors":["Raja Kumar","Raghav Singhal","Pranamya Kulkarni","Deval Mehta","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2409.17777v1.pdf","comment":"RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9\n Tables"},{"id":"http://arxiv.org/abs/2409.17775v1","updated":"2024-09-26T12:13:52Z","published":"2024-09-26T12:13:52Z","title":"UNICORN: A Deep Learning Model for Integrating Multi-Stain Data in\n Histopathology","summary":" Background: The integration of multi-stain histopathology images through deep\nlearning poses a significant challenge in digital histopathology. Current\nmulti-modal approaches struggle with data heterogeneity and missing data. This\nstudy aims to overcome these limitations by developing a novel transformer\nmodel for multi-stain integration that can handle missing data during training\nas well as inference. Methods: We propose UNICORN (UNiversal modality\nIntegration Network for CORonary classificatioN) a multi-modal transformer\ncapable of processing multi-stain histopathology for atherosclerosis severity\nclass prediction. The architecture comprises a two-stage, end-to-end trainable\nmodel with specialized modules utilizing transformer self-attention blocks. The\ninitial stage employs domain-specific expert modules to extract features from\neach modality. In the subsequent stage, an aggregation expert module integrates\nthese features by learning the interactions between the different data\nmodalities. Results: Evaluation was performed using a multi-class dataset of\natherosclerotic lesions from the Munich Cardiovascular Studies Biobank\n(MISSION), using over 4,000 paired multi-stain whole slide images (WSIs) from\n170 deceased individuals on 7 prespecified segments of the coronary tree, each\nstained according to four histopathological protocols. UNICORN achieved a\nclassification accuracy of 0.67, outperforming other state-of-the-art models.\nThe model effectively identifies relevant tissue phenotypes across stainings\nand implicitly models disease progression. Conclusion: Our proposed multi-modal\ntransformer model addresses key challenges in medical data analysis, including\ndata heterogeneity and missing modalities. Explainability and the model's\neffectiveness in predicting atherosclerosis progression underscores its\npotential for broader applications in medical research.\n","authors":["Valentin Koch","Sabine Bauer","Valerio Luppberger","Michael Joner","Heribert Schunkert","Julia A. Schnabel","Moritz von Scheidt","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2409.17775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17763v1","updated":"2024-09-26T11:58:41Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50\\% of papers do not\nassess performance variability at all. Moreover, only one (0.5\\%) paper\nreported confidence intervals (CIs) for model performance. (2) To address the\nreporting bottleneck, we show that the unreported standard deviation (SD) in\nsegmentation papers can be approximated by a second-order polynomial function\nof the mean Dice similarity coefficient (DSC). Based on external validation\ndata from 56 previous MICCAI challenges, we demonstrate that this approximation\ncan accurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95\\% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60\\% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v1.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2409.17759v1","updated":"2024-09-26T11:53:25Z","published":"2024-09-26T11:53:25Z","title":"LGFN: Lightweight Light Field Image Super-Resolution using Local\n Convolution Modulation and Global Attention Feature Extraction","summary":" Capturing different intensity and directions of light rays at the same scene\nLight field (LF) can encode the 3D scene cues into a 4D LF image which has a\nwide range of applications (i.e. post-capture refocusing and depth sensing). LF\nimage super-resolution (SR) aims to improve the image resolution limited by the\nperformance of LF camera sensor. Although existing methods have achieved\npromising results the practical application of these models is limited because\nthey are not lightweight enough. In this paper we propose a lightweight model\nnamed LGFN which integrates the local and global features of different views\nand the features of different channels for LF image SR. Specifically owing to\nneighboring regions of the same pixel position in different sub-aperture images\nexhibit similar structural relationships we design a lightweight CNN-based\nfeature extraction module (namely DGCE) to extract local features better\nthrough feature modulation. Meanwhile as the position beyond the boundaries in\nthe LF image presents a large disparity we propose an efficient spatial\nattention module (namely ESAM) which uses decomposable large-kernel convolution\nto obtain an enlarged receptive field and an efficient channel attention module\n(namely ECAM). Compared with the existing LF image SR models with large\nparameter our model has a parameter of 0.45M and a FLOPs of 19.33G which has\nachieved a competitive effect. Extensive experiments with ablation studies\ndemonstrate the effectiveness of our proposed method which ranked the second\nplace in the Track 2 Fidelity & Efficiency of NTIRE2024 Light Field Super\nResolution Challenge and the seventh place in the Track 1 Fidelity.\n","authors":["Zhongxin Yu","Liang Chen","Zhiyun Zeng","Kunping Yang","Shaofei Luo","Shaorui Chen","Cheng Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.17759v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.09369v3","updated":"2024-09-26T11:38:12Z","published":"2024-09-14T08:47:45Z","title":"Interpretable Vision-Language Survival Analysis with Ordinal Inductive\n Bias for Computational Pathology","summary":" Histopathology Whole-Slide Images (WSIs) provide an important tool to assess\ncancer prognosis in computational pathology (CPATH). While existing survival\nanalysis (SA) approaches have made exciting progress, they are generally\nlimited to adopting highly-expressive architectures and only coarse-grained\npatient-level labels to learn prognostic visual representations from gigapixel\nWSIs. Such learning paradigm suffers from important performance bottlenecks,\nwhen facing present scarce training data and standard multi-instance learning\n(MIL) framework in CPATH. To overcome it, this paper, for the first time,\nproposes a new Vision-Language-based SA (VLSA) paradigm. Concretely, (1) VLSA\nis driven by pathology VL foundation models. It no longer relies on\nhigh-capability networks and shows the advantage of data efficiency. (2) In\nvision-end, VLSA encodes prognostic language prior and then employs it as\nauxiliary signals to guide the aggregating of prognostic visual features at\ninstance level, thereby compensating for the weak supervision in MIL. Moreover,\ngiven the characteristics of SA, we propose i) ordinal survival prompt learning\nto transform continuous survival labels into textual prompts; and ii) ordinal\nincidence function as prediction target to make SA compatible with VL-based\nprediction. Notably, VLSA's predictions can be interpreted intuitively by our\nShapley values-based method. The extensive experiments on five datasets confirm\nthe effectiveness of our scheme. Our VLSA could pave a new way for SA in CPATH\nby offering weakly-supervised MIL an effective means to learn valuable\nprognostic clues from gigapixel WSIs. Our source code is available at\nhttps://github.com/liupei101/VLSA.\n","authors":["Pei Liu","Luping Ji","Jiaxiang Gou","Bo Fu","Mao Ye"],"pdf_url":"https://arxiv.org/pdf/2409.09369v3.pdf","comment":"24 pages, 11 tables, 6 figures"},{"id":"http://arxiv.org/abs/2401.01008v3","updated":"2024-09-26T11:35:22Z","published":"2023-12-13T17:05:37Z","title":"Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models","summary":" Text-to-image diffusion models have demonstrated unprecedented capabilities\nfor flexible and realistic image synthesis. Nevertheless, these models rely on\na time-consuming sampling procedure, which has motivated attempts to reduce\ntheir latency. When improving efficiency, researchers often use the original\ndiffusion model to train an additional network designed specifically for fast\nimage generation. In contrast, our approach seeks to reduce latency directly,\nwithout any retraining, fine-tuning, or knowledge distillation. In particular,\nwe find the repeated calculation of attention maps to be costly yet redundant,\nand instead suggest reusing them during sampling. Our specific reuse strategies\nare based on ODE theory, which implies that the later a map is reused, the\nsmaller the distortion in the final image. We empirically compare these reuse\nstrategies with few-step sampling procedures of comparable latency, finding\nthat reuse generates images that are closer to those produced by the original\nhigh-latency diffusion model.\n","authors":["Rosco Hunter","Łukasz Dudziak","Mohamed S. Abdelfattah","Abhinav Mehrotra","Sourav Bhattacharya","Hongkai Wen"],"pdf_url":"https://arxiv.org/pdf/2401.01008v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12844v2","updated":"2024-09-26T11:29:04Z","published":"2024-02-20T09:13:15Z","title":"ICON: Improving Inter-Report Consistency in Radiology Report Generation\n via Lesion-aware Mixup Augmentation","summary":" Previous research on radiology report generation has made significant\nprogress in terms of increasing the clinical accuracy of generated reports. In\nthis paper, we emphasize another crucial quality that it should possess, i.e.,\ninter-report consistency, which refers to the capability of generating\nconsistent reports for semantically equivalent radiographs. This quality is\neven of greater significance than the overall report accuracy in terms of\nensuring the system's credibility, as a system prone to providing conflicting\nresults would severely erode users' trust. Regrettably, existing approaches\nstruggle to maintain inter-report consistency, exhibiting biases towards common\npatterns and susceptibility to lesion variants. To address this issue, we\npropose ICON, which improves the inter-report consistency of radiology report\ngeneration. Aiming to enhance the system's ability to capture similarities in\nsemantically equivalent lesions, our approach first involves extracting lesions\nfrom input images and examining their characteristics. Then, we introduce a\nlesion-aware mixup technique to ensure that the representations of the\nsemantically equivalent lesions align with the same attributes, achieved\nthrough a linear combination during the training phase. Extensive experiments\non three publicly available chest X-ray datasets verify the effectiveness of\nour approach, both in terms of improving the consistency and accuracy of the\ngenerated reports.\n","authors":["Wenjun Hou","Yi Cheng","Kaishuai Xu","Yan Hu","Wenjie Li","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17747v1","updated":"2024-09-26T11:23:59Z","published":"2024-09-26T11:23:59Z","title":"Text Image Generation for Low-Resource Languages with Dual Translation\n Learning","summary":" Scene text recognition in low-resource languages frequently faces challenges\ndue to the limited availability of training datasets derived from real-world\nscenes. This study proposes a novel approach that generates text images in\nlow-resource languages by emulating the style of real text images from\nhigh-resource languages. Our approach utilizes a diffusion model that is\nconditioned on binary states: ``synthetic'' and ``real.'' The training of this\nmodel involves dual translation tasks, where it transforms plain text images\ninto either synthetic or real text images, based on the binary states. This\napproach not only effectively differentiates between the two domains but also\nfacilitates the model's explicit recognition of characters in the target\nlanguage. Furthermore, to enhance the accuracy and variety of generated text\nimages, we introduce two guidance techniques: Fidelity-Diversity Balancing\nGuidance and Fidelity Enhancement Guidance. Our experimental results\ndemonstrate that the text images generated by our proposed framework can\nsignificantly improve the performance of scene text recognition models for\nlow-resource languages.\n","authors":["Chihiro Noguchi","Shun Fukuda","Shoichiro Mihara","Masao Yamanaka"],"pdf_url":"https://arxiv.org/pdf/2409.17747v1.pdf","comment":"23 pages, 11 figures"},{"id":"http://arxiv.org/abs/2405.06945v2","updated":"2024-09-26T11:21:27Z","published":"2024-05-11T07:56:19Z","title":"Direct Learning of Mesh and Appearance via 3D Gaussian Splatting","summary":" Accurately reconstructing a 3D scene including explicit geometry information\nis both attractive and challenging. Geometry reconstruction can benefit from\nincorporating differentiable appearance models, such as Neural Radiance Fields\nand 3D Gaussian Splatting (3DGS). However, existing methods encounter\nefficiency issues due to indirect geometry learning and the paradigm of\nseparately modeling geometry and surface appearance. In this work, we propose a\nlearnable scene model that incorporates 3DGS with an explicit geometry\nrepresentation, namely a mesh. Our model learns the mesh and appearance in an\nend-to-end manner, where we bind 3D Gaussians to the mesh faces and perform\ndifferentiable rendering of 3DGS to obtain photometric supervision. The model\ncreates an effective information pathway to supervise the learning of both 3DGS\nand mesh. Experimental results demonstrate that the learned scene model not\nonly achieves state-of-the-art efficiency and rendering quality but also\nsupports manipulation using the explicit mesh. In addition, our model has a\nunique advantage in adapting to scene updates, thanks to the end-to-end\nlearning of both mesh and appearance.\n","authors":["Ancheng Lin","Jun Li"],"pdf_url":"https://arxiv.org/pdf/2405.06945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17740v1","updated":"2024-09-26T11:15:15Z","published":"2024-09-26T11:15:15Z","title":"AnyLogo: Symbiotic Subject-Driven Diffusion System with Gemini Status","summary":" Diffusion models have made compelling progress on facilitating\nhigh-throughput daily production. Nevertheless, the appealing customized\nrequirements are remain suffered from instance-level finetuning for authentic\nfidelity. Prior zero-shot customization works achieve the semantic consistence\nthrough the condensed injection of identity features, while addressing detailed\nlow-level signatures through complex model configurations and subject-specific\nfabrications, which significantly break the statistical coherence within the\noverall system and limit the applicability across various scenarios. To\nfacilitate the generic signature concentration with rectified efficiency, we\npresent \\textbf{AnyLogo}, a zero-shot region customizer with remarkable detail\nconsistency, building upon the symbiotic diffusion system with eliminated\ncumbersome designs. Streamlined as vanilla image generation, we discern that\nthe rigorous signature extraction and creative content generation are\npromisingly compatible and can be systematically recycled within a single\ndenoising model. In place of the external configurations, the gemini status of\nthe denoising model promote the reinforced subject transmission efficiency and\ndisentangled semantic-signature space with continuous signature decoration.\nMoreover, the sparse recycling paradigm is adopted to prevent the duplicated\nrisk with compressed transmission quota for diversified signature stimulation.\nExtensive experiments on constructed logo-level benchmarks demonstrate the\neffectiveness and practicability of our methods.\n","authors":["Jinghao Zhang","Wen Qian","Hao Luo","Fan Wang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.17740v1.pdf","comment":"13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17729v1","updated":"2024-09-26T10:58:31Z","published":"2024-09-26T10:58:31Z","title":"Neural Implicit Representation for Highly Dynamic LiDAR Mapping and\n Odometry","summary":" Recent advancements in Simultaneous Localization and Mapping (SLAM) have\nincreasingly highlighted the robustness of LiDAR-based techniques. At the same\ntime, Neural Radiance Fields (NeRF) have introduced new possibilities for 3D\nscene reconstruction, exemplified by SLAM systems. Among these, NeRF-LOAM has\nshown notable performance in NeRF-based SLAM applications. However, despite its\nstrengths, these systems often encounter difficulties in dynamic outdoor\nenvironments due to their inherent static assumptions. To address these\nlimitations, this paper proposes a novel method designed to improve\nreconstruction in highly dynamic outdoor scenes. Based on NeRF-LOAM, the\nproposed approach consists of two primary components. First, we separate the\nscene into static background and dynamic foreground. By identifying and\nexcluding dynamic elements from the mapping process, this segmentation enables\nthe creation of a dense 3D map that accurately represents the static background\nonly. The second component extends the octree structure to support\nmulti-resolution representation. This extension not only enhances\nreconstruction quality but also aids in the removal of dynamic objects\nidentified by the first module. Additionally, Fourier feature encoding is\napplied to the sampled points, capturing high-frequency information and leading\nto more complete reconstruction results. Evaluations on various datasets\ndemonstrate that our method achieves more competitive results compared to\ncurrent state-of-the-art approaches.\n","authors":["Qi Zhang","He Wang","Ru Li","Wenbin Li"],"pdf_url":"https://arxiv.org/pdf/2409.17729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17728v1","updated":"2024-09-26T10:57:02Z","published":"2024-09-26T10:57:02Z","title":"AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with\n Alternative Modality Masking","summary":" Camera-LiDAR fusion models significantly enhance perception performance in\nautonomous driving. The fusion mechanism leverages the strengths of each\nmodality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR\nfusion models utilize pre-trained backbones for efficient training. However, we\nargue that directly loading single-modal pre-trained camera and LiDAR backbones\ninto camera-LiDAR fusion models introduces similar feature redundancy across\nmodalities due to the nature of the fusion mechanism. Unfortunately, existing\npruning methods are developed explicitly for single-modal models, and thus,\nthey struggle to effectively identify these specific redundant parameters in\ncamera-LiDAR fusion models. In this paper, to address the issue above on\ncamera-LiDAR fusion models, we propose a novelty pruning framework Alternative\nModality Masking Pruning (AlterMOMA), which employs alternative masking on each\nmodality and identifies the redundant parameters. Specifically, when one\nmodality parameters are masked (deactivated), the absence of features from the\nmasked backbone compels the model to reactivate previous redundant features of\nthe other modality backbone. Therefore, these redundant features and relevant\nredundant parameters can be identified via the reactivation process. The\nredundant parameters can be pruned by our proposed importance score evaluation\nfunction, Alternative Evaluation (AlterEva), which is based on the observation\nof the loss changes when certain modality parameters are activated and\ndeactivated. Extensive experiments on the nuScene and KITTI datasets\nencompassing diverse tasks, baseline models, and pruning algorithms showcase\nthat AlterMOMA outperforms existing pruning methods, attaining state-of-the-art\nperformance.\n","authors":["Shiqi Sun","Yantao Lu","Ning Liu","Bo Jiang","JinChao Chen","Ying Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17728v1.pdf","comment":"17 pages, 3 figures, Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17727v1","updated":"2024-09-26T10:56:35Z","published":"2024-09-26T10:56:35Z","title":"Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications","summary":" Vision language models have played a key role in extracting meaningful\nfeatures for various robotic applications. Among these, Contrastive\nLanguage-Image Pretraining (CLIP) is widely used in robotic tasks that require\nboth vision and natural language understanding. However, CLIP was trained\nsolely on static images paired with text prompts and has not yet been fully\nadapted for robotic tasks involving dynamic actions. In this paper, we\nintroduce Robotic-CLIP to enhance robotic perception capabilities. We first\ngather and label large-scale action data, and then build our Robotic-CLIP by\nfine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using\ncontrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's\nstrong image performance while gaining the ability to understand actions in\nrobotic contexts. Intensive experiments show that our Robotic-CLIP outperforms\nother CLIP-based models across various language-driven robotic tasks.\nAdditionally, we demonstrate the practical effectiveness of Robotic-CLIP in\nreal-world grasping applications.\n","authors":["Nghia Nguyen","Minh Nhat Vu","Tung D. Ta","Baoru Huang","Thieu Vo","Ngan Le","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.17727v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2409.17720v1","updated":"2024-09-26T10:43:09Z","published":"2024-09-26T10:43:09Z","title":"Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations\n Between Initial and Final Scenes","summary":" With robots increasingly collaborating with humans in everyday tasks, it is\nimportant to take steps toward robotic systems capable of understanding the\nenvironment. This work focuses on scene understanding to detect pick and place\ntasks given initial and final images from the scene. To this end, a dataset is\ncollected for object detection and pick and place task detection. A YOLOv5\nnetwork is subsequently trained to detect the objects in the initial and final\nscenes. Given the detected objects and their bounding boxes, two methods are\nproposed to detect the pick and place tasks which transform the initial scene\ninto the final scene. A geometric method is proposed which tracks objects'\nmovements in the two scenes and works based on the intersection of the bounding\nboxes which moved within scenes. Contrarily, the CNN-based method utilizes a\nConvolutional Neural Network to classify objects with intersected bounding\nboxes into 5 classes, showing the spatial relationship between the involved\nobjects. The performed pick and place tasks are then derived from analyzing the\nexperiments with both scenes. Results show that the CNN-based method, using a\nVGG16 backbone, outscores the geometric method by roughly 12 percentage points\nin certain scenarios, with an overall success rate of 84.3%.\n","authors":["Seraj Ghasemi","Hamed Hosseini","MohammadHossein Koosheshi","Mehdi Tale Masouleh","Ahmad Kalhor"],"pdf_url":"https://arxiv.org/pdf/2409.17720v1.pdf","comment":"Conference Paper, ICEE 2024, 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17717v1","updated":"2024-09-26T10:40:23Z","published":"2024-09-26T10:40:23Z","title":"Behaviour4All: in-the-wild Facial Behaviour Analysis Toolkit","summary":" In this paper, we introduce Behavior4All, a comprehensive, open-source\ntoolkit for in-the-wild facial behavior analysis, integrating Face\nLocalization, Valence-Arousal Estimation, Basic Expression Recognition and\nAction Unit Detection, all within a single framework. Available in both\nCPU-only and GPU-accelerated versions, Behavior4All leverages 12 large-scale,\nin-the-wild datasets consisting of over 5 million images from diverse\ndemographic groups. It introduces a novel framework that leverages distribution\nmatching and label co-annotation to address tasks with non-overlapping\nannotations, encoding prior knowledge of their relatedness. In the largest\nstudy of its kind, Behavior4All outperforms both state-of-the-art and toolkits\nin overall performance as well as fairness across all databases and tasks. It\nalso demonstrates superior generalizability on unseen databases and on compound\nexpression recognition. Finally, Behavior4All is way times faster than other\ntoolkits.\n","authors":["Dimitrios Kollias","Chunchang Shao","Odysseus Kaloidas","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2409.17717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00230v3","updated":"2024-09-26T10:27:58Z","published":"2024-03-30T03:19:50Z","title":"Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space","summary":" Watermarking is a tool for actively identifying and attributing the images\ngenerated by latent diffusion models. Existing methods face the dilemma of\nimage quality and watermark robustness. Watermarks with superior image quality\nusually have inferior robustness against attacks such as blurring and JPEG\ncompression, while watermarks with superior robustness usually significantly\ndamage image quality. This dilemma stems from the traditional paradigm where\nwatermarks are injected and detected in pixel space, relying on pixel\nperturbation for watermark detection and resilience against attacks. In this\npaper, we highlight that an effective solution to the problem is to both inject\nand detect watermarks in the latent diffusion space, and propose Latent\nWatermark with a progressive training strategy. It weakens the direct\nconnection between quality and robustness and thus alleviates their\ncontradiction. We conduct evaluations on two datasets and against 10 watermark\nattacks. Six metrics measure the image quality and watermark robustness.\nResults show that compared to the recently proposed methods such as\nStableSignature, StegaStamp, RoSteALS, LaWa, TreeRing, and DiffuseTrace, LW not\nonly surpasses them in terms of robustness but also offers superior image\nquality. Our code will be available at\nhttps://github.com/RichardSunnyMeng/LatentWatermark.\n","authors":["Zheling Meng","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.00230v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05024v2","updated":"2024-09-26T09:55:49Z","published":"2024-09-08T08:33:32Z","title":"Deep Self-Cleansing for Medical Image Segmentation with Noisy Labels","summary":" Medical image segmentation is crucial in the field of medical imaging, aiding\nin disease diagnosis and surgical planning. Most established segmentation\nmethods rely on supervised deep learning, in which clean and precise labels are\nessential for supervision and significantly impact the performance of models.\nHowever, manually delineated labels often contain noise, such as missing labels\nand inaccurate boundary delineation, which can hinder networks from correctly\nmodeling target characteristics. In this paper, we propose a deep\nself-cleansing segmentation framework that can preserve clean labels while\ncleansing noisy ones in the training phase. To achieve this, we devise a\ngaussian mixture model-based label filtering module that distinguishes noisy\nlabels from clean labels. Additionally, we develop a label cleansing module to\ngenerate pseudo low-noise labels for identified noisy samples. The preserved\nclean labels and pseudo-labels are then used jointly to supervise the network.\nValidated on a clinical liver tumor dataset and a public cardiac diagnosis\ndataset, our method can effectively suppress the interference from noisy labels\nand achieve prominent segmentation performance.\n","authors":["Jiahua Dong","Yue Zhang","Qiuli Wang","Ruofeng Tong","Shihong Ying","Shaolin Gong","Xuanpu Zhang","Lanfen Lin","Yen-Wei Chen","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.05024v2.pdf","comment":"31 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17686v1","updated":"2024-09-26T09:51:11Z","published":"2024-09-26T09:51:11Z","title":"MoGenTS: Motion Generation based on Spatial-Temporal Joint Modeling","summary":" Motion generation from discrete quantization offers many advantages over\ncontinuous regression, but at the cost of inevitable approximation errors.\nPrevious methods usually quantize the entire body pose into one code, which not\nonly faces the difficulty in encoding all joints within one vector but also\nloses the spatial relationship between different joints. Differently, in this\nwork we quantize each individual joint into one vector, which i) simplifies the\nquantization process as the complexity associated with a single joint is\nmarkedly lower than that of the entire pose; ii) maintains a spatial-temporal\nstructure that preserves both the spatial relationships among joints and the\ntemporal movement patterns; iii) yields a 2D token map, which enables the\napplication of various 2D operations widely used in 2D images. Grounded in the\n2D motion quantization, we build a spatial-temporal modeling framework, where\n2D joint VQVAE, temporal-spatial 2D masking technique, and spatial-temporal 2D\nattention are proposed to take advantage of spatial-temporal signals among the\n2D tokens. Extensive experiments demonstrate that our method significantly\noutperforms previous methods across different datasets, with a $26.6\\%$\ndecrease of FID on HumanML3D and a $29.9\\%$ decrease on KIT-ML.\n","authors":["Weihao Yuan","Weichao Shen","Yisheng He","Yuan Dong","Xiaodong Gu","Zilong Dong","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17686v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17682v1","updated":"2024-09-26T09:48:24Z","published":"2024-09-26T09:48:24Z","title":"Dark Miner: Defend against unsafe generation for text-to-image diffusion\n models","summary":" Text-to-image diffusion models have been demonstrated with unsafe generation\ndue to unfiltered large-scale training data, such as violent, sexual, and\nshocking images, necessitating the erasure of unsafe concepts. Most existing\nmethods focus on modifying the generation probabilities conditioned on the\ntexts containing unsafe descriptions. However, they fail to guarantee safe\ngeneration for unseen texts in the training phase, especially for the prompts\nfrom adversarial attacks. In this paper, we re-analyze the erasure task and\npoint out that existing methods cannot guarantee the minimization of the total\nprobabilities of unsafe generation. To tackle this problem, we propose Dark\nMiner. It entails a recurring three-stage process that comprises mining,\nverifying, and circumventing. It greedily mines embeddings with maximum\ngeneration probabilities of unsafe concepts and reduces unsafe generation more\neffectively. In the experiments, we evaluate its performance on two\ninappropriate concepts, two objects, and two styles. Compared with 6 previous\nstate-of-the-art methods, our method achieves better erasure and defense\nresults in most cases, especially under 4 state-of-the-art attacks, while\npreserving the model's native generation capability. Our code will be available\non GitHub.\n","authors":["Zheling Meng","Bo Peng","Xiaochuan Jin","Yue Jiang","Jing Dong","Wei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2409.17682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17680v1","updated":"2024-09-26T09:43:50Z","published":"2024-09-26T09:43:50Z","title":"Event-based Stereo Depth Estimation: A Survey","summary":" Stereopsis has widespread appeal in robotics as it is the predominant way by\nwhich living beings perceive depth to navigate our 3D world. Event cameras are\nnovel bio-inspired sensors that detect per-pixel brightness changes\nasynchronously, with very high temporal resolution and high dynamic range,\nenabling machine perception in high-speed motion and broad illumination\nconditions. The high temporal precision also benefits stereo matching, making\ndisparity (depth) estimation a popular research area for event cameras ever\nsince its inception. Over the last 30 years, the field has evolved rapidly,\nfrom low-latency, low-power circuit design to current deep learning (DL)\napproaches driven by the computer vision community. The bibliography is vast\nand difficult to navigate for non-experts due its highly interdisciplinary\nnature. Past surveys have addressed distinct aspects of this topic, in the\ncontext of applications, or focusing only on a specific class of techniques,\nbut have overlooked stereo datasets. This survey provides a comprehensive\noverview, covering both instantaneous stereo and long-term methods suitable for\nsimultaneous localization and mapping (SLAM), along with theoretical and\nempirical comparisons. It is the first to extensively review DL methods as well\nas stereo datasets, even providing practical suggestions for creating new\nbenchmarks to advance the field. The main advantages and challenges faced by\nevent-based stereo depth estimation are also discussed. Despite significant\nprogress, challenges remain in achieving optimal performance in not only\naccuracy but also efficiency, a cornerstone of event-based computing. We\nidentify several gaps and propose future research directions. We hope this\nsurvey inspires future research in this area, by serving as an accessible entry\npoint for newcomers, as well as a practical guide for seasoned researchers in\nthe community.\n","authors":["Suman Ghosh","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2409.17680v1.pdf","comment":"28 pages, 20 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.17675v1","updated":"2024-09-26T09:34:33Z","published":"2024-09-26T09:34:33Z","title":"EM-Net: Efficient Channel and Frequency Learning with Mamba for 3D\n Medical Image Segmentation","summary":" Convolutional neural networks have primarily led 3D medical image\nsegmentation but may be limited by small receptive fields. Transformer models\nexcel in capturing global relationships through self-attention but are\nchallenged by high computational costs at high resolutions. Recently, Mamba, a\nstate space model, has emerged as an effective approach for sequential\nmodeling. Inspired by its success, we introduce a novel Mamba-based 3D medical\nimage segmentation model called EM-Net. It not only efficiently captures\nattentive interaction between regions by integrating and selecting channels,\nbut also effectively utilizes frequency domain to harmonize the learning of\nfeatures across varying scales, while accelerating training speed.\nComprehensive experiments on two challenging multi-organ datasets with other\nstate-of-the-art (SOTA) algorithms show that our method exhibits better\nsegmentation accuracy while requiring nearly half the parameter size of SOTA\nmodels and 2x faster training speed.\n","authors":["Ao Chang","Jiajun Zeng","Ruobing Huang","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2409.17675v1.pdf","comment":"10 pages, 3 figures, accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.17674v1","updated":"2024-09-26T09:33:20Z","published":"2024-09-26T09:33:20Z","title":"Self-Supervised Learning of Deviation in Latent Representation for\n Co-speech Gesture Video Generation","summary":" Gestures are pivotal in enhancing co-speech communication. While recent works\nhave mostly focused on point-level motion transformation or fully supervised\nmotion representations through data-driven approaches, we explore the\nrepresentation of gestures in co-speech, with a focus on self-supervised\nrepresentation and pixel-level motion deviation, utilizing a diffusion model\nwhich incorporates latent motion features. Our approach leverages\nself-supervised deviation in latent representation to facilitate hand gestures\ngeneration, which are crucial for generating realistic gesture videos. Results\nof our first experiment demonstrate that our method enhances the quality of\ngenerated videos, with an improvement from 2.7 to 4.5% for FGD, DIV, and FVD,\nand 8.1% for PSNR, 2.5% for SSIM over the current state-of-the-art methods.\n","authors":["Huan Yang","Jiahui Chen","Chaofan Ding","Runhua Shi","Siyu Xiong","Qingqi Hong","Xiaoqi Mo","Xinhan Di"],"pdf_url":"https://arxiv.org/pdf/2409.17674v1.pdf","comment":"5 pages, 5 figures, conference"},{"id":"http://arxiv.org/abs/2404.09486v2","updated":"2024-09-26T09:31:48Z","published":"2024-04-15T06:15:46Z","title":"MMCode: Benchmarking Multimodal Large Language Models for Code\n Generation with Visually Rich Programming Problems","summary":" Programming often involves converting detailed and complex specifications\ninto code, a process during which developers typically utilize visual aids to\nmore effectively convey concepts. While recent developments in Large Multimodal\nModels have demonstrated remarkable abilities in visual reasoning and\nmathematical tasks, there is little work on investigating whether these models\ncan effectively interpret visual elements for code generation. To this end, we\npresent MMCode, the first multi-modal coding dataset for evaluating algorithmic\nproblem-solving skills in visually rich contexts. MMCode contains 3,548\nquestions and 6,620 images collected from real-world programming challenges\nharvested from 10 code competition websites, presenting significant challenges\ndue to the extreme demand for reasoning abilities. Our experiment results show\nthat current state-of-the-art models struggle to solve these problems. The\nresults highlight the lack of powerful vision-code models, and we hope MMCode\ncan serve as an inspiration for future works in this domain. The data and code\nare publicly available at https://github.com/likaixin2000/MMCode.\n","authors":["Kaixin Li","Yuchen Tian","Qisheng Hu","Ziyang Luo","Zhiyong Huang","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09486v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17671v1","updated":"2024-09-26T09:30:37Z","published":"2024-09-26T09:30:37Z","title":"Leveraging Anthropometric Measurements to Improve Human Mesh Estimation\n and Ensure Consistent Body Shapes","summary":" The basic body shape of a person does not change within a single video.\nHowever, most SOTA human mesh estimation (HME) models output a slightly\ndifferent body shape for each video frame, which results in inconsistent body\nshapes for the same person. In contrast, we leverage anthropometric\nmeasurements like tailors are already obtaining from humans for centuries. We\ncreate a model called A2B that converts such anthropometric measurements to\nbody shape parameters of human mesh models. Moreover, we find that finetuned\nSOTA 3D human pose estimation (HPE) models outperform HME models regarding the\nprecision of the estimated keypoints. We show that applying inverse kinematics\n(IK) to the results of such a 3D HPE model and combining the resulting body\npose with the A2B body shape leads to superior and consistent human meshes for\nchallenging datasets like ASPset or fit3D, where we can lower the MPJPE by over\n30 mm compared to SOTA HME models. Further, replacing HME models estimates of\nthe body shape parameters with A2B model results not only increases the\nperformance of these HME models, but also leads to consistent body shapes.\n","authors":["Katja Ludwig","Julian Lorenz","Daniel Kienzle","Tuan Bui","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2409.17671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17663v1","updated":"2024-09-26T09:21:48Z","published":"2024-09-26T09:21:48Z","title":"Explanation Bottleneck Models","summary":" Recent concept-based interpretable models have succeeded in providing\nmeaningful explanations by pre-defined concept sets. However, the dependency on\nthe pre-defined concepts restricts the application because of the limited\nnumber of concepts for explanations. This paper proposes a novel interpretable\ndeep neural network called explanation bottleneck models (XBMs). XBMs generate\na text explanation from the input without pre-defined concepts and then predict\na final task prediction based on the generated explanation by leveraging\npre-trained vision-language encoder-decoder models. To achieve both the target\ntask performance and the explanation quality, we train XBMs through the target\ntask loss with the regularization penalizing the explanation decoder via the\ndistillation from the frozen pre-trained decoder. Our experiments, including a\ncomparison to state-of-the-art concept bottleneck models, confirm that XBMs\nprovide accurate and fluent natural language explanations without pre-defined\nconcept sets. Code will be available at https://github.com/yshinya6/xbm/.\n","authors":["Shin'ya Yamaguchi","Kosuke Nishida"],"pdf_url":"https://arxiv.org/pdf/2409.17663v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.15897v2","updated":"2024-09-26T09:04:58Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v2.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2402.18411v3","updated":"2024-09-26T09:00:00Z","published":"2024-02-28T15:31:45Z","title":"Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal\n Transport","summary":" Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images\nsharing the same category across diverse domains without relying on labeled\ndata. Prior approaches have typically decomposed the UCIR problem into two\ndistinct tasks: intra-domain representation learning and cross-domain feature\nalignment. However, these segregated strategies overlook the potential\nsynergies between these tasks. This paper introduces ProtoOT, a novel Optimal\nTransport formulation explicitly tailored for UCIR, which integrates\nintra-domain feature representation learning and cross-domain alignment into a\nunified framework. ProtoOT leverages the strengths of the K-means clustering\nmethod to effectively manage distribution imbalances inherent in UCIR. By\nutilizing K-means for generating initial prototypes and approximating class\nmarginal distributions, we modify the constraints in Optimal Transport\naccordingly, significantly enhancing its performance in UCIR scenarios.\nFurthermore, we incorporate contrastive learning into the ProtoOT framework to\nfurther improve representation learning. This encourages local semantic\nconsistency among features with similar semantics, while also explicitly\nenforcing separation between features and unmatched prototypes, thereby\nenhancing global discriminativeness. ProtoOT surpasses existing\nstate-of-the-art methods by a notable margin across benchmark datasets.\nNotably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 18.17%,\nand on Office-Home, it demonstrates a P@15 improvement of 3.83%.\n","authors":["Bin Li","Ye Shi","Qian Yu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18411v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18576v5","updated":"2024-09-26T08:57:49Z","published":"2023-11-30T14:15:39Z","title":"Fixed-length Dense Descriptor for Efficient Fingerprint Matching","summary":" In fingerprint matching, fixed-length descriptors generally offer greater\nefficiency compared to minutiae set, but the recognition accuracy is not as\ngood as that of the latter. Although much progress has been made in deep\nlearning based fixed-length descriptors recently, they often fall short when\ndealing with incomplete or partial fingerprints, diverse fingerprint poses, and\nsignificant background noise. In this paper, we propose a three-dimensional\nrepresentation called Fixed-length Dense Descriptor (FDD) for efficient\nfingerprint matching. FDD features great spatial properties, enabling it to\ncapture the spatial relationships of the original fingerprints, thereby\nenhancing interpretability and robustness. Our experiments on various\nfingerprint datasets reveal that FDD outperforms other fixed-length\ndescriptors, especially in matching fingerprints of different areas,\ncross-modal fingerprint matching, and fingerprint matching with background\nnoise.\n","authors":["Zhiyu Pan","Yongjie Duan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v5.pdf","comment":"Accepted by WIFS 2024"},{"id":"http://arxiv.org/abs/2409.17649v1","updated":"2024-09-26T08:55:44Z","published":"2024-09-26T08:55:44Z","title":"Provable Performance Guarantees of Copy Detection Patterns","summary":" Copy Detection Patterns (CDPs) are crucial elements in modern security\napplications, playing a vital role in safeguarding industries such as food,\npharmaceuticals, and cosmetics. Current performance evaluations of CDPs\npredominantly rely on empirical setups using simplistic metrics like Hamming\ndistances or Pearson correlation. These methods are often inadequate due to\ntheir sensitivity to distortions, degradation, and their limitations to\nstationary statistics of printing and imaging. Additionally, machine\nlearning-based approaches suffer from distribution biases and fail to\ngeneralize to unseen counterfeit samples. Given the critical importance of CDPs\nin preventing counterfeiting, including the counterfeit vaccines issue\nhighlighted during the COVID-19 pandemic, there is an urgent need for provable\nperformance guarantees across various criteria. This paper aims to establish a\ntheoretical framework to derive optimal criteria for the analysis,\noptimization, and future development of CDP authentication technologies,\nensuring their reliability and effectiveness in diverse security scenarios.\n","authors":["Joakim Tutt","Slava Voloshynovskiy"],"pdf_url":"https://arxiv.org/pdf/2409.17649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17647v1","updated":"2024-09-26T08:51:29Z","published":"2024-09-26T08:51:29Z","title":"MECD: Unlocking Multi-Event Causal Discovery in Video Reasoning","summary":" Video causal reasoning aims to achieve a high-level understanding of video\ncontent from a causal perspective. However, current video reasoning tasks are\nlimited in scope, primarily executed in a question-answering paradigm and\nfocusing on short videos containing only a single event and simple causal\nrelationships, lacking comprehensive and structured causality analysis for\nvideos with multiple events. To fill this gap, we introduce a new task and\ndataset, Multi-Event Causal Discovery (MECD). It aims to uncover the causal\nrelationships between events distributed chronologically across long videos.\nGiven visual segments and textual descriptions of events, MECD requires\nidentifying the causal associations between these events to derive a\ncomprehensive, structured event-level video causal diagram explaining why and\nhow the final result event occurred. To address MECD, we devise a novel\nframework inspired by the Granger Causality method, using an efficient\nmask-based event prediction model to perform an Event Granger Test, which\nestimates causality by comparing the predicted result event when premise events\nare masked versus unmasked. Furthermore, we integrate causal inference\ntechniques such as front-door adjustment and counterfactual inference to\naddress challenges in MECD like causality confounding and illusory causality.\nExperiments validate the effectiveness of our framework in providing causal\nrelationships in multi-event videos, outperforming GPT-4o and VideoLLaVA by\n5.7% and 4.1%, respectively.\n","authors":["Tieyuan Chen","Huabin Liu","Tianyao He","Yihang Chen","Chaofan Gan","Xiao Ma","Cheng Zhong","Yang Zhang","Yingxue Wang","Hui Lin","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2409.17647v1.pdf","comment":"Accepted at NeurIPS 2024 as a spotlight paper"},{"id":"http://arxiv.org/abs/2409.15246v2","updated":"2024-09-26T08:48:03Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems play a crucial role in achieving Sustainable\nDevelopment Goals by collecting and analyzing vital global data through\nsatellite networks. These systems are essential for tasks like mapping,\ndisaster monitoring, and resource management, but they face challenges in\nprocessing and transmitting large volumes of EO data, especially in specialized\nfields such as agriculture and real-time disaster response. Domain-adapted\nLarge Language Models (LLMs) provide a promising solution by facilitating data\nfusion between extensive EO data and semantic EO data. By improving integration\nand interpretation of diverse datasets, LLMs address the challenges of\nprocessing specialized information in agriculture and disaster response\napplications. This fusion enhances the accuracy and relevance of transmitted\ndata. This paper presents a framework for semantic communication in EO\nsatellite networks, aimed at improving data transmission efficiency and overall\nsystem performance through cognitive processing techniques. The proposed system\nemploys Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic\nData Augmentation (SA) to focus on relevant information while minimizing\ncommunication overhead. By integrating cognitive semantic processing and\ninter-satellite links, the framework enhances the analysis and transmission of\nmultispectral satellite imagery, improving object detection, pattern\nrecognition, and real-time decision-making. The introduction of Cognitive\nSemantic Augmentation (CSA) allows satellites to process and transmit semantic\ninformation, boosting adaptability to changing environments and application\nneeds. This end-to-end architecture is tailored for next-generation satellite\nnetworks, such as those supporting 6G, and demonstrates significant\nimprovements in efficiency and accuracy.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v2.pdf","comment":"18 pages, 10 figures, magazine"},{"id":"http://arxiv.org/abs/2409.17634v1","updated":"2024-09-26T08:31:27Z","published":"2024-09-26T08:31:27Z","title":"P4Q: Learning to Prompt for Quantization in Visual-language Models","summary":" Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence\nin various visual and multimodal tasks, yet the deployment of VLMs on\ndownstream application platforms remains challenging due to their prohibitive\nrequirements of training samples and computing resources. Fine-tuning and\nquantization of VLMs can substantially reduce the sample and computation costs,\nwhich are in urgent need. There are two prevailing paradigms in quantization,\nQuantization-Aware Training (QAT) can effectively quantize large-scale VLMs but\nincur a huge training cost, while low-bit Post-Training Quantization (PTQ)\nsuffers from a notable performance drop. We propose a method that balances\nfine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which\nwe design a lightweight architecture to leverage contrastive loss supervision\nto enhance the recognition performance of a PTQ model. Our method can\neffectively reduce the gap between image features and text features caused by\nlow-bit quantization, based on learnable prompts to reorganize textual\nrepresentations and a low-bit adapter to realign the distributions of image and\ntext features. We also introduce a distillation loss based on cosine similarity\npredictions to distill the quantized model using a full-precision teacher.\nExtensive experimental results demonstrate that our P4Q method outperforms\nprior arts, even achieving comparable results to its full-precision\ncounterparts. For instance, our 8-bit P4Q can theoretically compress the\nCLIP-ViT/B-32 by 4 $\\times$ while achieving 66.94\\% Top-1 accuracy,\noutperforming the learnable prompt fine-tuned full-precision model by 2.24\\%\nwith negligible additional parameters on the ImageNet dataset.\n","authors":["Huixin Sun","Runqi Wang","Yanjing Li","Xianbin Cao","Xiaolong Jiang","Yao Hu","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16723v2","updated":"2024-09-26T08:28:48Z","published":"2024-09-25T08:22:00Z","title":"EAGLE: Towards Efficient Arbitrary Referring Visual Prompts\n Comprehension for Multimodal Large Language Models","summary":" Recently, Multimodal Large Language Models (MLLMs) have sparked great\nresearch interests owing to their exceptional content-reasoning and\ninstruction-following capabilities. To effectively instruct an MLLM, in\naddition to conventional language expressions, the practice of referring to\nobjects by painting with brushes on images has emerged as a prevalent tool\n(referred to as \"referring visual prompts\") due to its efficacy in aligning the\nuser's intention with specific image regions. To accommodate the most common\nreferring visual prompts, namely points, boxes, and masks, existing approaches\ninitially utilize specialized feature encoding modules to capture the semantics\nof the highlighted areas indicated by these prompts. Subsequently, these\nencoded region features are adapted to MLLMs through fine-tuning on a\nmeticulously curated multimodal instruction dataset. However, such designs\nsuffer from redundancy in architecture. Moreover, they face challenges in\neffectively generalizing when encountering a diverse range of arbitrary\nreferring visual prompts in real-life scenarios. To address the above issues,\nwe propose EAGLE, a novel MLLM that empowers comprehension of arbitrary\nreferring visual prompts with less training efforts than existing approaches.\nSpecifically, our EAGLE maintains the innate format of the referring visual\nprompts as colored patches rendered on the given image for conducting the\ninstruction tuning. Our approach embeds referring visual prompts as spatial\nconcepts conveying specific spatial areas comprehensible to the MLLM, with the\nsemantic comprehension of these regions originating from the MLLM itself.\nBesides, we also propose a Geometry-Agnostic Learning paradigm (GAL) to further\ndisentangle the MLLM's region-level comprehension with the specific formats of\nreferring visual prompts. Extensive experiments are conducted to prove the\neffectiveness of our proposed method.\n","authors":["Jiacheng Zhang","Yang Jiao","Shaoxiang Chen","Jingjing Chen","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.16723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17629v1","updated":"2024-09-26T08:23:04Z","published":"2024-09-26T08:23:04Z","title":"Hand-object reconstruction via interaction-aware graph attention\n mechanism","summary":" Estimating the poses of both a hand and an object has become an important\narea of research due to the growing need for advanced vision computing. The\nprimary challenge involves understanding and reconstructing how hands and\nobjects interact, such as contact and physical plausibility. Existing\napproaches often adopt a graph neural network to incorporate spatial\ninformation of hand and object meshes. However, these approaches have not fully\nexploited the potential of graphs without modification of edges within and\nbetween hand- and object-graphs. We propose a graph-based refinement method\nthat incorporates an interaction-aware graph-attention mechanism to account for\nhand-object interactions. Using edges, we establish connections among closely\ncorrelated nodes, both within individual graphs and across different graphs.\nExperiments demonstrate the effectiveness of our proposed method with notable\nimprovements in the realm of physical plausibility.\n","authors":["Taeyun Woo","Tae-Kyun Kim","Jinah Park"],"pdf_url":"https://arxiv.org/pdf/2409.17629v1.pdf","comment":"7 pages, Accepted by ICIP 2024"},{"id":"http://arxiv.org/abs/2405.17251v2","updated":"2024-09-26T08:22:52Z","published":"2024-05-27T15:07:04Z","title":"GenWarp: Single Image to Novel Views with Semantic-Preserving Generative\n Warping","summary":" Generating novel views from a single image remains a challenging task due to\nthe complexity of 3D scenes and the limited diversity in the existing\nmulti-view datasets to train a model on. Recent research combining large-scale\ntext-to-image (T2I) models with monocular depth estimation (MDE) has shown\npromise in handling in-the-wild images. In these methods, an input view is\ngeometrically warped to novel views with estimated depth maps, then the warped\nimage is inpainted by T2I models. However, they struggle with noisy depth maps\nand loss of semantic details when warping an input view to novel viewpoints. In\nthis paper, we propose a novel approach for single-shot novel view synthesis, a\nsemantic-preserving generative warping framework that enables T2I generative\nmodels to learn where to warp and where to generate, through augmenting\ncross-view attention with self-attention. Our approach addresses the\nlimitations of existing methods by conditioning the generative model on source\nview images and incorporating geometric warping signals. Qualitative and\nquantitative evaluations demonstrate that our model outperforms existing\nmethods in both in-domain and out-of-domain scenarios. Project page is\navailable at https://GenWarp-NVS.github.io/.\n","authors":["Junyoung Seo","Kazumi Fukuda","Takashi Shibuya","Takuya Narihira","Naoki Murata","Shoukang Hu","Chieh-Hsin Lai","Seungryong Kim","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2405.17251v2.pdf","comment":"Accepted to NeurIPS 2024 / Project page:\n https://GenWarp-NVS.github.io"},{"id":"http://arxiv.org/abs/2309.11531v2","updated":"2024-09-26T08:20:59Z","published":"2023-09-20T10:50:28Z","title":"EPTQ: Enhanced Post-Training Quantization via Hessian-guided\n Network-wise Optimization","summary":" Quantization is a key method for deploying deep neural networks on edge\ndevices with limited memory and computation resources. Recent improvements in\nPost-Training Quantization (PTQ) methods were achieved by an additional local\noptimization process for learning the weight quantization rounding policy.\nHowever, a gap exists when employing network-wise optimization with small\nrepresentative datasets. In this paper, we propose a new method for enhanced\nPTQ (EPTQ) that employs a network-wise quantization optimization process, which\nbenefits from considering cross-layer dependencies during optimization. EPTQ\nenables network-wise optimization with a small representative dataset using a\nnovel sample-layer attention score based on a label-free Hessian matrix upper\nbound. The label-free approach makes our method suitable for the PTQ scheme. We\ngive a theoretical analysis for the said bound and use it to construct a\nknowledge distillation loss that guides the optimization to focus on the more\nsensitive layers and samples. In addition, we leverage the Hessian upper bound\nto improve the weight quantization parameters selection by focusing on the more\nsensitive elements in the weight tensors. Empirically, by employing EPTQ we\nachieve state-of-the-art results on various models, tasks, and datasets,\nincluding ImageNet classification, COCO object detection, and Pascal-VOC for\nsemantic segmentation.\n","authors":["Ofir Gordon","Elad Cohen","Hai Victor Habi","Arnon Netzer"],"pdf_url":"https://arxiv.org/pdf/2309.11531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14220v2","updated":"2024-09-26T08:13:43Z","published":"2024-09-21T18:52:07Z","title":"Masks and Boxes: Combining the Best of Both Worlds for Multi-Object\n Tracking","summary":" Multi-object tracking (MOT) involves identifying and consistently tracking\nobjects across video sequences. Traditional tracking-by-detection methods,\nwhile effective, often require extensive tuning and lack generalizability. On\nthe other hand, segmentation mask-based methods are more generic but struggle\nwith tracking management, making them unsuitable for MOT. We propose a novel\napproach, McByte, which incorporates a temporally propagated segmentation mask\nas a strong association cue within a tracking-by-detection framework. By\ncombining bounding box and mask information, McByte enhances robustness and\ngeneralizability without per-sequence tuning. Evaluated on four benchmark\ndatasets - DanceTrack, MOT17, SoccerNet-tracking 2022, and KITTI-tracking -\nMcByte demonstrates performance gain in all cases examined. At the same time,\nit outperforms existing mask-based methods. Implementation code will be\nprovided upon acceptance.\n","authors":["Tomasz Stanczyk","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2409.14220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13818v3","updated":"2024-09-26T08:07:16Z","published":"2024-08-25T12:22:50Z","title":"HER2 and FISH Status Prediction in Breast Biopsy H&E-Stained Images\n Using Deep Learning","summary":" The current standard for detecting human epidermal growth factor receptor 2\n(HER2) status in breast cancer patients relies on HER2 amplification,\nidentified through fluorescence in situ hybridization (FISH) or\nimmunohistochemistry (IHC). However, hematoxylin and eosin (H\\&E) tumor stains\nare more widely available, and accurately predicting HER2 status using H\\&E\ncould reduce costs and expedite treatment selection. Deep Learning algorithms\nfor H&E have shown effectiveness in predicting various cancer features and\nclinical outcomes, including moderate success in HER2 status prediction. In\nthis work, we employed a customized weak supervision classification technique\ncombined with MoCo-v2 contrastive learning to predict HER2 status. We trained\nour pipeline on 182 publicly available H&E Whole Slide Images (WSIs) from The\nCancer Genome Atlas (TCGA), for which annotations by the pathology team at Yale\nSchool of Medicine are publicly available. Our pipeline achieved an Area Under\nthe Curve (AUC) of 0.85 across four different test folds. Additionally, we\ntested our model on 44 H&E slides from the TCGA-BRCA dataset, which had an HER2\nscore of 2+ and included corresponding HER2 status and FISH test results. These\ncases are considered equivocal for IHC, requiring an expensive FISH test on\ntheir IHC slides for disambiguation. Our pipeline demonstrated an AUC of 0.81\non these challenging H&E slides. Reducing the need for FISH test can have\nsignificant implications in cancer treatment equity for underserved\npopulations.\n","authors":["Ardhendu Sekhar","Vrinda Goel","Garima Jain","Abhijeet Patil","Ravi Kant Gupta","Tripti Bameta","Swapnil Rane","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2408.13818v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17612v1","updated":"2024-09-26T08:03:19Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic dataset that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06190v2","updated":"2024-09-26T07:56:50Z","published":"2024-08-12T14:40:38Z","title":"FruitNeRF: A Unified Neural Radiance Field based Fruit Counting\n Framework","summary":" We introduce FruitNeRF, a unified novel fruit counting framework that\nleverages state-of-the-art view synthesis methods to count any fruit type\ndirectly in 3D. Our framework takes an unordered set of posed images captured\nby a monocular camera and segments fruit in each image. To make our system\nindependent of the fruit type, we employ a foundation model that generates\nbinary segmentation masks for any fruit. Utilizing both modalities, RGB and\nsemantic, we train a semantic neural radiance field. Through uniform volume\nsampling of the implicit Fruit Field, we obtain fruit-only point clouds. By\napplying cascaded clustering on the extracted point cloud, our approach\nachieves precise fruit count.The use of neural radiance fields provides\nsignificant advantages over conventional methods such as object tracking or\noptical flow, as the counting itself is lifted into 3D. Our method prevents\ndouble counting fruit and avoids counting irrelevant fruit.We evaluate our\nmethodology using both real-world and synthetic datasets. The real-world\ndataset consists of three apple trees with manually counted ground truths, a\nbenchmark apple dataset with one row and ground truth fruit location, while the\nsynthetic dataset comprises various fruit types including apple, plum, lemon,\npear, peach, and mango.Additionally, we assess the performance of fruit\ncounting using the foundation model compared to a U-Net.\n","authors":["Lukas Meyer","Andreas Gilson","Ute Schmid","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2408.06190v2.pdf","comment":"Project Page: https://meyerls.github.io/fruit_nerf/"},{"id":"http://arxiv.org/abs/2409.17610v1","updated":"2024-09-26T07:55:57Z","published":"2024-09-26T07:55:57Z","title":"ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context\n Information in Multi-Turn Multimodal Medical Dialogue","summary":" The rocketing prosperity of large language models (LLMs) in recent years has\nboosted the prevalence of vision-language models (VLMs) in the medical sector.\nIn our online medical consultation scenario, a doctor responds to the texts and\nimages provided by a patient in multiple rounds to diagnose her/his health\ncondition, forming a multi-turn multimodal medical dialogue format. Unlike\nhigh-quality images captured by professional equipment in traditional medical\nvisual question answering (Med-VQA), the images in our case are taken by\npatients' mobile phones. These images have poor quality control, with issues\nsuch as excessive background elements and the lesion area being significantly\noff-center, leading to degradation of vision-language alignment in the model\ntraining phase. In this paper, we propose ZALM3, a Zero-shot strategy to\nimprove vision-language ALignment in Multi-turn Multimodal Medical dialogue.\nSince we observe that the preceding text conversations before an image can\ninfer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to\nsummarize the keywords from the preceding context and a visual grounding model\nto extract the RoIs. The updated images eliminate unnecessary background noise\nand provide more effective vision-language alignment. To better evaluate our\nproposed method, we design a new subjective assessment metric for multi-turn\nunimodal/multimodal medical dialogue to provide a fine-grained performance\ncomparison. Our experiments across three different clinical departments\nremarkably demonstrate the efficacy of ZALM3 with statistical significance.\n","authors":["Zhangpu Li","Changhong Zou","Suxue Ma","Zhicheng Yang","Chen Du","Youbao Tang","Zhenjie Cao","Ning Zhang","Jui-Hsin Lai","Ruei-Sung Lin","Yuan Ni","Xingzhi Sun","Jing Xiao","Kai Zhang","Mei Han"],"pdf_url":"https://arxiv.org/pdf/2409.17610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17608v1","updated":"2024-09-26T07:48:20Z","published":"2024-09-26T07:48:20Z","title":"Appearance Blur-driven AutoEncoder and Motion-guided Memory Module for\n Video Anomaly Detection","summary":" Video anomaly detection (VAD) often learns the distribution of normal samples\nand detects the anomaly through measuring significant deviations, but the\nundesired generalization may reconstruct a few anomalies thus suppressing the\ndeviations. Meanwhile, most VADs cannot cope with cross-dataset validation for\nnew target domains, and few-shot methods must laboriously rely on model-tuning\nfrom the target domain to complete domain adaptation. To address these\nproblems, we propose a novel VAD method with a motion-guided memory module to\nachieve cross-dataset validation with zero-shot. First, we add Gaussian blur to\nthe raw appearance images, thereby constructing the global pseudo-anomaly,\nwhich serves as the input to the network. Then, we propose multi-scale residual\nchannel attention to deblur the pseudo-anomaly in normal samples. Next, memory\nitems are obtained by recording the motion features in the training phase,\nwhich are used to retrieve the motion features from the raw information in the\ntesting phase. Lastly, our method can ignore the blurred real anomaly through\nattention and rely on motion memory items to increase the normality gap between\nnormal and abnormal motion. Extensive experiments on three benchmark datasets\ndemonstrate the effectiveness of the proposed method. Compared with\ncross-domain methods, our method achieves competitive performance without\nadaptation during testing.\n","authors":["Jiahao Lyu","Minghua Zhao","Jing Hu","Xuewen Huang","Shuangli Du","Cheng Shi","Zhiyong Lv"],"pdf_url":"https://arxiv.org/pdf/2409.17608v1.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.03944v2","updated":"2024-09-26T07:47:50Z","published":"2024-07-22T03:56:27Z","title":"Improving Fast Adversarial Training Paradigm: An Example Taxonomy\n Perspective","summary":" While adversarial training is an effective defense method against adversarial\nattacks, it notably increases the training cost. To this end, fast adversarial\ntraining (FAT) is presented for efficient training and has become a hot\nresearch topic. However, FAT suffers from catastrophic overfitting, which leads\nto a performance drop compared with multi-step adversarial training. However,\nthe cause of catastrophic overfitting remains unclear and lacks exploration. In\nthis paper, we present an example taxonomy in FAT, which identifies that\ncatastrophic overfitting is caused by the imbalance between the inner and outer\noptimization in FAT. Furthermore, we investigated the impact of varying degrees\nof training loss, revealing a correlation between training loss and\ncatastrophic overfitting. Based on these observations, we redesign the loss\nfunction in FAT with the proposed dynamic label relaxation to concentrate the\nloss range and reduce the impact of misclassified examples. Meanwhile, we\nintroduce batch momentum initialization to enhance the diversity to prevent\ncatastrophic overfitting in an efficient manner. Furthermore, we also propose\nCatastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate\ntraining strategy for examples based on their loss degree. Our proposed method,\nnamed example taxonomy aware FAT (ETA), establishes an improved paradigm for\nFAT. Experiment results demonstrate our ETA achieves state-of-the-art\nperformance. Comprehensive experiments on four standard datasets demonstrate\nthe competitiveness of our proposed method.\n","authors":["Jie Gui","Chengze Jiang","Minjing Dong","Kun Tong","Xinli Shi","Yuan Yan Tang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.03944v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.17605v1","updated":"2024-09-26T07:43:12Z","published":"2024-09-26T07:43:12Z","title":"Good Data Is All Imitation Learning Needs","summary":" In this paper, we address the limitations of traditional teacher-student\nmodels, imitation learning, and behaviour cloning in the context of\nAutonomous/Automated Driving Systems (ADS), where these methods often struggle\nwith incomplete coverage of real-world scenarios. To enhance the robustness of\nsuch models, we introduce the use of Counterfactual Explanations (CFEs) as a\nnovel data augmentation technique for end-to-end ADS. CFEs, by generating\ntraining samples near decision boundaries through minimal input modifications,\nlead to a more comprehensive representation of expert driver strategies,\nparticularly in safety-critical scenarios. This approach can therefore help\nimprove the model's ability to handle rare and challenging driving events, such\nas anticipating darting out pedestrians, ultimately leading to safer and more\ntrustworthy decision-making for ADS. Our experiments in the CARLA simulator\ndemonstrate that CF-Driver outperforms the current state-of-the-art method,\nachieving a higher driving score and lower infraction rates. Specifically,\nCF-Driver attains a driving score of 84.2, surpassing the previous best model\nby 15.02 percentage points. These results highlight the effectiveness of\nincorporating CFEs in training end-to-end ADS. To foster further research, the\nCF-Driver code is made publicly available.\n","authors":["Amir Samadi","Konstantinos Koufos","Kurt Debattista","Mehrdad Dianati"],"pdf_url":"https://arxiv.org/pdf/2409.17605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17601v1","updated":"2024-09-26T07:35:23Z","published":"2024-09-26T07:35:23Z","title":"TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for\n Multimodal Contrastive Learning","summary":" Pre-trained large models for multimodal contrastive learning, such as CLIP,\nhave been widely recognized in the industry as highly susceptible to\ndata-poisoned backdoor attacks. This poses significant risks to downstream\nmodel training. In response to such potential threats, finetuning offers a\nsimpler and more efficient defense choice compared to retraining large models\nwith augmented data. In the supervised learning domain, fine-tuning defense\nstrategies can achieve excellent defense performance. However, in the\nunsupervised and semi-supervised domain, we find that when CLIP faces some\ncomplex attack techniques, the existing fine-tuning defense strategy,\nCleanCLIP, has some limitations on defense performance. The synonym\nsubstitution of its text-augmentation is insufficient to enhance the text\nfeature space. To compensate for this weakness, we improve it by proposing a\nfine-grained \\textbf{T}ext \\textbf{A}lignment \\textbf{C}leaner (TA-Cleaner) to\ncut off feature connections of backdoor triggers. We randomly select a few\nsamples for positive and negative subtext generation at each epoch of\nCleanCLIP, and align the subtexts to the images to strengthen the text\nself-supervision. We evaluate the effectiveness of our TA-Cleaner against six\nattack algorithms and conduct comprehensive zero-shot classification tests on\nImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves\nstate-of-the-art defensiveness among finetuning-based defense techniques. Even\nwhen faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms\nCleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\\% and 63.88\\%,\nrespectively.\n","authors":["Yuan Xun","Siyuan Liang","Xiaojun Jia","Xinwei Liu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17597v1","updated":"2024-09-26T07:24:09Z","published":"2024-09-26T07:24:09Z","title":"Unifying Dimensions: A Linear Adaptive Approach to Lightweight Image\n Super-Resolution","summary":" Window-based transformers have demonstrated outstanding performance in\nsuper-resolution tasks due to their adaptive modeling capabilities through\nlocal self-attention (SA). However, they exhibit higher computational\ncomplexity and inference latency than convolutional neural networks. In this\npaper, we first identify that the adaptability of the Transformers is derived\nfrom their adaptive spatial aggregation and advanced structural design, while\ntheir high latency results from the computational costs and memory layout\ntransformations associated with the local SA. To simulate this aggregation\napproach, we propose an effective convolution-based linear focal separable\nattention (FSA), allowing for long-range dynamic modeling with linear\ncomplexity. Additionally, we introduce an effective dual-branch structure\ncombined with an ultra-lightweight information exchange module (IEM) to enhance\nthe aggregation of information by the Token Mixer. Finally, with respect to the\nstructure, we modify the existing spatial-gate-based feedforward neural\nnetworks by incorporating a self-gate mechanism to preserve high-dimensional\nchannel information, enabling the modeling of more complex relationships. With\nthese advancements, we construct a convolution-based Transformer framework\nnamed the linear adaptive mixer network (LAMNet). Extensive experiments\ndemonstrate that LAMNet achieves better performance than existing SA-based\nTransformer methods while maintaining the computational efficiency of\nconvolutional neural networks, which can achieve a \\(3\\times\\) speedup of\ninference time. The code will be publicly available at:\nhttps://github.com/zononhzy/LAMNet.\n","authors":["Zhenyu Hu","Wanjie Sun"],"pdf_url":"https://arxiv.org/pdf/2409.17597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17937v3","updated":"2024-09-26T07:23:49Z","published":"2024-03-26T17:59:58Z","title":"Efficient Video Object Segmentation via Modulated Cross-Attention Memory","summary":" Recently, transformer-based approaches have shown promising results for\nsemi-supervised video object segmentation. However, these approaches typically\nstruggle on long videos due to increased GPU memory demands, as they frequently\nexpand the memory bank every few frames. We propose a transformer-based\napproach, named MAVOS, that introduces an optimized and dynamic long-term\nmodulated cross-attention (MCA) memory to model temporal smoothness without\nrequiring frequent memory expansion. The proposed MCA effectively encodes both\nlocal and global features at various levels of granularity while efficiently\nmaintaining consistent speed regardless of the video length. Extensive\nexperiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017,\ndemonstrate the effectiveness of our proposed contributions leading to\nreal-time inference and markedly reduced memory demands without any degradation\nin segmentation accuracy on long videos. Compared to the best existing\ntransformer-based approach, our MAVOS increases the speed by 7.6x, while\nsignificantly reducing the GPU memory by 87% with comparable segmentation\nperformance on short and long video datasets. Notably on the LVOS dataset, our\nMAVOS achieves a J&F score of 63.3% while operating at 37 frames per second\n(FPS) on a single V100 GPU. Our code and models will be publicly available at:\nhttps://github.com/Amshaker/MAVOS.\n","authors":["Abdelrahman Shaker","Syed Talal Wasim","Martin Danelljan","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17937v3.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2409.17589v1","updated":"2024-09-26T07:12:04Z","published":"2024-09-26T07:12:04Z","title":"Improving Fast Adversarial Training via Self-Knowledge Guidance","summary":" Adversarial training has achieved remarkable advancements in defending\nagainst adversarial attacks. Among them, fast adversarial training (FAT) is\ngaining attention for its ability to achieve competitive robustness with fewer\ncomputing resources. Existing FAT methods typically employ a uniform strategy\nthat optimizes all training data equally without considering the influence of\ndifferent examples, which leads to an imbalanced optimization. However, this\nimbalance remains unexplored in the field of FAT. In this paper, we conduct a\ncomprehensive study of the imbalance issue in FAT and observe an obvious class\ndisparity regarding their performances. This disparity could be embodied from a\nperspective of alignment between clean and robust accuracy. Based on the\nanalysis, we mainly attribute the observed misalignment and disparity to the\nimbalanced optimization in FAT, which motivates us to optimize different\ntraining data adaptively to enhance robustness. Specifically, we take disparity\nand misalignment into consideration. First, we introduce self-knowledge guided\nregularization, which assigns differentiated regularization weights to each\nclass based on its training state, alleviating class disparity. Additionally,\nwe propose self-knowledge guided label relaxation, which adjusts label\nrelaxation according to the training accuracy, alleviating the misalignment and\nimproving robustness. By combining these methods, we formulate the\nSelf-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge\nduring training to enhance the adversarial robustness without compromising\ntraining efficiency. Extensive experiments on four standard datasets\ndemonstrate that the SKG-FAT improves the robustness and preserves competitive\nclean accuracy, outperforming the state-of-the-art methods.\n","authors":["Chengze Jiang","Junkai Wang","Minjing Dong","Jie Gui","Xinli Shi","Yuan Cao","Yuan Yan Tang","James Tin-Yau Kwok"],"pdf_url":"https://arxiv.org/pdf/2409.17589v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2409.17583v1","updated":"2024-09-26T07:01:29Z","published":"2024-09-26T07:01:29Z","title":"Let the Quantum Creep In: Designing Quantum Neural Network Models by\n Gradually Swapping Out Classical Components","summary":" Artificial Intelligence (AI), with its multiplier effect and wide\napplications in multiple areas, could potentially be an important application\nof quantum computing. Since modern AI systems are often built on neural\nnetworks, the design of quantum neural networks becomes a key challenge in\nintegrating quantum computing into AI. To provide a more fine-grained\ncharacterisation of the impact of quantum components on the performance of\nneural networks, we propose a framework where classical neural network layers\nare gradually replaced by quantum layers that have the same type of input and\noutput while keeping the flow of information between layers unchanged,\ndifferent from most current research in quantum neural network, which favours\nan end-to-end quantum model. We start with a simple three-layer classical\nneural network without any normalisation layers or activation functions, and\ngradually change the classical layers to the corresponding quantum versions. We\nconduct numerical experiments on image classification datasets such as the\nMNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of\nperformance brought by the systematic introduction of quantum components.\nThrough this framework, our research sheds new light on the design of future\nquantum neural network models where it could be more favourable to search for\nmethods and frameworks that harness the advantages from both the classical and\nquantum worlds.\n","authors":["Peiyong Wang","Casey. R. Myers","Lloyd C. L. Hollenberg","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2409.17583v1.pdf","comment":"50 pages (including Appendix), many figures, accepted as a poster on\n QTML2024. Code available at\n https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In"},{"id":"http://arxiv.org/abs/2409.17576v1","updated":"2024-09-26T06:46:40Z","published":"2024-09-26T06:46:40Z","title":"ID$^3$: Identity-Preserving-yet-Diversified Diffusion Models for\n Synthetic Face Recognition","summary":" Synthetic face recognition (SFR) aims to generate synthetic face datasets\nthat mimic the distribution of real face data, which allows for training face\nrecognition models in a privacy-preserving manner. Despite the remarkable\npotential of diffusion models in image generation, current diffusion-based SFR\nmodels struggle with generalization to real-world faces. To address this\nlimitation, we outline three key objectives for SFR: (1) promoting diversity\nacross identities (inter-class diversity), (2) ensuring diversity within each\nidentity by injecting various facial attributes (intra-class diversity), and\n(3) maintaining identity consistency within each identity group (intra-class\nidentity preservation). Inspired by these goals, we introduce a\ndiffusion-fueled SFR model termed $\\text{ID}^3$. $\\text{ID}^3$ employs an\nID-preserving loss to generate diverse yet identity-consistent facial\nappearances. Theoretically, we show that minimizing this loss is equivalent to\nmaximizing the lower bound of an adjusted conditional log-likelihood over\nID-preserving data. This equivalence motivates an ID-preserving sampling\nalgorithm, which operates over an adjusted gradient vector field, enabling the\ngeneration of fake face recognition datasets that approximate the distribution\nof real-world faces. Extensive experiments across five challenging benchmarks\nvalidate the advantages of $\\text{ID}^3$.\n","authors":["Shen Li","Jianqing Xu","Jiaying Wu","Miao Xiong","Ailin Deng","Jiazhen Ji","Yuge Huang","Wenjie Feng","Shouhong Ding","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2409.17576v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.12598v2","updated":"2024-09-26T06:31:25Z","published":"2024-08-22T17:59:01Z","title":"ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor\n Reconstruction","summary":" Neural implicit reconstruction via volume rendering has demonstrated its\neffectiveness in recovering dense 3D surfaces. However, it is non-trivial to\nsimultaneously recover meticulous geometry and preserve smoothness across\nregions with differing characteristics. To address this issue, previous methods\ntypically employ geometric priors, which are often constrained by the\nperformance of the prior models. In this paper, we propose ND-SDF, which learns\na Normal Deflection field to represent the angular deviation between the scene\nnormal and the prior normal. Unlike previous methods that uniformly apply\ngeometric priors on all samples, introducing significant bias in accuracy, our\nproposed normal deflection field dynamically learns and adapts the utilization\nof samples based on their specific characteristics, thereby improving both the\naccuracy and effectiveness of the model. Our method not only obtains smooth\nweakly textured regions such as walls and floors but also preserves the\ngeometric details of complex structures. In addition, we introduce a novel ray\nsampling strategy based on the deflection angle to facilitate the unbiased\nrendering process, which significantly improves the quality and accuracy of\nintricate surfaces, especially on thin structures. Consistent improvements on\nvarious challenging datasets demonstrate the superiority of our method.\n","authors":["Ziyu Tang","Weicai Ye","Yifan Wang","Di Huang","Hujun Bao","Tong He","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17566v1","updated":"2024-09-26T06:28:05Z","published":"2024-09-26T06:28:05Z","title":"Flexiffusion: Segment-wise Neural Architecture Search for Flexible\n Denoising Schedule","summary":" Diffusion models are cutting-edge generative models adept at producing\ndiverse, high-quality images. Despite their effectiveness, these models often\nrequire significant computational resources owing to their numerous sequential\ndenoising steps and the significant inference cost of each step. Recently,\nNeural Architecture Search (NAS) techniques have been employed to automatically\nsearch for faster generation processes. However, NAS for diffusion is\ninherently time-consuming as it requires estimating thousands of diffusion\nmodels to search for the optimal one. In this paper, we introduce Flexiffusion,\na novel training-free NAS paradigm designed to accelerate diffusion models by\nconcurrently optimizing generation steps and network structures. Specifically,\nwe partition the generation process into isometric step segments, each\nsequentially composed of a full step, multiple partial steps, and several null\nsteps. The full step computes all network blocks, while the partial step\ninvolves part of the blocks, and the null step entails no computation.\nFlexiffusion autonomously explores flexible step combinations for each segment,\nsubstantially reducing search costs and enabling greater acceleration compared\nto the state-of-the-art (SOTA) method for diffusion models. Our searched models\nreported speedup factors of $2.6\\times$ and $1.5\\times$ for the original\nLDM-4-G and the SOTA, respectively. The factors for Stable Diffusion V1.5 and\nthe SOTA are $5.1\\times$ and $2.0\\times$. We also verified the performance of\nFlexiffusion on multiple datasets, and positive experiment results indicate\nthat Flexiffusion can effectively reduce redundancy in diffusion models.\n","authors":["Hongtao Huang","Xiaojun Chang","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.17566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17565v1","updated":"2024-09-26T06:27:26Z","published":"2024-09-26T06:27:26Z","title":"Pixel-Space Post-Training of Latent Diffusion Models","summary":" Latent diffusion models (LDMs) have made significant advancements in the\nfield of image generation in recent years. One major advantage of LDMs is their\nability to operate in a compressed latent space, allowing for more efficient\ntraining and deployment. However, despite these advantages, challenges with\nLDMs still remain. For example, it has been observed that LDMs often generate\nhigh-frequency details and complex compositions imperfectly. We hypothesize\nthat one reason for these flaws is due to the fact that all pre- and\npost-training of LDMs are done in latent space, which is typically $8 \\times 8$\nlower spatial-resolution than the output images. To address this issue, we\npropose adding pixel-space supervision in the post-training process to better\npreserve high-frequency details. Experimentally, we show that adding a\npixel-space objective significantly improves both supervised quality\nfine-tuning and preference-based post-training by a large margin on a\nstate-of-the-art DiT transformer and U-Net diffusion models in both visual\nquality and visual flaw metrics, while maintaining the same text alignment\nquality.\n","authors":["Christina Zhang","Simran Motwani","Matthew Yu","Ji Hou","Felix Juefei-Xu","Sam Tsai","Peter Vajda","Zijian He","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17564v1","updated":"2024-09-26T06:27:15Z","published":"2024-09-26T06:27:15Z","title":"General Compression Framework for Efficient Transformer Object Tracking","summary":" Transformer-based trackers have established a dominant role in the field of\nvisual object tracking. While these trackers exhibit promising performance,\ntheir deployment on resource-constrained devices remains challenging due to\ninefficiencies. To improve the inference efficiency and reduce the computation\ncost, prior approaches have aimed to either design lightweight trackers or\ndistill knowledge from larger teacher models into more compact student\ntrackers. However, these solutions often sacrifice accuracy for speed. Thus, we\npropose a general model compression framework for efficient transformer object\ntracking, named CompressTracker, to reduce the size of a pre-trained tracking\nmodel into a lightweight tracker with minimal performance degradation. Our\napproach features a novel stage division strategy that segments the transformer\nlayers of the teacher model into distinct stages, enabling the student model to\nemulate each corresponding teacher stage more effectively. Additionally, we\nalso design a unique replacement training technique that involves randomly\nsubstituting specific stages in the student model with those from the teacher\nmodel, as opposed to training the student model in isolation. Replacement\ntraining enhances the student model's ability to replicate the teacher model's\nbehavior. To further forcing student model to emulate teacher model, we\nincorporate prediction guidance and stage-wise feature mimicking to provide\nadditional supervision during the teacher model's compression process. Our\nframework CompressTracker is structurally agnostic, making it compatible with\nany transformer architecture. We conduct a series of experiment to verify the\neffectiveness and generalizability of CompressTracker. Our CompressTracker-4\nwith 4 transformer layers, which is compressed from OSTrack, retains about 96%\nperformance on LaSOT (66.1% AUC) while achieves 2.17x speed up.\n","authors":["Lingyi Hong","Jinglun Li","Xinyu Zhou","Shilin Yan","Pinxue Guo","Kaixun Jiang","Zhaoyu Chen","Shuyong Gao","Wei Zhang","Hong Lu","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17560v1","updated":"2024-09-26T06:12:08Z","published":"2024-09-26T06:12:08Z","title":"Dynamic Subframe Splitting and Spatio-Temporal Motion Entangled Sparse\n Attention for RGB-E Tracking","summary":" Event-based bionic camera asynchronously captures dynamic scenes with high\ntemporal resolution and high dynamic range, offering potential for the\nintegration of events and RGB under conditions of illumination degradation and\nfast motion. Existing RGB-E tracking methods model event characteristics\nutilising attention mechanism of Transformer before integrating both\nmodalities. Nevertheless, these methods involve aggregating the event stream\ninto a single event frame, lacking the utilisation of the temporal information\ninherent in the event stream.Moreover, the traditional attention mechanism is\nwell-suited for dense semantic features, while the attention mechanism for\nsparse event features require revolution. In this paper, we propose a dynamic\nevent subframe splitting strategy to split the event stream into more\nfine-grained event clusters, aiming to capture spatio-temporal features that\ncontain motion cues. Based on this, we design an event-based sparse attention\nmechanism to enhance the interaction of event features in temporal and spatial\ndimensions. The experimental results indicate that our method outperforms\nexisting state-of-the-art methods on the FE240 and COESOT datasets, providing\nan effective processing manner for the event data.\n","authors":["Pengcheng Shao","Tianyang Xu","Xuefeng Zhu","Xiaojun Wu","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2409.17560v1.pdf","comment":"15 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2409.16225v2","updated":"2024-09-26T06:04:21Z","published":"2024-09-24T16:38:41Z","title":"VideoPatchCore: An Effective Method to Memorize Normality for Video\n Anomaly Detection","summary":" Video anomaly detection (VAD) is a crucial task in video analysis and\nsurveillance within computer vision. Currently, VAD is gaining attention with\nmemory techniques that store the features of normal frames. The stored features\nare utilized for frame reconstruction, identifying an abnormality when a\nsignificant difference exists between the reconstructed and input frames.\nHowever, this approach faces several challenges due to the simultaneous\noptimization required for both the memory and encoder-decoder model. These\nchallenges include increased optimization difficulty, complexity of\nimplementation, and performance variability depending on the memory size. To\naddress these challenges,we propose an effective memory method for VAD, called\nVideoPatchCore. Inspired by PatchCore, our approach introduces a structure that\nprioritizes memory optimization and configures three types of memory tailored\nto the characteristics of video data. This method effectively addresses the\nlimitations of existing memory-based methods, achieving good performance\ncomparable to state-of-the-art methods. Furthermore, our method requires no\ntraining and is straightforward to implement, making VAD tasks more accessible.\nOur code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore.\n","authors":["Sunghyun Ahn","Youngwan Jo","Kijung Lee","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2409.16225v2.pdf","comment":"Accepted to ACCV 2024"},{"id":"http://arxiv.org/abs/2409.17555v1","updated":"2024-09-26T05:57:35Z","published":"2024-09-26T05:57:35Z","title":"Advancing Open-Set Domain Generalization Using Evidential Bi-Level\n Hardest Domain Scheduler","summary":" In Open-Set Domain Generalization (OSDG), the model is exposed to both new\nvariations of data appearance (domains) and open-set conditions, where both\nknown and novel categories are present at test time. The challenges of this\ntask arise from the dual need to generalize across diverse domains and\naccurately quantify category novelty, which is critical for applications in\ndynamic environments. Recently, meta-learning techniques have demonstrated\nsuperior results in OSDG, effectively orchestrating the meta-train and -test\ntasks by employing varied random categories and predefined domain partition\nstrategies. These approaches prioritize a well-designed training schedule over\ntraditional methods that focus primarily on data augmentation and the\nenhancement of discriminative feature learning. The prevailing meta-learning\nmodels in OSDG typically utilize a predefined sequential domain scheduler to\nstructure data partitions. However, a crucial aspect that remains inadequately\nexplored is the influence brought by strategies of domain schedulers during\ntraining. In this paper, we observe that an adaptive domain scheduler benefits\nmore in OSDG compared with prefixed sequential and random domain schedulers. We\npropose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve\nan adaptive domain scheduler. This method strategically sequences domains by\nassessing their reliabilities in utilizing a follower network, trained with\nconfidence scores learned in an evidential manner, regularized by max rebiasing\ndiscrepancy, and optimized in a bi-level manner. The results show that our\nmethod substantially improves OSDG performance and achieves more discriminative\nembeddings for both the seen and unseen categories. The source code will be\navailable at https://github.com/KPeng9510/EBiL-HaDS.\n","authors":["Kunyu Peng","Di Wen","Kailun Yang","Ao Luo","Yufan Chen","Jia Fu","M. Saquib Sarfraz","Alina Roitberg","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2409.17555v1.pdf","comment":"Accepted to NeurIPS 2024. The source code will be available at\n https://github.com/KPeng9510/EBiL-HaDS"},{"id":"http://arxiv.org/abs/2406.06911v3","updated":"2024-09-26T05:47:36Z","published":"2024-06-11T03:09:37Z","title":"AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising","summary":" Diffusion models have garnered significant interest from the community for\ntheir great generative ability across various applications. However, their\ntypical multi-step sequential-denoising nature gives rise to high cumulative\nlatency, thereby precluding the possibilities of parallel computation. To\naddress this, we introduce AsyncDiff, a universal and plug-and-play\nacceleration scheme that enables model parallelism across multiple devices. Our\napproach divides the cumbersome noise prediction model into multiple\ncomponents, assigning each to a different device. To break the dependency chain\nbetween these components, it transforms the conventional sequential denoising\ninto an asynchronous process by exploiting the high similarity between hidden\nstates in consecutive diffusion steps. Consequently, each component is\nfacilitated to compute in parallel on separate devices. The proposed strategy\nsignificantly reduces inference latency while minimally impacting the\ngenerative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff\nachieves a 2.7x speedup with negligible degradation and a 4.0x speedup with\nonly a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our\nexperiments also demonstrate that AsyncDiff can be readily applied to video\ndiffusion models with encouraging performances. The code is available at\nhttps://github.com/czg1225/AsyncDiff.\n","authors":["Zigeng Chen","Xinyin Ma","Gongfan Fang","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06911v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2312.05284v4","updated":"2024-09-26T05:41:56Z","published":"2023-12-08T12:48:53Z","title":"SlimSAM: 0.1% Data Makes Segment Anything Slim","summary":" Current approaches for compressing the Segment Anything Model (SAM) yield\ncommendable results, yet necessitate extensive data to train a new network from\nscratch. Employing conventional pruning techniques can remarkably reduce data\nrequirements but would suffer from a degradation in performance. To address\nthis challenging trade-off, we introduce SlimSAM, a novel data-efficient SAM\ncompression method that achieves superior performance with extremely less\ntraining data. The essence of SlimSAM is encapsulated in the alternate slimming\nframework which effectively enhances knowledge inheritance under severely\nlimited training data availability and exceptional pruning ratio. Diverging\nfrom prior techniques, our framework progressively compresses the model by\nalternately pruning and distilling distinct, decoupled sub-structures.\nDisturbed Taylor pruning is also proposed to address the misalignment between\nthe pruning objective and training target, thereby boosting the\npost-distillation after pruning. SlimSAM yields significant performance\nimprovements while demanding over 10 times less training data than any other\nexisting compression methods. Even when compared to the original SAM, SlimSAM\nachieves approaching performance while reducing parameter counts to merely 1.4%\n(9.1M), MACs to 0.8% (23G), and requiring only 0.1% (10k) of the SAM training\ndata. The code is available at http://github.com/czg1225/SlimSAM.\n","authors":["Zigeng Chen","Gongfan Fang","Xinyin Ma","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.05284v4.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17547v1","updated":"2024-09-26T05:33:30Z","published":"2024-09-26T05:33:30Z","title":"Triple Point Masking","summary":" Existing 3D mask learning methods encounter performance bottlenecks under\nlimited data, and our objective is to overcome this limitation. In this paper,\nwe introduce a triple point masking scheme, named TPM, which serves as a\nscalable framework for pre-training of masked autoencoders to achieve\nmulti-mask learning for 3D point clouds. Specifically, we augment the baselines\nwith two additional mask choices (i.e., medium mask and low mask) as our core\ninsight is that the recovery process of an object can manifest in diverse ways.\nPrevious high-masking schemes focus on capturing the global representation but\nlack the fine-grained recovery capability, so that the generated pre-trained\nweights tend to play a limited role in the fine-tuning process. With the\nsupport of the proposed TPM, available methods can exhibit more flexible and\naccurate completion capabilities, enabling the potential autoencoder in the\npre-training stage to consider multiple representations of a single 3D object.\nIn addition, an SVM-guided weight selection module is proposed to fill the\nencoder parameters for downstream networks with the optimal weight during the\nfine-tuning stage, maximizing linear accuracy and facilitating the acquisition\nof intricate representations for new objects. Extensive experiments show that\nthe four baselines equipped with the proposed TPM achieve comprehensive\nperformance improvements on various downstream tasks.\n","authors":["Jiaming Liu","Linghe Kong","Yue Wu","Maoguo Gong","Hao Li","Qiguang Miao","Wenping Ma","Can Qin"],"pdf_url":"https://arxiv.org/pdf/2409.17547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05769v2","updated":"2024-09-26T05:10:23Z","published":"2024-05-09T13:45:04Z","title":"Exploring Text-Guided Single Image Editing for Remote Sensing Images","summary":" Artificial intelligence generative content (AIGC) has significantly impacted\nimage generation in the field of remote sensing. However, the equally important\narea of remote sensing image (RSI) editing has not received sufficient\nattention. Deep learning based editing methods generally involve two sequential\nstages: generation and editing. During the generation stage, consistency in\ncontent and details between the original and edited images must be maintained,\nwhile in the editing stage, controllability and accuracy of the edits should be\nensured. For natural images, these challenges can be tackled by training\ngenerative backbones on large-scale benchmark datasets and using text guidance\nbased on vision-language models (VLMs). However, these previously effective\napproaches become less viable for RSIs due to two reasons: First, existing\ngenerative RSI benchmark datasets do not fully capture the diversity of remote\nsensing scenarios, particularly in terms of variations in sensors, object\ntypes, and resolutions. Consequently, the generalization capacity of the\ntrained backbone model is often inadequate for universal editing tasks on RSIs.\nSecond, the large spatial resolution of RSIs exacerbates the problem in VLMs\nwhere a single text semantic corresponds to multiple image semantics, leading\nto the introduction of incorrect semantics when using text to guide RSI\nediting. To solve above problems, this paper proposes a text-guided RSI editing\nmethod that is controllable but stable, and can be trained using only a single\nimage. It adopts a multi-scale training approach to preserve consistency\nwithout the need for training on extensive benchmark datasets, while leveraging\nRSI pre-trained VLMs and prompt ensembling (PE) to ensure accuracy and\ncontrollability in the text-guided editing process.\n","authors":["Fangzhou Han","Lingyu Si","Hongwei Dong","Lamei Zhang","Hao Chen","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2405.05769v2.pdf","comment":"14 pages, 14 figures, submitted to IEEE Transactions on Geoscience\n and Remote Sensing"},{"id":"http://arxiv.org/abs/2409.17533v1","updated":"2024-09-26T04:40:38Z","published":"2024-09-26T04:40:38Z","title":"CAMOT: Camera Angle-aware Multi-Object Tracking","summary":" This paper proposes CAMOT, a simple camera angle estimator for multi-object\ntracking to tackle two problems: 1) occlusion and 2) inaccurate distance\nestimation in the depth direction. Under the assumption that multiple objects\nare located on a flat plane in each video frame, CAMOT estimates the camera\nangle using object detection. In addition, it gives the depth of each object,\nenabling pseudo-3D MOT. We evaluated its performance by adding it to various 2D\nMOT methods on the MOT17 and MOT20 datasets and confirmed its effectiveness.\nApplying CAMOT to ByteTrack, we obtained 63.8% HOTA, 80.6% MOTA, and 78.5% IDF1\nin MOT17, which are state-of-the-art results. Its computational cost is\nsignificantly lower than the existing deep-learning-based depth estimators for\ntracking.\n","authors":["Felix Limanta","Kuniaki Uto","Koichi Shinoda"],"pdf_url":"https://arxiv.org/pdf/2409.17533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17531v1","updated":"2024-09-26T04:36:19Z","published":"2024-09-26T04:36:19Z","title":"SimVG: A Simple Framework for Visual Grounding with Decoupled\n Multi-modal Fusion","summary":" Visual grounding is a common vision task that involves grounding descriptive\nsentences to the corresponding regions of an image. Most existing methods use\nindependent image-text encoding and apply complex hand-crafted modules or\nencoder-decoder architectures for modal interaction and query reasoning.\nHowever, their performance significantly drops when dealing with complex\ntextual expressions. This is because the former paradigm only utilizes limited\ndownstream data to fit the multi-modal feature fusion. Therefore, it is only\neffective when the textual expressions are relatively simple. In contrast,\ngiven the wide diversity of textual expressions and the uniqueness of\ndownstream training data, the existing fusion module, which extracts multimodal\ncontent from a visual-linguistic context, has not been fully investigated. In\nthis paper, we present a simple yet robust transformer-based framework, SimVG,\nfor visual grounding. Specifically, we decouple visual-linguistic feature\nfusion from downstream tasks by leveraging existing multimodal pre-trained\nmodels and incorporating additional object tokens to facilitate deep\nintegration of downstream and pre-training tasks. Furthermore, we design a\ndynamic weight-balance distillation method in the multi-branch synchronous\nlearning process to enhance the representation capability of the simpler\nbranch. This branch only consists of a lightweight MLP, which simplifies the\nstructure and improves reasoning speed. Experiments on six widely used VG\ndatasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the\nsuperiority of SimVG. Finally, the proposed method not only achieves\nimprovements in efficiency and convergence speed but also attains new\nstate-of-the-art performance on these benchmarks. Codes and models will be\navailable at \\url{https://github.com/Dmmm1997/SimVG}.\n","authors":["Ming Dai","Lingfeng Yang","Yihao Xu","Zhenhua Feng","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2409.17531v1.pdf","comment":"21pages, 11figures, NeurIPS2024"},{"id":"http://arxiv.org/abs/2409.17526v1","updated":"2024-09-26T04:27:44Z","published":"2024-09-26T04:27:44Z","title":"Drone Stereo Vision for Radiata Pine Branch Detection and Distance\n Measurement: Integrating SGBM and Segmentation Models","summary":" Manual pruning of radiata pine trees presents significant safety risks due to\ntheir substantial height and the challenging terrains in which they thrive. To\naddress these risks, this research proposes the development of a drone-based\npruning system equipped with specialized pruning tools and a stereo vision\ncamera, enabling precise detection and trimming of branches. Deep learning\nalgorithms, including YOLO and Mask R-CNN, are employed to ensure accurate\nbranch detection, while the Semi-Global Matching algorithm is integrated to\nprovide reliable distance estimation. The synergy between these techniques\nfacilitates the precise identification of branch locations and enables\nefficient, targeted pruning. Experimental results demonstrate that the combined\nimplementation of YOLO and SGBM enables the drone to accurately detect branches\nand measure their distances from the drone. This research not only improves the\nsafety and efficiency of pruning operations but also makes a significant\ncontribution to the advancement of drone technology in the automation of\nagricultural and forestry practices, laying a foundational framework for\nfurther innovations in environmental management.\n","authors":["Yida Lin","Bing Xue","Mengjie Zhang","Sam Schofield","Richard Green"],"pdf_url":"https://arxiv.org/pdf/2409.17526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17524v1","updated":"2024-09-26T04:23:17Z","published":"2024-09-26T04:23:17Z","title":"JoyType: A Robust Design for Multilingual Visual Text Creation","summary":" Generating images with accurately represented text, especially in non-Latin\nlanguages, poses a significant challenge for diffusion models. Existing\napproaches, such as the integration of hint condition diagrams via auxiliary\nnetworks (e.g., ControlNet), have made strides towards addressing this issue.\nHowever, diffusion models often fall short in tasks requiring controlled text\ngeneration, such as specifying particular fonts or producing text in small\nfonts. In this paper, we introduce a novel approach for multilingual visual\ntext creation, named JoyType, designed to maintain the font style of text\nduring the image generation process. Our methodology begins with assembling a\ntraining dataset, JoyType-1M, comprising 1 million pairs of data. Each pair\nincludes an image, its description, and glyph instructions corresponding to the\nfont style within the image. We then developed a text control network, Font\nControlNet, tasked with extracting font style information to steer the image\ngeneration. To further enhance our model's ability to maintain font style,\nnotably in generating small-font text, we incorporated a multi-layer OCR-aware\nloss into the diffusion process. This enhancement allows JoyType to direct text\nrendering using low-level descriptors. Our evaluations, based on both visual\nand accuracy metrics, demonstrate that JoyType significantly outperforms\nexisting state-of-the-art methods. Additionally, JoyType can function as a\nplugin, facilitating the creation of varied image styles in conjunction with\nother stable diffusion models on HuggingFace and CivitAI. Our project is\nopen-sourced on https://jdh-algo.github.io/JoyType/.\n","authors":["Chao Li","Chen Jiang","Xiaolong Liu","Jun Zhao","Guoxin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17524v1.pdf","comment":"Under Review at AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17523v1","updated":"2024-09-26T04:17:27Z","published":"2024-09-26T04:17:27Z","title":"EAGLE: Egocentric AGgregated Language-video Engine","summary":" The rapid evolution of egocentric video analysis brings new insights into\nunderstanding human activities and intentions from a first-person perspective.\nDespite this progress, the fragmentation in tasks like action recognition,\nprocedure learning, and moment retrieval, \\etc, coupled with inconsistent\nannotations and isolated model development, hinders a holistic interpretation\nof video content. In response, we introduce the EAGLE (Egocentric AGgregated\nLanguage-video Engine) model and the EAGLE-400K dataset to provide a unified\nframework that integrates various egocentric video understanding tasks.\nEAGLE-400K, the \\textit{first} large-scale instruction-tuning dataset tailored\nfor egocentric video, features 400K diverse samples to enhance a broad spectrum\nof tasks from activity recognition to procedure knowledge learning. Moreover,\nEAGLE, a strong video multimodal large language model (MLLM), is designed to\neffectively capture both spatial and temporal information. In addition, we\npropose a set of evaluation metrics designed to facilitate a thorough\nassessment of MLLM for egocentric video understanding. Our extensive\nexperiments demonstrate EAGLE's superior performance over existing models,\nhighlighting its ability to balance task-specific understanding with holistic\nvideo interpretation. With EAGLE, we aim to pave the way for research\nopportunities and practical applications in real-world scenarios.\n","authors":["Jing Bi","Yunlong Tang","Luchuan Song","Ali Vosoughi","Nguyen Nguyen","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17523v1.pdf","comment":"Accepted by ACMMM 24"},{"id":"http://arxiv.org/abs/2409.17519v1","updated":"2024-09-26T04:02:20Z","published":"2024-09-26T04:02:20Z","title":"Robotic Environmental State Recognition with Pre-Trained Vision-Language\n Models and Black-Box Optimization","summary":" In order for robots to autonomously navigate and operate in diverse\nenvironments, it is essential for them to recognize the state of their\nenvironment. On the other hand, the environmental state recognition has\ntraditionally involved distinct methods tailored to each state to be\nrecognized. In this study, we perform a unified environmental state recognition\nfor robots through the spoken language with pre-trained large-scale\nvision-language models. We apply Visual Question Answering and Image-to-Text\nRetrieval, which are tasks of Vision-Language Models. We show that with our\nmethod, it is possible to recognize not only whether a room door is\nopen/closed, but also whether a transparent door is open/closed and whether\nwater is running in a sink, without training neural networks or manual\nprogramming. In addition, the recognition accuracy can be improved by selecting\nappropriate texts from the set of prepared texts based on black-box\noptimization. For each state recognition, only the text set and its weighting\nneed to be changed, eliminating the need to prepare multiple different models\nand programs, and facilitating the management of source code and computer\nresource. We experimentally demonstrate the effectiveness of our method and\napply it to the recognition behavior on a mobile robot, Fetch.\n","authors":["Kento Kawaharazuka","Yoshiki Obinata","Naoaki Kanazawa","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2409.17519v1.pdf","comment":"Accepted at Advanced Robotics, website -\n https://haraduka.github.io/vlm-bbo/"},{"id":"http://arxiv.org/abs/2404.05705v2","updated":"2024-09-26T03:58:11Z","published":"2024-04-08T17:42:08Z","title":"Learning 3D-Aware GANs from Unposed Images with Template Feature Field","summary":" Collecting accurate camera poses of training images has been shown to well\nserve the learning of 3D-aware generative adversarial networks (GANs) yet can\nbe quite expensive in practice. This work targets learning 3D-aware GANs from\nunposed images, for which we propose to perform on-the-fly pose estimation of\ntraining images with a learned template feature field (TeFF). Concretely, in\naddition to a generative radiance field as in previous approaches, we ask the\ngenerator to also learn a field from 2D semantic features while sharing the\ndensity from the radiance field. Such a framework allows us to acquire a\ncanonical 3D feature template leveraging the dataset mean discovered by the\ngenerative model, and further efficiently estimate the pose parameters on real\ndata. Experimental results on various challenging datasets demonstrate the\nsuperiority of our approach over state-of-the-art alternatives from both the\nqualitative and the quantitative perspectives.\n","authors":["Xinya Chen","Hanlei Guo","Yanrui Bin","Shangzhan Zhang","Yuanbo Yang","Yue Wang","Yujun Shen","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2404.05705v2.pdf","comment":"https://XDimlab.github.io/TeFF"},{"id":"http://arxiv.org/abs/2409.17512v1","updated":"2024-09-26T03:47:34Z","published":"2024-09-26T03:47:34Z","title":"SCOMatch: Alleviating Overtrusting in Open-set Semi-supervised Learning","summary":" Open-set semi-supervised learning (OSSL) leverages practical open-set\nunlabeled data, comprising both in-distribution (ID) samples from seen classes\nand out-of-distribution (OOD) samples from unseen classes, for semi-supervised\nlearning (SSL). Prior OSSL methods initially learned the decision boundary\nbetween ID and OOD with labeled ID data, subsequently employing self-training\nto refine this boundary. These methods, however, suffer from the tendency to\novertrust the labeled ID data: the scarcity of labeled data caused the\ndistribution bias between the labeled samples and the entire ID data, which\nmisleads the decision boundary to overfit. The subsequent self-training\nprocess, based on the overfitted result, fails to rectify this problem. In this\npaper, we address the overtrusting issue by treating OOD samples as an\nadditional class, forming a new SSL process.\n Specifically, we propose SCOMatch, a novel OSSL method that 1) selects\nreliable OOD samples as new labeled data with an OOD memory queue and a\ncorresponding update strategy and 2) integrates the new SSL process into the\noriginal task through our Simultaneous Close-set and Open-set self-training.\nSCOMatch refines the decision boundary of ID and OOD classes across the entire\ndataset, thereby leading to improved results. Extensive experimental results\nshow that SCOMatch significantly outperforms the state-of-the-art methods on\nvarious benchmarks. The effectiveness is further verified through ablation\nstudies and visualization.\n","authors":["Zerun Wang","Liuyu Xiang","Lang Huang","Jiafeng Mao","Ling Xiao","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2409.17512v1.pdf","comment":"ECCV 2024 accepted"},{"id":"http://arxiv.org/abs/2409.17510v1","updated":"2024-09-26T03:40:12Z","published":"2024-09-26T03:40:12Z","title":"NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human\n Connectomes","summary":" Although modern imaging technologies allow us to study connectivity between\ntwo distinct brain regions in-vivo, an in-depth understanding of how anatomical\nstructure supports brain function and how spontaneous functional fluctuations\nemerge remarkable cognition is still elusive. Meanwhile, tremendous efforts\nhave been made in the realm of machine learning to establish the nonlinear\nmapping between neuroimaging data and phenotypic traits. However, the absence\nof neuroscience insight in the current approaches poses significant challenges\nin understanding cognitive behavior from transient neural activities. To\naddress this challenge, we put the spotlight on the coupling mechanism of\nstructural connectivity (SC) and functional connectivity (FC) by formulating\nsuch network neuroscience question into an expressive graph representation\nlearning problem for high-order topology. Specifically, we introduce the\nconcept of topological detour to characterize how a ubiquitous instance of FC\n(direct link) is supported by neural pathways (detour) physically wired by SC,\nwhich forms a cyclic loop interacted by brain structure and function. In the\nclich\\'e of machine learning, the multi-hop detour pathway underlying SC-FC\ncoupling allows us to devise a novel multi-head self-attention mechanism within\nTransformer to capture multi-modal feature representation from paired graphs of\nSC and FC. Taken together, we propose a biological-inspired deep model, coined\nas NeuroPath, to find putative connectomic feature representations from the\nunprecedented amount of neuroimages, which can be plugged into various\ndownstream applications such as task recognition and disease diagnosis. We have\nevaluated NeuroPath on large-scale public datasets including HCP and UK Biobank\nunder supervised and zero-shot learning, where the state-of-the-art performance\nby our NeuroPath indicates great potential in network neuroscience.\n","authors":["Ziquan Wei","Tingting Dan","Jiaqi Ding","Paul J Laurienti","Guorong Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17510v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.00591v4","updated":"2024-09-26T03:34:23Z","published":"2024-08-01T14:20:47Z","title":"Regional quality estimation for echocardiography using deep learning","summary":" Automatic estimation of cardiac ultrasound image quality can be beneficial\nfor guiding operators and ensuring the accuracy of clinical measurements.\nPrevious work often fails to distinguish the view correctness of the\nechocardiogram from the image quality. Additionally, previous studies only\nprovide a global image quality value, which limits their practical utility. In\nthis work, we developed and compared three methods to estimate image quality:\n1) classic pixel-based metrics like the generalized contrast-to-noise ratio\n(gCNR) on myocardial segments as region of interest and left ventricle lumen as\nbackground, obtained using a U-Net segmentation 2) local image coherence\nderived from a U-Net model that predicts coherence from B-Mode images 3) a deep\nconvolutional network that predicts the quality of each region directly in an\nend-to-end fashion. We evaluate each method against manual regional image\nquality annotations by three experienced cardiologists. The results indicate\npoor performance of the gCNR metric, with Spearman correlation to the\nannotations of rho = 0.24. The end-to-end learning model obtains the best\nresult, rho = 0.69, comparable to the inter-observer correlation, rho = 0.63.\nFinally, the coherence-based method, with rho = 0.58, outperformed the\nclassical metrics and is more generic than the end-to-end approach. The image\nquality prediction tool is available as an open source Python library at\nhttps://github.com/GillesVanDeVyver/arqee.\n","authors":["Gilles Van De Vyver","Svein-Erik Måsøy","Håvard Dalen","Bjørnar Leangen Grenne","Espen Holte","Sindre Hellum Olaisen","John Nyberg","Andreas Østvik","Lasse Løvstakken","Erik Smistad"],"pdf_url":"https://arxiv.org/pdf/2408.00591v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17508v1","updated":"2024-09-26T03:33:26Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization, recent advances primarily focus on improving the LLM\ncomponents, while neglecting the connector that bridges the gap between\nmodalities. In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector. Extensive ablation experiments\nvalidate the effectiveness of introducing CMoE under any configuration, with up\nto an average 8% performance gains. We further provide interpretation analysis\nof the tug-of-war problem from the perspective of gradient optimization and\nparameter statistics. Compared to previous state-of-the-art medical MLLMs,\nUni-Med achieves competitive or superior evaluation metrics on diverse tasks.\nCode, data and model will be soon available at GitHub.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21341v2","updated":"2024-09-26T03:31:28Z","published":"2024-07-31T05:15:24Z","title":"High-throughput 3D shape completion of potato tubers on a harvester","summary":" Potato yield is an important metric for farmers to further optimize their\ncultivation practices. Potato yield can be estimated on a harvester using an\nRGB-D camera that can estimate the three-dimensional (3D) volume of individual\npotato tubers. A challenge, however, is that the 3D shape derived from RGB-D\nimages is only partially completed, underestimating the actual volume. To\naddress this issue, we developed a 3D shape completion network, called CoRe++,\nwhich can complete the 3D shape from RGB-D images. CoRe++ is a deep learning\nnetwork that consists of a convolutional encoder and a decoder. The encoder\ncompresses RGB-D images into latent vectors that are used by the decoder to\ncomplete the 3D shape using the deep signed distance field network (DeepSDF).\nTo evaluate our CoRe++ network, we collected partial and complete 3D point\nclouds of 339 potato tubers on an operational harvester in Japan. On the 1425\nRGB-D images in the test set (representing 51 unique potato tubers), our\nnetwork achieved a completion accuracy of 2.8 mm on average. For volumetric\nestimation, the root mean squared error (RMSE) was 22.6 ml, and this was better\nthan the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml).\nWe found that the RMSE can be further reduced to 18.2 ml when performing the 3D\nshape completion in the center of the RGB-D image. With an average 3D shape\ncompletion time of 10 milliseconds per tuber, we can conclude that CoRe++ is\nboth fast and accurate enough to be implemented on an operational harvester for\nhigh-throughput potato yield estimation. Our method can also be applied to\nother tuber, fruit and vegetable crops, thereby enabling versatile, accurate\nand real-time yield monitoring in precision agriculture. Our code, network\nweights and dataset are publicly available at\nhttps://github.com/UTokyo-FieldPhenomics-Lab/corepp.git.\n","authors":["Pieter M. Blok","Federico Magistri","Cyrill Stachniss","Haozhou Wang","James Burridge","Wei Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21341v2.pdf","comment":"20 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.17503v1","updated":"2024-09-26T03:21:21Z","published":"2024-09-26T03:21:21Z","title":"Shape-intensity knowledge distillation for robust medical image\n segmentation","summary":" Many medical image segmentation methods have achieved impressive results.\nYet, most existing methods do not take into account the shape-intensity prior\ninformation. This may lead to implausible segmentation results, in particular\nfor images of unseen datasets. In this paper, we propose a novel approach to\nincorporate joint shape-intensity prior information into the segmentation\nnetwork. Specifically, we first train a segmentation network (regarded as the\nteacher network) on class-wise averaged training images to extract valuable\nshape-intensity information, which is then transferred to a student\nsegmentation network with the same network architecture as the teacher via\nknowledge distillation. In this way, the student network regarded as the final\nsegmentation model can effectively integrate the shape-intensity prior\ninformation, yielding more accurate segmentation results. Despite its\nsimplicity, experiments on five medical image segmentation tasks of different\nmodalities demonstrate that the proposed Shape-Intensity Knowledge Distillation\n(SIKD) consistently improves several baseline models (including recent MaxStyle\nand SAMed) under intra-dataset evaluation, and significantly improves the\ncross-dataset generalization ability. The code is available at\nhttps://github.com/whdong-whu/SIKD.\n","authors":["Wenhui Dong","Bo Du","Yongchao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.17503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2401.04585v2","updated":"2024-09-26T02:53:15Z","published":"2024-01-09T14:42:49Z","title":"EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization\n of Diffusion Models","summary":" Diffusion models have achieved great success in image generation tasks\nthrough iterative noise estimation. However, the heavy denoising process and\ncomplex neural networks hinder their low-latency applications in real-world\nscenarios. Quantization can effectively reduce model complexity, and\npost-training quantization (PTQ), which does not require fine-tuning, is highly\npromising for compressing and accelerating diffusion models. Unfortunately, we\nfind that due to the highly dynamic distribution of activations in different\ndenoising steps, existing PTQ methods for diffusion models suffer from\ndistribution mismatch issues at both calibration sample level and\nreconstruction output level, which makes the performance far from satisfactory,\nespecially in low-bit cases. In this paper, we propose Enhanced Distribution\nAlignment for Post-Training Quantization of Diffusion Models (EDA-DM) to\naddress the above issues. Specifically, at the calibration sample level, we\nselect calibration samples based on the density and variety in the latent\nspace, thus facilitating the alignment of their distribution with the overall\nsamples; and at the reconstruction output level, we modify the loss of block\nreconstruction with the losses of layers, aligning the outputs of quantized\nmodel and full-precision model at different network granularity. Extensive\nexperiments demonstrate that EDA-DM significantly outperforms the existing PTQ\nmethods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and\ndifferent datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO).\n","authors":["Xuewen Liu","Zhikai Li","Junrui Xiao","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2401.04585v2.pdf","comment":"Code: http://github.com/BienLuky/EDA-DM"},{"id":"http://arxiv.org/abs/2409.17487v1","updated":"2024-09-26T02:49:51Z","published":"2024-09-26T02:49:51Z","title":"Learning Quantized Adaptive Conditions for Diffusion Models","summary":" The curvature of ODE trajectories in diffusion models hinders their ability\nto generate high-quality images in a few number of function evaluations (NFE).\nIn this paper, we propose a novel and effective approach to reduce trajectory\ncurvature by utilizing adaptive conditions. By employing a extremely\nlight-weight quantized encoder, our method incurs only an additional 1% of\ntraining parameters, eliminates the need for extra regularization terms, yet\nachieves significantly better sample quality. Our approach accelerates ODE\nsampling while preserving the downstream task image editing capabilities of SDE\ntechniques. Extensive experiments verify that our method can generate high\nquality results under extremely limited sampling costs. With only 6 NFE, we\nachieve 5.14 FID on CIFAR-10, 6.91 FID on FFHQ 64x64 and 3.10 FID on AFHQv2.\n","authors":["Yuchen Liang","Yuchuan Tian","Lei Yu","Huao Tang","Jie Hu","Xiangzhong Fang","Hanting Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17486v1","updated":"2024-09-26T02:48:15Z","published":"2024-09-26T02:48:15Z","title":"Global-Local Medical SAM Adaptor Based on Full Adaption","summary":" Emerging of visual language models, such as the segment anything model (SAM),\nhave made great breakthroughs in the field of universal semantic segmentation\nand significantly aid the improvements of medical image segmentation, in\nparticular with the help of Medical SAM adaptor (Med-SA). However, Med-SA still\ncan be improved, as it fine-tunes SAM in a partial adaption manner. To resolve\nthis problem, we present a novel global medical SAM adaptor (GMed-SA) with full\nadaption, which can adapt SAM globally. We further combine GMed-SA and Med-SA\nto propose a global-local medical SAM adaptor (GLMed-SA) to adapt SAM both\nglobally and locally. Extensive experiments have been performed on the\nchallenging public 2D melanoma segmentation dataset. The results show that\nGLMed-SA outperforms several state-of-the-art semantic segmentation methods on\nvarious evaluation metrics, demonstrating the superiority of our methods.\n","authors":["Meng Wang","Yarong Feng","Yongwei Tang","Tian Zhang","Yuxin Liang","Chao Lv"],"pdf_url":"https://arxiv.org/pdf/2409.17486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17485v1","updated":"2024-09-26T02:47:41Z","published":"2024-09-26T02:47:41Z","title":"Revisiting Deep Ensemble Uncertainty for Enhanced Medical Anomaly\n Detection","summary":" Medical anomaly detection (AD) is crucial in pathological identification and\nlocalization. Current methods typically rely on uncertainty estimation in deep\nensembles to detect anomalies, assuming that ensemble learners should agree on\nnormal samples while exhibiting disagreement on unseen anomalies in the output\nspace. However, these methods may suffer from inadequate disagreement on\nanomalies or diminished agreement on normal samples. To tackle these issues, we\npropose D2UE, a Diversified Dual-space Uncertainty Estimation framework for\nmedical anomaly detection. To effectively balance agreement and disagreement\nfor anomaly detection, we propose Redundancy-Aware Repulsion (RAR), which uses\na similarity kernel that remains invariant to both isotropic scaling and\northogonal transformations, explicitly promoting diversity in learners' feature\nspace. Moreover, to accentuate anomalous regions, we develop Dual-Space\nUncertainty (DSU), which utilizes the ensemble's uncertainty in input and\noutput spaces. In input space, we first calculate gradients of reconstruction\nerror with respect to input images. The gradients are then integrated with\nreconstruction outputs to estimate uncertainty for inputs, enabling effective\nanomaly discrimination even when output space disagreement is minimal. We\nconduct a comprehensive evaluation of five medical benchmarks with different\nbackbones. Experimental results demonstrate the superiority of our method to\nstate-of-the-art methods and the effectiveness of each component in our\nframework. Our code is available at https://github.com/Rubiscol/D2UE.\n","authors":["Yi Gu","Yi Lin","Kwang-Ting Cheng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17485v1.pdf","comment":"Early accepted by MICCAI2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.18110v1","updated":"2024-09-26T17:52:57Z","published":"2024-09-26T17:52:57Z","title":"Open-World Evaluation for Retrieving Diverse Perspectives","summary":" We study retrieving a set of documents that covers various perspectives on a\ncomplex and contentious question (e.g., will ChatGPT do more harm than good?).\nWe curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS),\nwhere each example consists of a question and diverse perspectives associated\nwith the question, sourced from survey questions and debate websites. On this\ndata, retrievers paired with a corpus are evaluated to surface a document set\nthat contains diverse perspectives. Our framing diverges from most retrieval\ntasks in that document relevancy cannot be decided by simple string matches to\nreferences. Instead, we build a language model based automatic evaluator that\ndecides whether each retrieved document contains a perspective. This allows us\nto evaluate the performance of three different types of corpus (Wikipedia, web\nsnapshot, and corpus constructed on the fly with retrieved pages from the\nsearch engine) paired with retrievers. Retrieving diverse documents remains\nchallenging, with the outputs from existing retrievers covering all\nperspectives on only 33.74% of the examples. We further study the impact of\nquery expansion and diversity-focused reranking approaches and analyze\nretriever sycophancy. Together, we lay the foundation for future studies in\nretrieval diversity handling complex queries.\n","authors":["Hung-Ting Chen","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2409.18110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2409.18024v1","updated":"2024-09-26T16:32:10Z","published":"2024-09-26T16:32:10Z","title":"Report on the Workshop on Simulations for Information Access (Sim4IA\n 2024) at SIGIR 2024","summary":" This paper is a report of the Workshop on Simulations for Information Access\n(Sim4IA) workshop at SIGIR 2024. The workshop had two keynotes, a panel\ndiscussion, nine lightning talks, and two breakout sessions. Key takeaways were\nuser simulation's importance in academia and industry, the possible bridging of\nonline and offline evaluation, and the issues of organizing a companion shared\ntask around user simulations for information access. We report on how we\norganized the workshop, provide a brief overview of what happened at the\nworkshop, and summarize the main topics and findings of the workshop and future\nwork.\n","authors":["Timo Breuer","Christin Katharina Kreutz","Norbert Fuhr","Krisztian Balog","Philipp Schaer","Nolwenn Bernard","Ingo Frommholz","Marcel Gohsen","Kaixin Ji","Gareth J. F. Jones","Jüri Keller","Jiqun Liu","Martin Mladenov","Gabriella Pasi","Johanne Trippas","Xi Wang","Saber Zerhoudi","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.18024v1.pdf","comment":"Preprint of a SIGIR Forum submission for Vol. 58 No. 2 - December\n 2024"},{"id":"http://arxiv.org/abs/2409.18003v1","updated":"2024-09-26T16:12:33Z","published":"2024-09-26T16:12:33Z","title":"Enhancing Tourism Recommender Systems for Sustainable City Trips Using\n Retrieval-Augmented Generation","summary":" Tourism Recommender Systems (TRS) have traditionally focused on providing\npersonalized travel suggestions, often prioritizing user preferences without\nconsidering broader sustainability goals. Integrating sustainability into TRS\nhas become essential with the increasing need to balance environmental impact,\nlocal community interests, and visitor satisfaction. This paper proposes a\nnovel approach to enhancing TRS for sustainable city trips using Large Language\nModels (LLMs) and a modified Retrieval-Augmented Generation (RAG) pipeline. We\nenhance the traditional RAG system by incorporating a sustainability metric\nbased on a city's popularity and seasonal demand during the prompt augmentation\nphase. This modification, called Sustainability Augmented Reranking (SAR),\nensures the system's recommendations align with sustainability goals.\nEvaluations using popular open-source LLMs, such as Llama-3.1-Instruct-8B and\nMistral-Instruct-7B, demonstrate that the SAR-enhanced approach consistently\nmatches or outperforms the baseline (without SAR) across most metrics,\nhighlighting the benefits of incorporating sustainability into TRS.\n","authors":["Ashmi Banerjee","Adithi Satish","Wolfgang Wörndl"],"pdf_url":"https://arxiv.org/pdf/2409.18003v1.pdf","comment":"Accepted at the RecSoGood 2024 Workshop co-located with the 18th ACM\n Conference on Recommender Systems (RecSys 2024)"},{"id":"http://arxiv.org/abs/2409.13740v2","updated":"2024-09-26T15:27:08Z","published":"2024-09-10T16:37:58Z","title":"Language agents achieve superhuman synthesis of scientific knowledge","summary":" Language models are known to hallucinate incorrect information, and it is\nunclear if they are sufficiently accurate and reliable for use in scientific\nresearch. We developed a rigorous human-AI comparison methodology to evaluate\nlanguage model agents on real-world literature search tasks covering\ninformation retrieval, summarization, and contradiction detection tasks. We\nshow that PaperQA2, a frontier language model agent optimized for improved\nfactuality, matches or exceeds subject matter expert performance on three\nrealistic literature research tasks without any restrictions on humans (i.e.,\nfull access to internet, search tools, and time). PaperQA2 writes cited,\nWikipedia-style summaries of scientific topics that are significantly more\naccurate than existing, human-written Wikipedia articles. We also introduce a\nhard benchmark for scientific literature research called LitQA2 that guided\ndesign of PaperQA2, leading to it exceeding human performance. Finally, we\napply PaperQA2 to identify contradictions within the scientific literature, an\nimportant scientific task that is challenging for humans. PaperQA2 identifies\n2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of\nwhich 70% are validated by human experts. These results demonstrate that\nlanguage model agents are now capable of exceeding domain experts across\nmeaningful tasks on scientific literature.\n","authors":["Michael D. Skarlinski","Sam Cox","Jon M. Laurent","James D. Braza","Michaela Hinks","Michael J. Hammerling","Manvitha Ponnapati","Samuel G. Rodriques","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2409.13740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09825v2","updated":"2024-09-26T15:21:31Z","published":"2024-06-14T08:29:34Z","title":"Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of\n Anomalous Behavior in Bio-regenerative Life Support System Telemetry","summary":" The detection of abnormal or critical system states is essential in condition\nmonitoring. While much attention is given to promptly identifying anomalies, a\nretrospective analysis of these anomalies can significantly enhance our\ncomprehension of the underlying causes of observed undesired behavior. This\naspect becomes particularly critical when the monitored system is deployed in a\nvital environment. In this study, we delve into anomalies within the domain of\nBio-Regenerative Life Support Systems (BLSS) for space exploration and analyze\nanomalies found in telemetry data stemming from the EDEN ISS space greenhouse\nin Antarctica. We employ time series clustering on anomaly detection results to\ncategorize various types of anomalies in both uni- and multivariate settings.\nWe then assess the effectiveness of these methods in identifying systematic\nanomalous behavior. Additionally, we illustrate that the anomaly detection\nmethods MDI and DAMP produce complementary results, as previously indicated by\nresearch.\n","authors":["Ferdinand Rewicki","Jakob Gawlikowski","Julia Niebling","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2406.09825v2.pdf","comment":"12 pages, + Supplemental Materials, Published at Machine Learning and\n Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17769v1","updated":"2024-09-26T12:07:46Z","published":"2024-09-26T12:07:46Z","title":"Value Identification in Multistakeholder Recommender Systems for\n Humanities and Historical Research: The Case of the Digital Archive\n Monasterium.net","summary":" Recommender systems remain underutilized in humanities and historical\nresearch, despite their potential to enhance the discovery of cultural records.\nThis paper offers an initial value identification of the multiple stakeholders\nthat might be impacted by recommendations in Monasterium.net, a digital archive\nfor historical legal documents. Specifically, we discuss the diverse values and\nobjectives of its stakeholders, such as editors, aggregators, platform owners,\nresearchers, publishers, and funding agencies. These in-depth insights into the\npotentially conflicting values of stakeholder groups allow designing and\nadapting recommender systems to enhance their usefulness for humanities and\nhistorical research. Additionally, our findings will support deeper engagement\nwith additional stakeholders to refine value models and evaluation metrics for\nrecommender systems in the given domains. Our conclusions are embedded in and\napplicable to other digital archives and a broader cultural heritage context.\n","authors":["Florian Atzenhofer-Baumgartner","Bernhard C. Geiger","Georg Vogeler","Dominik Kowald"],"pdf_url":"https://arxiv.org/pdf/2409.17769v1.pdf","comment":"To be presented at: NORMalize 2024: The Second Workshop on the\n Normative Design and Evaluation of Recommender Systems, October 18, 2024,\n co-located with the ACM Conference on Recommender Systems 2024 (RecSys 2024),\n Bari, Italy"},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17730v1","updated":"2024-09-26T11:00:19Z","published":"2024-09-26T11:00:19Z","title":"Autoregressive Generation Strategies for Top-K Sequential\n Recommendations","summary":" The goal of modern sequential recommender systems is often formulated in\nterms of next-item prediction. In this paper, we explore the applicability of\ngenerative transformer-based models for the Top-K sequential recommendation\ntask, where the goal is to predict items a user is likely to interact with in\nthe \"near future\".\n We explore commonly used autoregressive generation strategies, including\ngreedy decoding, beam search, and temperature sampling, to evaluate their\nperformance for the Top-K sequential recommendation task. In addition, we\npropose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA)\ngeneration strategies based on multi-sequence generation with temperature\nsampling and subsequent aggregation.\n Experiments on diverse datasets give valuable insights regarding commonly\nused strategies' applicability and show that suggested approaches improve\nperformance on longer time horizons compared to widely-used Top-K prediction\napproach and single-sequence autoregressive generation strategies.\n","authors":["Anna Volodkevich","Danil Gusak","Anton Klenitskiy","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2409.17730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17711v1","updated":"2024-09-26T10:27:19Z","published":"2024-09-26T10:27:19Z","title":"Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation","summary":" News recommendation is a challenging task that involves personalization based\non the interaction history and preferences of each user. Recent works have\nleveraged the power of pretrained language models (PLMs) to directly rank news\nitems by using inference approaches that predominately fall into three\ncategories: pointwise, pairwise, and listwise learning-to-rank. While pointwise\nmethods offer linear inference complexity, they fail to capture crucial\ncomparative information between items that is more effective for ranking tasks.\nConversely, pairwise and listwise approaches excel at incorporating these\ncomparisons but suffer from practical limitations: pairwise approaches are\neither computationally expensive or lack theoretical guarantees, and listwise\nmethods often perform poorly in practice. In this paper, we propose a novel\nframework for PLM-based news recommendation that integrates both pointwise\nrelevance prediction and pairwise comparisons in a scalable manner. We present\na rigorous theoretical analysis of our framework, establishing conditions under\nwhich our approach guarantees improved performance. Extensive experiments show\nthat our approach outperforms the state-of-the-art methods on the MIND and\nAdressa news recommendation datasets.\n","authors":["Nithish Kannen","Yao Ma","Gerrit J. J. van den Burg","Jean Baptiste Faddoul"],"pdf_url":"https://arxiv.org/pdf/2409.17711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15953v2","updated":"2024-09-26T10:22:34Z","published":"2024-08-28T17:12:01Z","title":"Modeling and Analyzing the Influence of Non-Item Pages on Sequential\n Next-Item Prediction","summary":" Analyzing sequences of interactions between users and items, sequential\nrecommendation models can learn user intent and make predictions about the next\nitem. Next to item interactions, most systems also have interactions with what\nwe call non-item pages: these pages are not related to specific items but still\ncan provide insights of the user's interests, as, for example, navigation\npages.\n We therefore propose a general way to include these non-item pages in\nsequential recommendation models to enhance next-item prediction. First, we\ndemonstrate the influence of non-item pages on following interactions with the\nhypotheses testing framework HypTrails and propose methods for representing\nnon-item pages in sequential recommendation models. Subsequently, we adapt\npopular sequential recommender models to integrate non-item pages and\ninvestigate their performance with different item representation strategies as\nwell as their ability to handle noisy data. To show the general capabilities of\nthe models to integrate non-item pages, we create a synthetic dataset for a\ncontrolled setting and then evaluate the improvements from including non-item\npages on two real-world datasets.\n Our results show that non-item pages are a valuable source of information,\nand incorporating them in sequential recommendation models increases the\nperformance of next-item prediction across all analyzed model architectures.\n","authors":["Elisabeth Fischer","Albin Zehe","Andreas Hotho","Daniel Schlör"],"pdf_url":"https://arxiv.org/pdf/2408.15953v2.pdf","comment":"37 pages, 19 figures; Submitted to ACM TORS"},{"id":"http://arxiv.org/abs/2409.17580v1","updated":"2024-09-26T06:53:29Z","published":"2024-09-26T06:53:29Z","title":"Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case\n Study","summary":" Extracting meaningful insights from large and complex datasets poses\nsignificant challenges, particularly in ensuring the accuracy and relevance of\nretrieved information. Traditional data retrieval methods such as sequential\nsearch and index-based retrieval often fail when handling intricate and\ninterconnected data structures, resulting in incomplete or misleading outputs.\nTo overcome these limitations, we introduce Structured-GraphRAG, a versatile\nframework designed to enhance information retrieval across structured datasets\nin natural language queries. Structured-GraphRAG utilizes multiple knowledge\ngraphs, which represent data in a structured format and capture complex\nrelationships between entities, enabling a more nuanced and comprehensive\nretrieval of information. This graph-based approach reduces the risk of errors\nin language model outputs by grounding responses in a structured format,\nthereby enhancing the reliability of results. We demonstrate the effectiveness\nof Structured-GraphRAG by comparing its performance with that of a recently\npublished method using traditional retrieval-augmented generation. Our findings\nshow that Structured-GraphRAG significantly improves query processing\nefficiency and reduces response times. While our case study focuses on soccer\ndata, the framework's design is broadly applicable, offering a powerful tool\nfor data analysis and enhancing language model applications across various\nstructured domains.\n","authors":["Zahra Sepasdar","Sushant Gautam","Cise Midoglu","Michael A. Riegler","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2409.17580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05013v2","updated":"2024-09-26T06:19:34Z","published":"2024-06-07T15:23:53Z","title":"CHIQ: Contextual History Enhancement for Improving Query Rewriting in\n Conversational Search","summary":" In this paper, we study how open-source large language models (LLMs) can be\neffectively deployed for improving query rewriting in conversational search,\nespecially for ambiguous queries. We introduce CHIQ, a two-step method that\nleverages the capabilities of LLMs to resolve ambiguities in the conversation\nhistory before query rewriting. This approach contrasts with prior studies that\npredominantly use closed-source LLMs to directly generate search queries from\nconversation history. We demonstrate on five well-established benchmarks that\nCHIQ leads to state-of-the-art results across most settings, showing highly\ncompetitive performances with systems leveraging closed-source LLMs. Our study\nprovides a first step towards leveraging open-source LLMs in conversational\nsearch, as a competitive alternative to the prevailing reliance on commercial\nLLMs. Data, models, and source code will be publicly available upon acceptance\nat https://github.com/fengranMark/CHIQ.\n","authors":["Fengran Mo","Abbas Ghaddar","Kelong Mao","Mehdi Rezagholizadeh","Boxing Chen","Qun Liu","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2406.05013v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.15763v2","updated":"2024-09-26T05:43:08Z","published":"2024-09-24T05:39:53Z","title":"IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through\n Semantic Comprehension in Retrieval-Augmented Generation Scenarios","summary":" In Retrieval-Augmented Generation (RAG) tasks using Large Language Models\n(LLMs), the quality of retrieved information is critical to the final output.\nThis paper introduces the IRSC benchmark for evaluating the performance of\nembedding models in multilingual RAG tasks. The benchmark encompasses five\nretrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval,\nkeyword retrieval, and summary retrieval. Our research addresses the current\nlack of comprehensive testing and effective comparison methods for embedding\nmodels in RAG scenarios. We introduced new metrics: the Similarity of Semantic\nComprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI),\nand evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our\ncontributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and\n3) insights into the cross-lingual limitations of embedding models. The IRSC\nbenchmark aims to enhance the understanding and development of accurate\nretrieval systems in RAG tasks. All code and datasets are available at:\nhttps://github.com/Jasaxion/IRSC_Benchmark\n","authors":["Hai Lin","Shaoxiong Zhan","Junyou Su","Haitao Zheng","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10743v4","updated":"2024-09-26T03:38:59Z","published":"2023-12-17T15:28:06Z","title":"A Unified Framework for Multi-Domain CTR Prediction via Large Language\n Models","summary":" Click-Through Rate (CTR) prediction is a crucial task in online\nrecommendation platforms as it involves estimating the probability of user\nengagement with advertisements or items by clicking on them. Given the\navailability of various services like online shopping, ride-sharing, food\ndelivery, and professional services on commercial platforms, recommendation\nsystems in these platforms are required to make CTR predictions across multiple\ndomains rather than just a single domain. However, multi-domain click-through\nrate (MDCTR) prediction remains a challenging task in online recommendation due\nto the complex mutual influence between domains. Traditional MDCTR models\ntypically encode domains as discrete identifiers, ignoring rich semantic\ninformation underlying. Consequently, they can hardly generalize to new\ndomains. Besides, existing models can be easily dominated by some specific\ndomains, which results in significant performance drops in the other domains\n(i.e. the \"seesaw phenomenon\"). In this paper, we propose a novel solution\nUni-CTR to address the above challenges. Uni-CTR leverages a backbone Large\nLanguage Model (LLM) to learn layer-wise semantic representations that capture\ncommonalities between domains. Uni-CTR also uses several domain-specific\nnetworks to capture the characteristics of each domain. Note that we design a\nmasked loss strategy so that these domain-specific networks are decoupled from\nbackbone LLM. This allows domain-specific networks to remain unchanged when\nincorporating new or removing domains, thereby enhancing the flexibility and\nscalability of the system significantly. Experimental results on three public\ndatasets show that Uni-CTR outperforms the state-of-the-art (SOTA) MDCTR models\nsignificantly. Furthermore, Uni-CTR demonstrates remarkable effectiveness in\nzero-shot prediction. We have applied Uni-CTR in industrial scenarios,\nconfirming its efficiency.\n","authors":["Zichuan Fu","Xiangyang Li","Chuhan Wu","Yichao Wang","Kuicai Dong","Xiangyu Zhao","Mengchen Zhao","Huifeng Guo","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2312.10743v4.pdf","comment":"Accept By ACM TRANSACTIONS ON INFORMATION SYSTEMS(TOIS)"},{"id":"http://arxiv.org/abs/2409.17476v1","updated":"2024-09-26T02:24:03Z","published":"2024-09-26T02:24:03Z","title":"Improving the Shortest Plank: Vulnerability-Aware Adversarial Training\n for Robust Recommender System","summary":" Recommender systems play a pivotal role in mitigating information overload in\nvarious fields. Nonetheless, the inherent openness of these systems introduces\nvulnerabilities, allowing attackers to insert fake users into the system's\ntraining data to skew the exposure of certain items, known as poisoning\nattacks. Adversarial training has emerged as a notable defense mechanism\nagainst such poisoning attacks within recommender systems. Existing adversarial\ntraining methods apply perturbations of the same magnitude across all users to\nenhance system robustness against attacks. Yet, in reality, we find that\nattacks often affect only a subset of users who are vulnerable. These\nperturbations of indiscriminate magnitude make it difficult to balance\neffective protection for vulnerable users without degrading recommendation\nquality for those who are not affected. To address this issue, our research\ndelves into understanding user vulnerability. Considering that poisoning\nattacks pollute the training data, we note that the higher degree to which a\nrecommender system fits users' training data correlates with an increased\nlikelihood of users incorporating attack information, indicating their\nvulnerability. Leveraging these insights, we introduce the Vulnerability-aware\nAdversarial Training (VAT), designed to defend against poisoning attacks in\nrecommender systems. VAT employs a novel vulnerability-aware function to\nestimate users' vulnerability based on the degree to which the system fits\nthem. Guided by this estimation, VAT applies perturbations of adaptive\nmagnitude to each user, not only reducing the success ratio of attacks but also\npreserving, and potentially enhancing, the quality of recommendations.\nComprehensive experiments confirm VAT's superior defensive capabilities across\ndifferent recommendation models and against various types of attacks.\n","authors":["Kaike Zhang","Qi Cao","Yunfan Wu","Fei Sun","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2409.17476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17460v1","updated":"2024-09-26T01:38:05Z","published":"2024-09-26T01:38:05Z","title":"Towards More Relevant Product Search Ranking Via Large Language Models:\n An Empirical Study","summary":" Training Learning-to-Rank models for e-commerce product search ranking can be\nchallenging due to the lack of a gold standard of ranking relevance. In this\npaper, we decompose ranking relevance into content-based and engagement-based\naspects, and we propose to leverage Large Language Models (LLMs) for both label\nand feature generation in model training, primarily aiming to improve the\nmodel's predictive capability for content-based relevance. Additionally, we\nintroduce different sigmoid transformations on the LLM outputs to polarize\nrelevance scores in labeling, enhancing the model's ability to balance\ncontent-based and engagement-based relevances and thus prioritize highly\nrelevant items overall. Comprehensive online tests and offline evaluations are\nalso conducted for the proposed design. Our work sheds light on advanced\nstrategies for integrating LLMs into e-commerce product search ranking model\ntraining, offering a pathway to more effective and balanced models with\nimproved ranking relevance.\n","authors":["Qi Liu","Atul Singh","Jingbo Liu","Cun Mu","Zheng Yan"],"pdf_url":"https://arxiv.org/pdf/2409.17460v1.pdf","comment":"To be published in CIKM 2024 GenAIECommerce Workshop"},{"id":"http://arxiv.org/abs/2409.17456v1","updated":"2024-09-26T01:18:29Z","published":"2024-09-26T01:18:29Z","title":"Long or Short or Both? An Exploration on Lookback Time Windows of\n Behavioral Features in Product Search Ranking","summary":" Customer shopping behavioral features are core to product search ranking\nmodels in eCommerce. In this paper, we investigate the effect of lookback time\nwindows when aggregating these features at the (query, product) level over\nhistory. By studying the pros and cons of using long and short time windows, we\npropose a novel approach to integrating these historical behavioral features of\ndifferent time windows. In particular, we address the criticality of using\nquery-level vertical signals in ranking models to effectively aggregate all\ninformation from different behavioral features. Anecdotal evidence for the\nproposed approach is also provided using live product search traffic on\nWalmart.com.\n","authors":["Qi Liu","Atul Singh","Jingbo Liu","Cun Mu","Zheng Yan","Jan Pedersen"],"pdf_url":"https://arxiv.org/pdf/2409.17456v1.pdf","comment":"Published in ACM SIGIR Workshop on eCommerce 2024"},{"id":"http://arxiv.org/abs/2409.17436v1","updated":"2024-09-26T00:08:46Z","published":"2024-09-26T00:08:46Z","title":"Minimizing Live Experiments in Recommender Systems: User Simulation to\n Evaluate Preference Elicitation Policies","summary":" Evaluation of policies in recommender systems typically involves A/B testing\nusing live experiments on real users to assess a new policy's impact on\nrelevant metrics. This ``gold standard'' comes at a high cost, however, in\nterms of cycle time, user cost, and potential user retention. In developing\npolicies for ``onboarding'' new users, these costs can be especially\nproblematic, since on-boarding occurs only once. In this work, we describe a\nsimulation methodology used to augment (and reduce) the use of live\nexperiments. We illustrate its deployment for the evaluation of ``preference\nelicitation'' algorithms used to onboard new users of the YouTube Music\nplatform. By developing counterfactually robust user behavior models, and a\nsimulation service that couples such models with production infrastructure, we\nare able to test new algorithms in a way that reliably predicts their\nperformance on key metrics when deployed live. We describe our domain, our\nsimulation models and platform, results of experiments and deployment, and\nsuggest future steps needed to further realistic simulation as a powerful\ncomplement to live experiments.\n","authors":["Chih-Wei Hsu","Martin Mladenov","Ofer Meshi","James Pine","Hubert Pham","Shane Li","Xujian Liang","Anton Polishko","Li Yang","Ben Scheetz","Craig Boutilier"],"pdf_url":"https://arxiv.org/pdf/2409.17436v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.18119v1","updated":"2024-09-26T17:56:59Z","published":"2024-09-26T17:56:59Z","title":"Multi-View and Multi-Scale Alignment for Contrastive Language-Image\n Pre-training in Mammography","summary":" Contrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to\nthese restrictions, existing CLIP applications in medical imaging focus mainly\non modalities like chest X-rays that have abundant image-report data available,\nleaving many other important modalities under-explored. Here, we propose the\nfirst adaptation of the full CLIP model to mammography, which presents\nsignificant challenges due to labeled data scarcity, high-resolution images\nwith small regions of interest, and data imbalance. We first develop a\nspecialized supervision framework for mammography that leverages its multi-view\nnature. Furthermore, we design a symmetric local alignment module to better\nfocus on detailed features in high-resolution images. Lastly, we incorporate a\nparameter-efficient fine-tuning approach for large language models pre-trained\nwith medical knowledge to address data limitations. Our multi-view and\nmulti-scale alignment (MaMA) method outperforms state-of-the-art baselines for\nthree different tasks on two large real-world mammography datasets, EMBED and\nRSNA-Mammo, with only 52% model size compared with the largest baseline.\n","authors":["Yuexi Du","John Onofrey","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2409.18119v1.pdf","comment":"This work is also the basis of the overall best solution for the\n MICCAI 2024 CXR-LT Challenge"},{"id":"http://arxiv.org/abs/2310.13387v2","updated":"2024-09-26T17:55:48Z","published":"2023-10-20T09:56:07Z","title":"Assumption violations in causal discovery and the robustness of score\n matching","summary":" When domain knowledge is limited and experimentation is restricted by\nethical, financial, or time constraints, practitioners turn to observational\ncausal discovery methods to recover the causal structure, exploiting the\nstatistical properties of their data. Because causal discovery without further\nassumptions is an ill-posed problem, each algorithm comes with its own set of\nusually untestable assumptions, some of which are hard to meet in real\ndatasets. Motivated by these considerations, this paper extensively benchmarks\nthe empirical performance of recent causal discovery methods on observational\ni.i.d. data generated under different background conditions, allowing for\nviolations of the critical assumptions required by each selected approach. Our\nexperimental findings show that score matching-based methods demonstrate\nsurprising performance in the false positive and false negative rate of the\ninferred graph in these challenging scenarios, and we provide theoretical\ninsights into their performance. This work is also the first effort to\nbenchmark the stability of causal discovery algorithms with respect to the\nvalues of their hyperparameters. Finally, we hope this paper will set a new\nstandard for the evaluation of causal discovery methods and can serve as an\naccessible entry point for practitioners interested in the field, highlighting\nthe empirical implications of different algorithm choices.\n","authors":["Francesco Montagna","Atalanti A. Mastakouri","Elias Eulig","Nicoletta Noceti","Lorenzo Rosasco","Dominik Janzing","Bryon Aragam","Francesco Locatello"],"pdf_url":"https://arxiv.org/pdf/2310.13387v2.pdf","comment":"37th Conference on Neural Information Processing Systems (NeurIPS\n 2023)"},{"id":"http://arxiv.org/abs/2409.18104v1","updated":"2024-09-26T17:49:20Z","published":"2024-09-26T17:49:20Z","title":"Find Rhinos without Finding Rhinos: Active Learning with Multimodal\n Imagery of South African Rhino Habitats","summary":" Much of Earth's charismatic megafauna is endangered by human activities,\nparticularly the rhino, which is at risk of extinction due to the poaching\ncrisis in Africa. Monitoring rhinos' movement is crucial to their protection\nbut has unfortunately proven difficult because rhinos are elusive. Therefore,\ninstead of tracking rhinos, we propose the novel approach of mapping communal\ndefecation sites, called middens, which give information about rhinos' spatial\nbehavior valuable to anti-poaching, management, and reintroduction efforts.\nThis paper provides the first-ever mapping of rhino midden locations by\nbuilding classifiers to detect them using remotely sensed thermal, RGB, and\nLiDAR imagery in passive and active learning settings. As existing active\nlearning methods perform poorly due to the extreme class imbalance in our\ndataset, we design MultimodAL, an active learning system employing a ranking\ntechnique and multimodality to achieve competitive performance with passive\nlearning models with 94% fewer labels. Our methods could therefore save over 76\nhours in labeling time when used on a similarly-sized dataset. Unexpectedly,\nour midden map reveals that rhino middens are not randomly distributed\nthroughout the landscape; rather, they are clustered. Consequently, rangers\nshould be targeted at areas with high midden densities to strengthen\nanti-poaching efforts, in line with UN Target 15.7.\n","authors":["Lucia Gordon","Nikhil Behari","Samuel Collier","Elizabeth Bondi-Kelly","Jackson A. Killian","Catherine Ressijac","Peter Boucher","Andrew Davies","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2409.18104v1.pdf","comment":"9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good"},{"id":"http://arxiv.org/abs/2409.18102v1","updated":"2024-09-26T17:45:10Z","published":"2024-09-26T17:45:10Z","title":"MALPOLON: A Framework for Deep Species Distribution Modeling","summary":" This paper describes a deep-SDM framework, MALPOLON. Written in Python and\nbuilt upon the PyTorch library, this framework aims to facilitate training and\ninferences of deep species distribution models (deep-SDM) and sharing for users\nwith only general Python language skills (e.g., modeling ecologists) who are\ninterested in testing deep learning approaches to build new SDMs. More advanced\nusers can also benefit from the framework's modularity to run more specific\nexperiments by overriding existing classes while taking advantage of\npress-button examples to train neural networks on multiple classification tasks\nusing custom or provided raw and pre-processed datasets. The framework is\nopen-sourced on GitHub and PyPi along with extensive documentation and examples\nof use in various scenarios. MALPOLON offers straightforward installation,\nYAML-based configuration, parallel computing, multi-GPU utilization, baseline\nand foundational models for benchmarking, and extensive\ntutorials/documentation, aiming to enhance accessibility and performance\nscalability for ecologists and researchers.\n","authors":["Theo Larcher","Lukas Picek","Benjamin Deneu","Titouan Lorieul","Maximilien Servajean","Alexis Joly"],"pdf_url":"https://arxiv.org/pdf/2409.18102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18100v1","updated":"2024-09-26T17:44:29Z","published":"2024-09-26T17:44:29Z","title":"Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine\n Segmentation","summary":" Self-supervised pretraining (SSP) has shown promising results in learning\nfrom large unlabeled datasets and, thus, could be useful for automated\ncardiovascular magnetic resonance (CMR) short-axis cine segmentation. However,\ninconsistent reports of the benefits of SSP for segmentation have made it\ndifficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP\nmethods for CMR cine segmentation.\n To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were\nused for unlabeled pretraining with four SSP methods; SimCLR, positional\ncontrastive learning, DINO, and masked image modeling (MIM). Subsets of varying\nnumbers of subjects were used for supervised fine-tuning of 2D models for each\nSSP method, as well as to train a 2D baseline model from scratch. The\nfine-tuned models were compared to the baseline using the 3D Dice similarity\ncoefficient (DSC) in a test dataset of 140 subjects.\n The SSP methods showed no performance gains with the largest supervised\nfine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects\n(231 2D slices) are available for supervised training, SSP using MIM (DSC =\n0.86) improves over training from scratch (DSC = 0.82).\n This study found that SSP is valuable for CMR cine segmentation when labeled\ntraining data is scarce, but does not aid state-of-the-art deep learning\nmethods when ample labeled data is available. Moreover, the choice of SSP\nmethod is important. The code is publicly available at:\nhttps://github.com/q-cardIA/ssp-cmr-cine-segmentation\n","authors":["Rob A. J. de Mooij","Josien P. W. Pluim","Cian M. Scannell"],"pdf_url":"https://arxiv.org/pdf/2409.18100v1.pdf","comment":"Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at\n MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.04406v2","updated":"2024-09-26T17:38:26Z","published":"2024-09-06T16:56:06Z","title":"Quantum Kernel Methods under Scrutiny: A Benchmarking Study","summary":" Since the entry of kernel theory in the field of quantum machine learning,\nquantum kernel methods (QKMs) have gained increasing attention with regard to\nboth probing promising applications and delivering intriguing research\ninsights. Two common approaches for computing the underlying Gram matrix have\nemerged: fidelity quantum kernels (FQKs) and projected quantum kernels (PQKs).\nBenchmarking these methods is crucial to gain robust insights and to understand\ntheir practical utility. In this work, we present a comprehensive large-scale\nstudy examining QKMs based on FQKs and PQKs across a manifold of design\nchoices. Our investigation encompasses both classification and regression tasks\nfor five dataset families and 64 datasets, systematically comparing the use of\nFQKs and PQKs quantum support vector machines and kernel ridge regression. This\nresulted in over 20,000 models that were trained and optimized using a\nstate-of-the-art hyperparameter search to ensure robust and comprehensive\ninsights. We delve into the importance of hyperparameters on model performance\nscores and support our findings through rigorous correlation analyses. In this,\nwe also closely inspect two data encoding strategies. Moreover, we provide an\nin-depth analysis addressing the design freedom of PQKs and explore the\nunderlying principles responsible for learning. Our goal is not to identify the\nbest-performing model for a specific task but to uncover the mechanisms that\nlead to effective QKMs and reveal universal patterns.\n","authors":["Jan Schnabel","Marco Roth"],"pdf_url":"https://arxiv.org/pdf/2409.04406v2.pdf","comment":"18 pages main text including 12 figures and 1 table, appendix 14\n pages with 19 figures and 1 table; restructure result section and prune\n appendix"},{"id":"http://arxiv.org/abs/2409.18073v1","updated":"2024-09-26T17:19:49Z","published":"2024-09-26T17:19:49Z","title":"Infer Human's Intentions Before Following Natural Language Instructions","summary":" For AI agents to be helpful to humans, they should be able to follow natural\nlanguage instructions to complete everyday cooperative tasks in human\nenvironments. However, real human instructions inherently possess ambiguity,\nbecause the human speakers assume sufficient prior knowledge about their hidden\ngoals and intentions. Standard language grounding and planning methods fail to\naddress such ambiguities because they do not model human internal goals as\nadditional partially observable factors in the environment. We propose a new\nframework, Follow Instructions with Social and Embodied Reasoning (FISER),\naiming for better natural language instruction following in collaborative\nembodied tasks. Our framework makes explicit inferences about human goals and\nintentions as intermediate reasoning steps. We implement a set of\nTransformer-based models and evaluate them over a challenging benchmark,\nHandMeThat. We empirically demonstrate that using social reasoning to\nexplicitly infer human intentions before making action plans surpasses purely\nend-to-end approaches. We also compare our implementation with strong\nbaselines, including Chain of Thought prompting on the largest available\npre-trained language models, and find that FISER provides better performance on\nthe embodied social reasoning tasks under investigation, reaching the\nstate-of-the-art on HandMeThat.\n","authors":["Yanming Wan","Yue Wu","Yiping Wang","Jiayuan Mao","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2409.18073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18061v1","updated":"2024-09-26T17:01:41Z","published":"2024-09-26T17:01:41Z","title":"Optimal Protocols for Continual Learning via Statistical Physics and\n Control Theory","summary":" Artificial neural networks often struggle with catastrophic forgetting when\nlearning multiple tasks sequentially, as training on new tasks degrades the\nperformance on previously learned ones. Recent theoretical work has addressed\nthis issue by analysing learning curves in synthetic frameworks under\npredefined training protocols. However, these protocols relied on heuristics\nand lacked a solid theoretical foundation assessing their optimality. In this\npaper, we fill this gap combining exact equations for training dynamics,\nderived using statistical physics techniques, with optimal control methods. We\napply this approach to teacher-student models for continual learning and\nmulti-task problems, obtaining a theory for task-selection protocols maximising\nperformance while minimising forgetting. Our theoretical analysis offers\nnon-trivial yet interpretable strategies for mitigating catastrophic\nforgetting, shedding light on how optimal learning protocols can modulate\nestablished effects, such as the influence of task similarity on forgetting.\nFinally, we validate our theoretical findings on real-world data.\n","authors":["Francesco Mori","Stefano Sarao Mannelli","Francesca Mignacco"],"pdf_url":"https://arxiv.org/pdf/2409.18061v1.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.18051v1","updated":"2024-09-26T16:55:31Z","published":"2024-09-26T16:55:31Z","title":"Inverse Reinforcement Learning with Multiple Planning Horizons","summary":" In this work, we study an inverse reinforcement learning (IRL) problem where\nthe experts are planning under a shared reward function but with different,\nunknown planning horizons. Without the knowledge of discount factors, the\nreward function has a larger feasible solution set, which makes it harder for\nexisting IRL approaches to identify a reward function. To overcome this\nchallenge, we develop algorithms that can learn a global multi-agent reward\nfunction with agent-specific discount factors that reconstruct the expert\npolicies. We characterize the feasible solution space of the reward function\nand discount factors for both algorithms and demonstrate the generalizability\nof the learned reward function across multiple domains.\n","authors":["Jiayu Yao","Weiwei Pan","Finale Doshi-Velez","Barbara E Engelhardt"],"pdf_url":"https://arxiv.org/pdf/2409.18051v1.pdf","comment":"Accepted at RLC 2024"},{"id":"http://arxiv.org/abs/2409.18049v1","updated":"2024-09-26T16:49:58Z","published":"2024-09-26T16:49:58Z","title":"Revisit Anything: Visual Place Recognition via Image Segment Retrieval","summary":" Accurately recognizing a revisited place is crucial for embodied agents to\nlocalize and navigate. This requires visual representations to be distinct,\ndespite strong variations in camera viewpoint and scene appearance. Existing\nvisual place recognition pipelines encode the \"whole\" image and search for\nmatches. This poses a fundamental challenge in matching two images of the same\nplace captured from different camera viewpoints: \"the similarity of what\noverlaps can be dominated by the dissimilarity of what does not overlap\". We\naddress this by encoding and searching for \"image segments\" instead of the\nwhole images. We propose to use open-set image segmentation to decompose an\nimage into `meaningful' entities (i.e., things and stuff). This enables us to\ncreate a novel image representation as a collection of multiple overlapping\nsubgraphs connecting a segment with its neighboring segments, dubbed\nSuperSegment. Furthermore, to efficiently encode these SuperSegments into\ncompact vector representations, we propose a novel factorized representation of\nfeature aggregation. We show that retrieving these partial representations\nleads to significantly higher recognition recall than the typical whole image\nbased retrieval. Our segments-based approach, dubbed SegVLAD, sets a new\nstate-of-the-art in place recognition on a diverse selection of benchmark\ndatasets, while being applicable to both generic and task-specialized image\nencoders. Finally, we demonstrate the potential of our method to ``revisit\nanything'' by evaluating our method on an object instance retrieval task, which\nbridges the two disparate areas of research: visual place recognition and\nobject-goal navigation, through their common aim of recognizing goal objects\nspecific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything.\n","authors":["Kartik Garg","Sai Shubodh Puligilla","Shishir Kolathaya","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2409.18049v1.pdf","comment":"Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures"},{"id":"http://arxiv.org/abs/2408.11974v2","updated":"2024-09-26T16:48:34Z","published":"2024-08-21T20:14:54Z","title":"Two-Timescale Gradient Descent Ascent Algorithms for Nonconvex Minimax\n Optimization","summary":" We provide a unified analysis of two-timescale gradient descent ascent\n(TTGDA) for solving structured nonconvex minimax optimization problems in the\nform of $\\min_\\textbf{x} \\max_{\\textbf{y} \\in Y} f(\\textbf{x}, \\textbf{y})$,\nwhere the objective function $f(\\textbf{x}, \\textbf{y})$ is nonconvex in\n$\\textbf{x}$ and concave in $\\textbf{y}$, and the constraint set $Y \\subseteq\n\\mathbb{R}^n$ is convex and bounded. In the convex-concave setting, the\nsingle-timescale gradient descent ascent (GDA) algorithm is widely used in\napplications and has been shown to have strong convergence guarantees. In more\ngeneral settings, however, it can fail to converge. Our contribution is to\ndesign TTGDA algorithms that are effective beyond the convex-concave setting,\nefficiently finding a stationary point of the function $\\Phi(\\cdot) :=\n\\max_{\\textbf{y} \\in Y} f(\\cdot, \\textbf{y})$. We also establish theoretical\nbounds on the complexity of solving both smooth and nonsmooth nonconvex-concave\nminimax optimization problems. To the best of our knowledge, this is the first\nsystematic analysis of TTGDA for nonconvex minimax optimization, shedding light\non its superior performance in training generative adversarial networks (GANs)\nand in other real-world application problems.\n","authors":["Tianyi Lin","Chi Jin","Michael. I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2408.11974v2.pdf","comment":"A preliminary version [arXiv:1906.00331] of this paper, with a subset\n of the results that are presented here, was presented at ICML 2020; 44 Pages,\n 10 Figures"},{"id":"http://arxiv.org/abs/2409.18046v1","updated":"2024-09-26T16:47:32Z","published":"2024-09-26T16:47:32Z","title":"IFCap: Image-like Retrieval and Frequency-based Entity Filtering for\n Zero-shot Captioning","summary":" Recent advancements in image captioning have explored text-only training\nmethods to overcome the limitations of paired image-text data. However,\nexisting text-only training methods often overlook the modality gap between\nusing text data during training and employing images during inference. To\naddress this issue, we propose a novel approach called Image-like Retrieval,\nwhich aligns text features with visually relevant features to mitigate the\nmodality gap. Our method further enhances the accuracy of generated captions by\ndesigning a Fusion Module that integrates retrieved captions with input\nfeatures. Additionally, we introduce a Frequency-based Entity Filtering\ntechnique that significantly improves caption quality. We integrate these\nmethods into a unified framework, which we refer to as IFCap\n($\\textbf{I}$mage-like Retrieval and $\\textbf{F}$requency-based Entity\nFiltering for Zero-shot $\\textbf{Cap}$tioning). Through extensive\nexperimentation, our straightforward yet powerful approach has demonstrated its\nefficacy, outperforming the state-of-the-art methods by a significant margin in\nboth image captioning and video captioning compared to zero-shot captioning\nbased on text-only training.\n","authors":["Soeun Lee","Si-Woo Kim","Taewhan Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2409.18046v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.16626v2","updated":"2024-09-26T16:41:27Z","published":"2024-09-25T05:11:58Z","title":"Ascend HiFloat8 Format for Deep Learning","summary":" This preliminary white paper proposes a novel 8-bit floating-point data\nformat HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered\nprecision. For normal value encoding, it provides 7 exponent values with 3-bit\nmantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with\n1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7\nextra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades).\nMeanwhile, HiF8 encodes all the special values except that positive zero and\nnegative zero are represented by only one bit-pattern. Thanks to the better\nbalance between precision and dynamic range, HiF8 can be simultaneously used in\nboth forward and backward passes of AI training. In this paper, we will\ndescribe the definition and rounding methods of HiF8, as well as the tentative\ntraining and inference solutions. To demonstrate the efficacy of HiF8, massive\nsimulation results on various neural networks, including traditional neural\nnetworks and large language models (LLMs), will also be presented.\n","authors":["Yuanyong Luo","Zhongxing Zhang","Richard Wu","Hu Liu","Ying Jin","Kai Zheng","Minmin Wang","Zhanying He","Guipeng Hu","Luyao Chen","Tianchi Hu","Junsong Wang","Minqi Chen","Mikhaylov Dmitry","Korviakov Vladimir","Bobrin Maxim","Yuhao Hu","Guanfu Chen","Zeyi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.16626v2.pdf","comment":"13 Pages, 4 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2409.18032v1","updated":"2024-09-26T16:38:48Z","published":"2024-09-26T16:38:48Z","title":"FlowBench: A Large Scale Benchmark for Flow Simulation over Complex\n Geometries","summary":" Simulating fluid flow around arbitrary shapes is key to solving various\nengineering problems. However, simulating flow physics across complex\ngeometries remains numerically challenging and computationally\nresource-intensive, particularly when using conventional PDE solvers. Machine\nlearning methods offer attractive opportunities to create fast and adaptable\nPDE solvers. However, benchmark datasets to measure the performance of such\nmethods are scarce, especially for flow physics across complex geometries. We\nintroduce FlowBench, a dataset for neural simulators with over 10K samples,\nwhich is currently larger than any publicly available flow physics dataset.\nFlowBench contains flow simulation data across complex geometries\n(\\textit{parametric vs. non-parametric}), spanning a range of flow conditions\n(\\textit{Reynolds number and Grashoff number}), capturing a diverse array of\nflow phenomena (\\textit{steady vs. transient; forced vs. free convection}), and\nfor both 2D and 3D. FlowBench contains over 10K data samples, with each sample\nthe outcome of a fully resolved, direct numerical simulation using a\nwell-validated simulator framework designed for modeling transport phenomena in\ncomplex geometries. For each sample, we include velocity, pressure, and\ntemperature field data at 3 different resolutions and several summary\nstatistics features of engineering relevance (such as coefficients of lift and\ndrag, and Nusselt numbers). %Additionally, we include masks and signed distance\nfields for each shape. We envision that FlowBench will enable evaluating the\ninterplay between complex geometry, coupled flow phenomena, and data\nsufficiency on the performance of current, and future, neural PDE solvers. We\nenumerate several evaluation metrics to help rank order the performance of\nneural PDE solvers. We benchmark the performance of several baseline methods\nincluding FNO, CNO, WNO, and DeepONet.\n","authors":["Ronak Tali","Ali Rabeh","Cheng-Hau Yang","Mehdi Shadkhah","Samundra Karki","Abhisek Upadhyaya","Suriya Dhakshinamoorthy","Marjan Saadati","Soumik Sarkar","Adarsh Krishnamurthy","Chinmay Hegde","Aditya Balu","Baskar Ganapathysubramanian"],"pdf_url":"https://arxiv.org/pdf/2409.18032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15126v5","updated":"2024-09-26T16:38:32Z","published":"2024-08-27T15:07:27Z","title":"Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of\n Peptides","summary":" Molecular Dynamics (MD) is crucial in various fields such as materials\nscience, chemistry, and pharmacology to name a few. Conventional MD software\nstruggles with the balance between time cost and prediction accuracy, which\nrestricts its wider application. Recently, data-driven approaches based on deep\ngenerative models have been devised for time-coarsened dynamics, which aim at\nlearning dynamics of diverse molecular systems over a long timestep, enjoying\nboth universality and efficiency. Nevertheless, most current methods are\ndesigned solely to learn from the data distribution regardless of the\nunderlying Boltzmann distribution, and the physics priors such as energies and\nforces are constantly overlooked. In this work, we propose a conditional\ngenerative model called Force-guided Bridge Matching (FBM), which learns\nfull-atom time-coarsened dynamics and targets the Boltzmann-constrained\ndistribution. With the guidance of our delicately-designed intermediate force\nfield, FBM leverages favourable physics priors into the generation process,\ngiving rise to enhanced simulations. Experiments on two datasets consisting of\npeptides verify our superiority in terms of comprehensive metrics and\ndemonstrate transferability to unseen systems.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15126v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18025v1","updated":"2024-09-26T16:32:19Z","published":"2024-09-26T16:32:19Z","title":"An Adversarial Perspective on Machine Unlearning for AI Safety","summary":" Large language models are finetuned to refuse questions about hazardous\nknowledge, but these protections can often be bypassed. Unlearning methods aim\nat completely removing hazardous capabilities from models and make them\ninaccessible to adversaries. This work challenges the fundamental differences\nbetween unlearning and traditional safety post-training from an adversarial\nperspective. We demonstrate that existing jailbreak methods, previously\nreported as ineffective against unlearning, can be successful when applied\ncarefully. Furthermore, we develop a variety of adaptive methods that recover\nmost supposedly unlearned capabilities. For instance, we show that finetuning\non 10 unrelated examples or removing specific directions in the activation\nspace can recover most hazardous capabilities for models edited with RMU, a\nstate-of-the-art unlearning method. Our findings challenge the robustness of\ncurrent unlearning approaches and question their advantages over safety\ntraining.\n","authors":["Jakub Łucki","Boyi Wei","Yangsibo Huang","Peter Henderson","Florian Tramèr","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2409.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18013v1","updated":"2024-09-26T16:22:08Z","published":"2024-09-26T16:22:08Z","title":"Spatiotemporal Learning on Cell-embedded Graphs","summary":" Data-driven simulation of physical systems has recently kindled significant\nattention, where many neural models have been developed. In particular,\nmesh-based graph neural networks (GNNs) have demonstrated significant potential\nin predicting spatiotemporal dynamics across arbitrary geometric domains.\nHowever, the existing node-edge message passing mechanism in GNNs limits the\nmodel's representation learning ability. In this paper, we proposed a\ncell-embedded GNN model (aka CeGNN) to learn spatiotemporal dynamics with\nlifted performance. Specifically, we introduce a learnable cell attribution to\nthe node-edge message passing process, which better captures the spatial\ndependency of regional features. Such a strategy essentially upgrades the local\naggregation scheme from the first order (e.g., from edge to node) to a higher\norder (e.g., from volume to edge and then to node), which takes advantage of\nvolumetric information in message passing. Meanwhile, a novel feature-enhanced\nblock is designed to further improve the performance of CeGNN and relieve the\nover-smoothness problem, via treating the latent features as basis functions.\nThe extensive experiments on various PDE systems and one real-world dataset\ndemonstrate that CeGNN achieves superior performance compared with other\nbaseline models, particularly reducing the prediction error with up to 1 orders\nof magnitude on several PDE systems.\n","authors":["Yuan Mi","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2409.18013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18000v1","updated":"2024-09-26T16:09:19Z","published":"2024-09-26T16:09:19Z","title":"Safe Time-Varying Optimization based on Gaussian Processes with\n Spatio-Temporal Kernel","summary":" Ensuring safety is a key aspect in sequential decision making problems, such\nas robotics or process control. The complexity of the underlying systems often\nmakes finding the optimal decision challenging, especially when the\nsafety-critical system is time-varying. Overcoming the problem of optimizing an\nunknown time-varying reward subject to unknown time-varying safety constraints,\nwe propose TVSafeOpt, a new algorithm built on Bayesian optimization with a\nspatio-temporal kernel. The algorithm is capable of safely tracking a\ntime-varying safe region without the need for explicit change detection.\nOptimality guarantees are also provided for the algorithm when the optimization\nproblem becomes stationary. We show that TVSafeOpt compares favorably against\nSafeOpt on synthetic data, both regarding safety and optimality. Evaluation on\na realistic case study with gas compressors confirms that TVSafeOpt ensures\nsafety when solving time-varying optimization problems with unknown reward and\nsafety functions.\n","authors":["Jialin Li","Marta Zagorowska","Giulia De Pasquale","Alisa Rupenyan","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2409.18000v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17996v1","updated":"2024-09-26T16:07:24Z","published":"2024-09-26T16:07:24Z","title":"PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless\n Imaging","summary":" Lensless cameras offer significant advantages in size, weight, and cost\ncompared to traditional lens-based systems. Without a focusing lens, lensless\ncameras rely on computational algorithms to recover the scenes from multiplexed\nmeasurements. However, current algorithms struggle with inaccurate forward\nimaging models and insufficient priors to reconstruct high-quality images. To\novercome these limitations, we introduce a novel two-stage approach for\nconsistent and photorealistic lensless image reconstruction. The first stage of\nour approach ensures data consistency by focusing on accurately reconstructing\nthe low-frequency content with a spatially varying deconvolution method that\nadjusts to changes in the Point Spread Function (PSF) across the camera's field\nof view. The second stage enhances photorealism by incorporating a generative\nprior from pre-trained diffusion models. By conditioning on the low-frequency\ncontent retrieved in the first stage, the diffusion model effectively\nreconstructs the high-frequency details that are typically lost in the lensless\nimaging process, while also maintaining image fidelity. Our method achieves a\nsuperior balance between data fidelity and visual quality compared to existing\nmethods, as demonstrated with two popular lensless systems, PhlatCam and\nDiffuserCam. Project website: https://phocolens.github.io/.\n","authors":["Xin Cai","Zhiyuan You","Hailong Zhang","Wentao Liu","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2409.17996v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17995v1","updated":"2024-09-26T16:07:20Z","published":"2024-09-26T16:07:20Z","title":"Joint Localization and Planning using Diffusion","summary":" Diffusion models have been successfully applied to robotics problems such as\nmanipulation and vehicle path planning. In this work, we explore their\napplication to end-to-end navigation -- including both perception and planning\n-- by considering the problem of jointly performing global localization and\npath planning in known but arbitrary 2D environments. In particular, we\nintroduce a diffusion model which produces collision-free paths in a global\nreference frame given an egocentric LIDAR scan, an arbitrary map, and a desired\ngoal position. To this end, we implement diffusion in the space of paths in\nSE(2), and describe how to condition the denoising process on both obstacles\nand sensor observations. In our evaluation, we show that the proposed\nconditioning techniques enable generalization to realistic maps of considerably\ndifferent appearance than the training environment, demonstrate our model's\nability to accurately describe ambiguous solutions, and run extensive\nsimulation experiments showcasing our model's use as a real-time, end-to-end\nlocalization and planning stack.\n","authors":["L. Lao Beyer","S. Karaman"],"pdf_url":"https://arxiv.org/pdf/2409.17995v1.pdf","comment":"7 pages, 9 figures. Submitted to ICRA 2025, under review"},{"id":"http://arxiv.org/abs/2405.15618v2","updated":"2024-09-26T16:05:30Z","published":"2024-05-24T15:04:36Z","title":"MLPs Learn In-Context on Regression and Classification Tasks","summary":" In-context learning (ICL), the remarkable ability to solve a task from only\ninput exemplars, is often assumed to be a unique hallmark of Transformer\nmodels. By examining commonly employed synthetic ICL tasks, we demonstrate that\nmulti-layer perceptrons (MLPs) can also learn in-context. Moreover, MLPs, and\nthe closely related MLP-Mixer models, learn in-context competitively with\nTransformers given the same compute budget in this setting. We further show\nthat MLPs outperform Transformers on a series of classical tasks from\npsychology designed to test relational reasoning, which are closely related to\nin-context classification. These results underscore a need for studying\nin-context learning beyond attention-based architectures, while also\nchallenging strong prior arguments about MLPs' limited ability to solve\nrelational tasks. Altogether, our results highlight the unexpected competence\nof MLPs, and support the growing interest in all-MLP alternatives to\ntask-specific architectures.\n","authors":["William L. Tong","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2405.15618v2.pdf","comment":"30 pages, 10 figures, code available at\n https://github.com/wtong98/mlp-icl"},{"id":"http://arxiv.org/abs/2409.17992v1","updated":"2024-09-26T16:02:25Z","published":"2024-09-26T16:02:25Z","title":"LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged\n Robots","summary":" Reinforcement Learning (RL) has shown its remarkable and generalizable\ncapability in legged locomotion through sim-to-real transfer. However, while\nadaptive methods like domain randomization are expected to make policy more\nrobust to diverse environments, such comprehensiveness potentially detracts\nfrom the policy's performance in any specific environment according to the No\nFree Lunch theorem, leading to a suboptimal solution once deployed in the real\nworld. To address this issue, we propose a lifelong policy adaptation framework\nnamed LoopSR, which utilizes a transformer-based encoder to project real-world\ntrajectories into a latent space, and accordingly reconstruct the real-world\nenvironments back in simulation for further improvement. Autoencoder\narchitecture and contrastive learning methods are adopted to better extract the\ncharacteristics of real-world dynamics. The simulation parameters for continual\ntraining are derived by combining predicted parameters from the decoder with\nretrieved parameters from the simulation trajectory dataset. By leveraging the\ncontinual training, LoopSR achieves superior data efficiency compared with\nstrong baselines, with only a limited amount of data to yield eminent\nperformance in both sim-to-sim and sim-to-real experiments.\n","authors":["Peilin Wu","Weiji Xie","Jiahang Cao","Hang Lai","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17992v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2409.17991v1","updated":"2024-09-26T16:02:13Z","published":"2024-09-26T16:02:13Z","title":"Dimension-independent learning rates for high-dimensional classification\n problems","summary":" We study the problem of approximating and estimating classification functions\nthat have their decision boundary in the $RBV^2$ space. Functions of $RBV^2$\ntype arise naturally as solutions of regularized neural network learning\nproblems and neural networks can approximate these functions without the curse\nof dimensionality. We modify existing results to show that every $RBV^2$\nfunction can be approximated by a neural network with bounded weights.\nThereafter, we prove the existence of a neural network with bounded weights\napproximating a classification function. And we leverage these bounds to\nquantify the estimation rates. Finally, we present a numerical study that\nanalyzes the effect of different regularity conditions on the decision\nboundaries.\n","authors":["Andres Felipe Lerma-Pineda","Philipp Petersen","Simon Frieder","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2409.17991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17986v1","updated":"2024-09-26T15:56:40Z","published":"2024-09-26T15:56:40Z","title":"Supra-Laplacian Encoding for Transformer on Dynamic Graphs","summary":" Fully connected Graph Transformers (GT) have rapidly become prominent in the\nstatic graph community as an alternative to Message-Passing models, which\nsuffer from a lack of expressivity, oversquashing, and under-reaching. However,\nin a dynamic context, by interconnecting all nodes at multiple snapshots with\nself-attention, GT loose both structural and temporal information. In this\nwork, we introduce Supra-LAplacian encoding for spatio-temporal TransformErs\n(SLATE), a new spatio-temporal encoding to leverage the GT architecture while\nkeeping spatio-temporal information. Specifically, we transform Discrete Time\nDynamic Graphs into multi-layer graphs and take advantage of the spectral\nproperties of their associated supra-Laplacian matrix. Our second contribution\nexplicitly model nodes' pairwise relationships with a cross-attention\nmechanism, providing an accurate edge representation for dynamic link\nprediction. SLATE outperforms numerous state-of-the-art methods based on\nMessage-Passing Graph Neural Networks combined with recurrent models (e.g\nLSTM), and Dynamic Graph Transformers, on 9 datasets. Code and instructions to\nreproduce our results will be open-sourced.\n","authors":["Yannis Karmim","Marc Lafon","Raphaël Fournier S'niehotta","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2409.17986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15244v2","updated":"2024-09-26T15:56:30Z","published":"2024-03-22T14:40:29Z","title":"A Stochastic Quasi-Newton Method for Non-convex Optimization with\n Non-uniform Smoothness","summary":" Classical convergence analyses for optimization algorithms rely on the\nwidely-adopted uniform smoothness assumption. However, recent experimental\nstudies have demonstrated that many machine learning problems exhibit\nnon-uniform smoothness, meaning the smoothness factor is a function of the\nmodel parameter instead of a universal constant. In particular, it has been\nobserved that the smoothness grows with respect to the gradient norm along the\ntraining trajectory. Motivated by this phenomenon, the recently introduced\n$(L_0, L_1)$-smoothness is a more general notion, compared to traditional\n$L$-smoothness, that captures such positive relationship between smoothness and\ngradient norm. Under this type of non-uniform smoothness, existing literature\nhas designed stochastic first-order algorithms by utilizing gradient clipping\ntechniques to obtain the optimal $\\mathcal{O}(\\epsilon^{-3})$ sample complexity\nfor finding an $\\epsilon$-approximate first-order stationary solution.\nNevertheless, the studies of quasi-Newton methods are still lacking.\nConsidering higher accuracy and more robustness for quasi-Newton methods, in\nthis paper we propose a fast stochastic quasi-Newton method when there exists\nnon-uniformity in smoothness. Leveraging gradient clipping and variance\nreduction, our algorithm can achieve the best-known\n$\\mathcal{O}(\\epsilon^{-3})$ sample complexity and enjoys convergence speedup\nwith simple hyperparameter tuning. Our numerical experiments show that our\nproposed algorithm outperforms the state-of-the-art approaches.\n","authors":["Zhenyu Sun","Ermin Wei"],"pdf_url":"https://arxiv.org/pdf/2403.15244v2.pdf","comment":"Paper accepted by CDC 2024"},{"id":"http://arxiv.org/abs/2409.17985v1","updated":"2024-09-26T15:55:59Z","published":"2024-09-26T15:55:59Z","title":"Hypergame Theory for Decentralized Resource Allocation in Multi-user\n Semantic Communications","summary":" Semantic communications (SC) is an emerging communication paradigm in which\nwireless devices can send only relevant information from a source of data while\nrelying on computing resources to regenerate missing data points. However, the\ndesign of a multi-user SC system becomes more challenging because of the\ncomputing and communication overhead required for coordination. Existing\nsolutions for learning the semantic language and performing resource allocation\noften fail to capture the computing and communication tradeoffs involved in\nmultiuser SC. To address this gap, a novel framework for decentralized\ncomputing and communication resource allocation in multiuser SC systems is\nproposed. The challenge of efficiently allocating communication and computing\nresources (for reasoning) in a decentralized manner to maximize the quality of\ntask experience for the end users is addressed through the application of\nStackelberg hyper game theory. Leveraging the concept of second-level hyper\ngames, novel analytical formulations are developed to model misperceptions of\nthe users about each other's communication and control strategies. Further,\nequilibrium analysis of the learned resource allocation protocols examines the\nconvergence of the computing and communication strategies to a local\nStackelberg equilibria, considering misperceptions. Simulation results show\nthat the proposed Stackelberg hyper game results in efficient usage of\ncommunication and computing resources while maintaining a high quality of\nexperience for the users compared to state-of-the-art that does not account for\nthe misperceptions.\n","authors":["Christo Kurisummoottil Thomas","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2409.17985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15059v2","updated":"2024-09-26T15:53:10Z","published":"2024-05-23T21:17:20Z","title":"Message-Passing Monte Carlo: Generating low-discrepancy point sets via\n Graph Neural Networks","summary":" Discrepancy is a well-known measure for the irregularity of the distribution\nof a point set. Point sets with small discrepancy are called low-discrepancy\nand are known to efficiently fill the space in a uniform manner.\nLow-discrepancy points play a central role in many problems in science and\nengineering, including numerical integration, computer vision, machine\nperception, computer graphics, machine learning, and simulation. In this work,\nwe present the first machine learning approach to generate a new class of\nlow-discrepancy point sets named Message-Passing Monte Carlo (MPMC) points.\nMotivated by the geometric nature of generating low-discrepancy point sets, we\nleverage tools from Geometric Deep Learning and base our model on Graph Neural\nNetworks. We further provide an extension of our framework to higher\ndimensions, which flexibly allows the generation of custom-made points that\nemphasize the uniformity in specific dimensions that are primarily important\nfor the particular problem at hand. Finally, we demonstrate that our proposed\nmodel achieves state-of-the-art performance superior to previous methods by a\nsignificant margin. In fact, MPMC points are empirically shown to be either\noptimal or near-optimal with respect to the discrepancy for low dimension and\nsmall number of points, i.e., for which the optimal discrepancy can be\ndetermined. Code for generating MPMC points can be found at\nhttps://github.com/tk-rusch/MPMC.\n","authors":["T. Konstantin Rusch","Nathan Kirk","Michael M. Bronstein","Christiane Lemieux","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2405.15059v2.pdf","comment":"Published in Proceedings of the National Academy of Sciences (PNAS):\n https://www.pnas.org/doi/10.1073/pnas.2409913121"},{"id":"http://arxiv.org/abs/2409.17978v1","updated":"2024-09-26T15:52:36Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17972v1","updated":"2024-09-26T15:47:42Z","published":"2024-09-26T15:47:42Z","title":"BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and\n Adaptive Disambiguate based Efficient Tree Search","summary":" Large Language Models (LLMs) have exhibited exceptional performance across a\nbroad range of tasks and domains. However, they still encounter difficulties in\nsolving mathematical problems due to the rigorous and logical nature of\nmathematics. Previous studies have employed techniques such as supervised\nfine-tuning (SFT), prompt engineering, and search-based methods to improve the\nmathematical problem-solving abilities of LLMs. Despite these efforts, their\nperformance remains suboptimal and demands substantial computational resources.\nTo address this issue, we propose a novel approach, BEATS, to enhance\nmathematical problem-solving abilities. Our method leverages newly designed\nprompts that guide the model to iteratively rewrite, advance by one step, and\ngenerate answers based on previous steps. Additionally, we introduce a new\nback-verification technique that uses LLMs to validate the correctness of the\ngenerated answers. Furthermore, we employ a pruning tree search to optimize\nsearch time while achieving strong performance. Notably, our method improves\nQwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the\nMATH benchmark.\n","authors":["Linzhuang Sun","Hao Liang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.17972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14500v2","updated":"2024-09-26T15:26:43Z","published":"2024-09-22T15:53:19Z","title":"TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with\n Tabular Node Features","summary":" Tabular machine learning is an important field for industry and science. In\nthis field, table rows are usually treated as independent data samples, but\nadditional information about relations between them is sometimes available and\ncan be used to improve predictive performance. Such information can be\nnaturally modeled with a graph, thus tabular machine learning may benefit from\ngraph machine learning methods. However, graph machine learning models are\ntypically evaluated on datasets with homogeneous node features, which have\nlittle in common with heterogeneous mixtures of numerical and categorical\nfeatures present in tabular datasets. Thus, there is a critical difference\nbetween the data used in tabular and graph machine learning studies, which does\nnot allow one to understand how successfully graph models can be transferred to\ntabular data. To bridge this gap, we propose a new benchmark of diverse graphs\nwith heterogeneous tabular node features and realistic prediction tasks. We use\nthis benchmark to evaluate a vast set of models, including simple methods\npreviously overlooked in the literature. Our experiments show that graph neural\nnetworks (GNNs) can indeed often bring gains in predictive performance for\ntabular data, but standard tabular models also can be adapted to work with\ngraph data by using simple feature preprocessing, which sometimes enables them\nto compete with and even outperform GNNs. Based on our empirical study, we\nprovide insights for researchers and practitioners in both tabular and graph\nmachine learning fields.\n","authors":["Gleb Bazhenov","Oleg Platonov","Liudmila Prokhorenkova"],"pdf_url":"https://arxiv.org/pdf/2409.14500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09825v2","updated":"2024-09-26T15:21:31Z","published":"2024-06-14T08:29:34Z","title":"Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of\n Anomalous Behavior in Bio-regenerative Life Support System Telemetry","summary":" The detection of abnormal or critical system states is essential in condition\nmonitoring. While much attention is given to promptly identifying anomalies, a\nretrospective analysis of these anomalies can significantly enhance our\ncomprehension of the underlying causes of observed undesired behavior. This\naspect becomes particularly critical when the monitored system is deployed in a\nvital environment. In this study, we delve into anomalies within the domain of\nBio-Regenerative Life Support Systems (BLSS) for space exploration and analyze\nanomalies found in telemetry data stemming from the EDEN ISS space greenhouse\nin Antarctica. We employ time series clustering on anomaly detection results to\ncategorize various types of anomalies in both uni- and multivariate settings.\nWe then assess the effectiveness of these methods in identifying systematic\nanomalous behavior. Additionally, we illustrate that the anomaly detection\nmethods MDI and DAMP produce complementary results, as previously indicated by\nresearch.\n","authors":["Ferdinand Rewicki","Jakob Gawlikowski","Julia Niebling","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2406.09825v2.pdf","comment":"12 pages, + Supplemental Materials, Published at Machine Learning and\n Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2409.17943v1","updated":"2024-09-26T15:18:34Z","published":"2024-09-26T15:18:34Z","title":"On Translating Technical Terminology: A Translation Workflow for\n Machine-Translated Acronyms","summary":" The typical workflow for a professional translator to translate a document\nfrom its source language (SL) to a target language (TL) is not always focused\non what many language models in natural language processing (NLP) do - predict\nthe next word in a series of words. While high-resource languages like English\nand French are reported to achieve near human parity using common metrics for\nmeasurement such as BLEU and COMET, we find that an important step is being\nmissed: the translation of technical terms, specifically acronyms. Some\nstate-of-the art machine translation systems like Google Translate which are\npublicly available can be erroneous when dealing with acronyms - as much as 50%\nin our findings. This article addresses acronym disambiguation for MT systems\nby proposing an additional step to the SL-TL (FR-EN) translation workflow where\nwe first offer a new acronym corpus for public consumption and then experiment\nwith a search-based thresholding algorithm that achieves nearly 10% increase\nwhen compared to Google Translate and OpusMT.\n","authors":["Richard Yue","John E. Ortega","Kenneth Ward Church"],"pdf_url":"https://arxiv.org/pdf/2409.17943v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17939v1","updated":"2024-09-26T15:12:59Z","published":"2024-09-26T15:12:59Z","title":"Predicting Anchored Text from Translation Memories for Machine\n Translation Using Deep Learning Methods","summary":" Translation memories (TMs) are the backbone for professional translation\ntools called computer-aided translation (CAT) tools. In order to perform a\ntranslation using a CAT tool, a translator uses the TM to gather translations\nsimilar to the desired segment to translate (s'). Many CAT tools offer a\nfuzzy-match algorithm to locate segments (s) in the TM that are close in\ndistance to s'. After locating two similar segments, the CAT tool will present\nparallel segments (s, t) that contain one segment in the source language along\nwith its translation in the target language. Additionally, CAT tools contain\nfuzzy-match repair (FMR) techniques that will automatically use the parallel\nsegments from the TM to create new TM entries containing a modified version of\nthe original with the idea in mind that it will be the translation of s'. Most\nFMR techniques use machine translation as a way of \"repairing\" those words that\nhave to be modified. In this article, we show that for a large part of those\nwords which are anchored, we can use other techniques that are based on machine\nlearning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we\nshow that for anchored words that follow the continuous bag-of-words (CBOW)\nparadigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for\nsome cases, better results than neural machine translation for translating\nanchored words from French to English.\n","authors":["Richard Yue","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2409.17939v1.pdf","comment":"AMTA 2024 - The Association for Machine Translation in the Americas\n organizes biennial conferences devoted to researchers, commercial users,\n governmental and NGO users"},{"id":"http://arxiv.org/abs/2409.17937v1","updated":"2024-09-26T15:12:41Z","published":"2024-09-26T15:12:41Z","title":"Adaptive Stream Processing on Edge Devices through Active Inference","summary":" The current scenario of IoT is witnessing a constant increase on the volume\nof data, which is generated in constant stream, calling for novel architectural\nand logical solutions for processing it. Moving the data handling towards the\nedge of the computing spectrum guarantees better distribution of load and, in\nprinciple, lower latency and better privacy. However, managing such a structure\nis complex, especially when requirements, also referred to Service Level\nObjectives (SLOs), specified by applications' owners and infrastructure\nmanagers need to be ensured. Despite the rich number of proposals of Machine\nLearning (ML) based management solutions, researchers and practitioners yet\nstruggle to guarantee long-term prediction and control, and accurate\ntroubleshooting. Therefore, we present a novel ML paradigm based on Active\nInference (AIF) -- a concept from neuroscience that describes how the brain\nconstantly predicts and evaluates sensory information to decrease long-term\nsurprise. We implement it and evaluate it in a heterogeneous real stream\nprocessing use case, where an AIF-based agent continuously optimizes the\nfulfillment of three SLOs for three autonomous driving services running on\nmultiple devices. The agent used causal knowledge to gradually develop an\nunderstanding of how its actions are related to requirements fulfillment, and\nwhich configurations to favor. Through this approach, our agent requires up to\nthirty iterations to converge to the optimal solution, showing the capability\nof offering accurate results in a short amount of time. Furthermore, thanks to\nAIF and its causal structures, our method guarantees full transparency on the\ndecision making, making the interpretation of the results and the\ntroubleshooting effortless.\n","authors":["Boris Sedlak","Victor Casamayor Pujol","Andrea Morichetta","Praveen Kumar Donta","Schahram Dustdar"],"pdf_url":"https://arxiv.org/pdf/2409.17937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17932v1","updated":"2024-09-26T15:08:52Z","published":"2024-09-26T15:08:52Z","title":"Sample compression unleashed : New generalization bounds for real valued\n losses","summary":" The sample compression theory provides generalization guarantees for\npredictors that can be fully defined using a subset of the training dataset and\na (short) message string, generally defined as a binary sequence. Previous\nworks provided generalization bounds for the zero-one loss, which is\nrestrictive, notably when applied to deep learning approaches. In this paper,\nwe present a general framework for deriving new sample compression bounds that\nhold for real-valued losses. We empirically demonstrate the tightness of the\nbounds and their versatility by evaluating them on different types of models,\ne.g., neural networks and decision forests, trained with the Pick-To-Learn\n(P2L) meta-algorithm, which transforms the training method of any\nmachine-learning predictor to yield sample-compressed predictors. In contrast\nto existing P2L bounds, ours are valid in the non-consistent case.\n","authors":["Mathieu Bazinet","Valentina Zantedeschi","Pascal Germain"],"pdf_url":"https://arxiv.org/pdf/2409.17932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17931v1","updated":"2024-09-26T15:08:38Z","published":"2024-09-26T15:08:38Z","title":"Intelligent Energy Management: Remaining Useful Life Prediction and\n Charging Automation System Comprised of Deep Learning and the Internet of\n Things","summary":" Remaining Useful Life (RUL) of battery is an important parameter to know the\nbattery's remaining life and need for recharge. The goal of this research\nproject is to develop machine learning-based models for the battery RUL\ndataset. Different ML models are developed to classify the RUL of the vehicle,\nand the IoT (Internet of Things) concept is simulated for automating the\ncharging system and managing any faults aligning. The graphs plotted depict the\nrelationship between various vehicle parameters using the Blynk IoT platform.\nResults show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent\nUnit (GRU), and hybrid model developed could classify RUL into three classes\nwith 99% more accuracy. The data is fed using the tkinter GUI for simulating\nartificial intelligence (AI)-based charging, and with a pyserial backend, data\ncan be entered into the Esp-32 microcontroller for making charge discharge\npossible with the model's predictions. Also, with an IoT system, the charging\ncan be disconnected, monitored, and analyzed for automation. The results show\nthat an accuracy of 99% can be obtained on models MLP, catboost model and\nsimilar accuracy on GRU model can be obtained, and finally relay-based\ntriggering can be made by prediction through the model used for automating the\ncharging and energy-saving mechanism. By showcasing an exemplary Blynk\nplatform-based monitoring and automation phenomenon, we further present\ninnovative ways of monitoring parameters and automating the system.\n","authors":["Biplov Paneru","Bishwash Paneru","DP Sharma Mainali"],"pdf_url":"https://arxiv.org/pdf/2409.17931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15228v3","updated":"2024-09-26T14:57:52Z","published":"2024-09-23T17:22:09Z","title":"A Comprehensive Framework for Evaluating API-oriented Code Generation in\n Large Language Models","summary":" Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as\npowerful tools for code generation, significantly enhancing productivity and\naccelerating software development. However, existing benchmarks primarily focus\non general code generation without considering API-oriented code generation,\ni.e., generating code that invokes APIs from specific libraries. Given the\ngrowing demand for API-oriented code generation, there is a pressing need for a\nsystematic and automated approach to evaluate LLM on API-oriented code\ngeneration. To address this gap, we propose AutoAPIEval, a lightweight and\nautomated framework designed to evaluate the capabilities of LLMs in\nAPI-oriented code generation. Our framework works with any library that\nprovides API documentation and focuses on two unit tasks: API recommendation\nand code example generation, along with four metrics to evaluate the generated\nAPIs and code examples, such as the proportion of incorrect API recommendations\nfor Task 1, and the proportion of code examples where no specific API is\ninvoked and uncompilable/unexecutable code examples for Task 2. In addition, we\nconducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder)\nand Java Runtime Environment 8 to demonstrate the framework's effectiveness.\nOur findings reveal substantial variability in LLM performance across tasks,\nwith ChatGPT adhering better to instructions, while sharing similar\neffectiveness in code example generation with its counterparts (i.e., MagiCoder\nand DeekSeek Coder). We also identify key factors associated with code quality,\nsuch as API popularity and model confidence, and build classifiers that achieve\nhigh accuracy in detecting incorrect API recommendations and erroneous code\nexamples. Retrieval-augmented generation enhances the quality of code generated\nby LLMs, though its effectiveness varies across different LLMs.\n","authors":["Yixi Wu","Pengfei He","Zehao Wang","Shaowei Wang","Yuan Tian","Tse-Hsun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.15228v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08201v2","updated":"2024-09-26T14:56:57Z","published":"2024-09-12T16:38:20Z","title":"Machine Learning for Two-Sample Testing under Right-Censored Data: A\n Simulation Study","summary":" The focus of this study is to evaluate the effectiveness of Machine Learning\n(ML) methods for two-sample testing with right-censored observations. To\nachieve this, we develop several ML-based methods with varying architectures\nand implement them as two-sample tests. Each method is an ensemble (stacking)\nthat combines predictions from classical two-sample tests. This paper presents\nthe results of training the proposed ML methods, examines their statistical\npower compared to classical two-sample tests, analyzes the null distribution of\nthe proposed methods when the null hypothesis is true, and evaluates the\nsignificance of the features incorporated into the proposed methods. In total,\nthis work covers 18 methods for two-sample testing under right-censored\nobservations, including the proposed methods and classical well-studied\ntwo-sample tests. All results from numerical experiments were obtained from a\nsynthetic dataset generated using the inverse transform sampling method and\nreplicated multiple times through Monte Carlo simulation. To test the\ntwo-sample problem with right-censored observations, one can use the proposed\ntwo-sample methods (scripts, dataset, and models are available on GitHub and\nHugging Face).\n","authors":["Petr Philonenko","Sergey Postovalov"],"pdf_url":"https://arxiv.org/pdf/2409.08201v2.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.17906v1","updated":"2024-09-26T14:52:40Z","published":"2024-09-26T14:52:40Z","title":"Graph Reasoning with Large Language Models via Pseudo-code Prompting","summary":" Large language models (LLMs) have recently achieved remarkable success in\nvarious reasoning tasks in the field of natural language processing. This\nsuccess of LLMs has also motivated their use in graph-related tasks. Among\nothers, recent work has explored whether LLMs can solve graph problems such as\ncounting the number of connected components of a graph or computing the\nshortest path distance between two nodes. Although LLMs possess preliminary\ngraph reasoning abilities, they might still struggle to solve some seemingly\nsimple problems. In this paper, we investigate whether prompting via\npseudo-code instructions can improve the performance of LLMs in solving graph\nproblems. Our experiments demonstrate that using pseudo-code instructions\ngenerally improves the performance of all considered LLMs. The graphs,\npseudo-code prompts, and evaluation code are publicly available.\n","authors":["Konstantinos Skianis","Giannis Nikolentzos","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2409.17906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17902v1","updated":"2024-09-26T14:50:20Z","published":"2024-09-26T14:50:20Z","title":"Designing Short-Stage CDC-XPUFs: Balancing Reliability, Cost, and\n Security in IoT Devices","summary":" The rapid expansion of Internet of Things (IoT) devices demands robust and\nresource-efficient security solutions. Physically Unclonable Functions (PUFs),\nwhich generate unique cryptographic keys from inherent hardware variations,\noffer a promising approach. However, traditional PUFs like Arbiter PUFs (APUFs)\nand XOR Arbiter PUFs (XOR-PUFs) are susceptible to machine learning (ML) and\nreliability-based attacks. In this study, we investigate\nComponent-Differentially Challenged XOR-PUFs (CDC-XPUFs), a less explored\nvariant, to address these vulnerabilities. We propose an optimized CDC-XPUF\ndesign that incorporates a pre-selection strategy to enhance reliability and\nintroduces a novel lightweight architecture to reduce hardware overhead.\nRigorous testing demonstrates that our design significantly lowers resource\nconsumption, maintains strong resistance to ML attacks, and improves\nreliability, effectively mitigating reliability-based attacks. These results\nhighlight the potential of CDC-XPUFs as a secure and efficient candidate for\nwidespread deployment in resource-constrained IoT systems.\n","authors":["Gaoxiang Li","Yu Zhuang"],"pdf_url":"https://arxiv.org/pdf/2409.17902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17896v1","updated":"2024-09-26T14:47:14Z","published":"2024-09-26T14:47:14Z","title":"Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV\n Attitude Control Under Varying Wind Conditions","summary":" This paper evaluates and compares the performance of model-free and\nmodel-based reinforcement learning for the attitude control of fixed-wing\nunmanned aerial vehicles using PID as a reference point. The comparison focuses\non their ability to handle varying flight dynamics and wind disturbances in a\nsimulated environment. Our results show that the Temporal Difference Model\nPredictive Control agent outperforms both the PID controller and other\nmodel-free reinforcement learning methods in terms of tracking accuracy and\nrobustness over different reference difficulties, particularly in nonlinear\nflight regimes. Furthermore, we introduce actuation fluctuation as a key metric\nto assess energy efficiency and actuator wear, and we test two different\napproaches from the literature: action variation penalty and conditioning for\naction policy smoothness. We also evaluate all control methods when subject to\nstochastic turbulence and gusts separately, so as to measure their effects on\ntracking performance, observe their limitations and outline their implications\non the Markov decision process formalism.\n","authors":["David Olivares","Pierre Fournier","Pavan Vasishta","Julien Marzat"],"pdf_url":"https://arxiv.org/pdf/2409.17896v1.pdf","comment":"Published at ICINCO 2024"},{"id":"http://arxiv.org/abs/2409.17889v1","updated":"2024-09-26T14:38:54Z","published":"2024-09-26T14:38:54Z","title":"A multi-source data power load forecasting method using attention\n mechanism-based parallel cnn-gru","summary":" Accurate power load forecasting is crucial for improving energy efficiency\nand ensuring power supply quality. Considering the power load forecasting\nproblem involves not only dynamic factors like historical load variations but\nalso static factors such as climate conditions that remain constant over\nspecific periods. From the model-agnostic perspective, this paper proposes a\nparallel structure network to extract important information from both dynamic\nand static data. Firstly, based on complexity learning theory, it is\ndemonstrated that models integrated through parallel structures exhibit\nsuperior generalization abilities compared to individual base learners.\nAdditionally, the higher the independence between base learners, the stronger\nthe generalization ability of the parallel structure model. This suggests that\nthe structure of machine learning models inherently contains significant\ninformation. Building on this theoretical foundation, a parallel convolutional\nneural network (CNN)-gate recurrent unit (GRU) attention model (PCGA) is\nemployed to address the power load forecasting issue, aiming to effectively\nintegrate the influences of dynamic and static features. The CNN module is\nresponsible for capturing spatial characteristics from static data, while the\nGRU module captures long-term dependencies in dynamic time series data. The\nattention layer is designed to focus on key information from the\nspatial-temporal features extracted by the parallel CNN-GRU. To substantiate\nthe advantages of the parallel structure model in extracting and integrating\nmulti-source information, a series of experiments are conducted.\n","authors":["Chao Min","Yijia Wang","Bo Zhang","Xin Ma","Junyi Cui"],"pdf_url":"https://arxiv.org/pdf/2409.17889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.12060v2","updated":"2024-09-26T14:32:36Z","published":"2021-06-22T21:15:00Z","title":"Faster Randomized Methods for Orthogonality Constrained Problems","summary":" Recent literature has advocated the use of randomized methods for\naccelerating the solution of various matrix problems arising throughout data\nscience and computational science. One popular strategy for leveraging\nrandomization is to use it as a way to reduce problem size. However, methods\nbased on this strategy lack sufficient accuracy for some applications.\nRandomized preconditioning is another approach for leveraging randomization,\nwhich provides higher accuracy. The main challenge in using randomized\npreconditioning is the need for an underlying iterative method, thus randomized\npreconditioning so far have been applied almost exclusively to solving\nregression problems and linear systems. In this article, we show how to expand\nthe application of randomized preconditioning to another important set of\nproblems prevalent across data science: optimization problems with\n(generalized) orthogonality constraints. We demonstrate our approach, which is\nbased on the framework of Riemannian optimization and Riemannian\npreconditioning, on the problem of computing the dominant canonical\ncorrelations and on the Fisher linear discriminant analysis problem. For both\nproblems, we evaluate the effect of preconditioning on the computational costs\nand asymptotic convergence, and demonstrate empirically the utility of our\napproach.\n","authors":["Boris Shustin","Haim Avron"],"pdf_url":"https://arxiv.org/pdf/2106.12060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01807v2","updated":"2024-09-26T14:21:10Z","published":"2023-10-03T05:40:56Z","title":"Discrete, compositional, and symbolic representations through attractor\n dynamics","summary":" Symbolic systems are powerful frameworks for modeling cognitive processes as\nthey encapsulate the rules and relationships fundamental to many aspects of\nhuman reasoning and behavior. Central to these models are systematicity,\ncompositionality, and productivity, making them invaluable in both cognitive\nscience and artificial intelligence. However, certain limitations remain. For\ninstance, the integration of structured symbolic processes and latent\nsub-symbolic processes has been implemented at the computational level through\nfiat methods such as quantization or softmax sampling, which assume, rather\nthan derive, the operations underpinning discretization and symbolicization. In\nthis work, we introduce a novel neural stochastic dynamical systems model that\nintegrates attractor dynamics with symbolic representations to model cognitive\nprocesses akin to the probabilistic language of thought (PLoT). Our model\nsegments the continuous representational space into discrete basins, with\nattractor states corresponding to symbolic sequences, that reflect the\nsemanticity and compositionality characteristic of symbolic systems through\nunsupervised learning, rather than relying on pre-defined primitives. Moreover,\nlike PLoT, our model learns to sample a diverse distribution of attractor\nstates that reflect the mutual information between the input data and the\nsymbolic encodings. This approach establishes a unified framework that\nintegrates both symbolic and sub-symbolic processing through neural dynamics, a\nneuro-plausible substrate with proven expressivity in AI, offering a more\ncomprehensive model that mirrors the complex duality of cognitive operations.\n","authors":["Andrew Nam","Eric Elmoznino","Nikolay Malkin","James McClelland","Yoshua Bengio","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2310.01807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17872v1","updated":"2024-09-26T14:19:07Z","published":"2024-09-26T14:19:07Z","title":"A method for identifying causality in the response of nonlinear\n dynamical systems","summary":" Predicting the response of nonlinear dynamical systems subject to random,\nbroadband excitation is important across a range of scientific disciplines,\nsuch as structural dynamics and neuroscience. Building data-driven models\nrequires experimental measurements of the system input and output, but it can\nbe difficult to determine whether inaccuracies in the model stem from modelling\nerrors or noise. This paper presents a novel method to identify the causal\ncomponent of the input-output data from measurements of a system in the\npresence of output noise, as a function of frequency, without needing a high\nfidelity model. An output prediction, calculated using an available model, is\noptimally combined with noisy measurements of the output to predict the input\nto the system. The parameters of the algorithm balance the two output signals\nand are utilised to calculate a nonlinear coherence metric as a measure of\ncausality. This method is applicable to a broad class of nonlinear dynamical\nsystems. There are currently no solutions to this problem in the absence of a\ncomplete benchmark model.\n","authors":["Joseph Massingham","Ole Nielsen","Tore Butlin"],"pdf_url":"https://arxiv.org/pdf/2409.17872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17870v1","updated":"2024-09-26T14:17:58Z","published":"2024-09-26T14:17:58Z","title":"Efficient Arbitrary Precision Acceleration for Large Language Models on\n GPU Tensor Cores","summary":" Large language models (LLMs) have been widely applied but face challenges in\nefficient inference. While quantization methods reduce computational demands,\nultra-low bit quantization with arbitrary precision is hindered by limited GPU\nTensor Core support and inefficient memory management, leading to suboptimal\nacceleration. To address these challenges, we propose a comprehensive\nacceleration scheme for arbitrary precision LLMs. At its core, we introduce a\nnovel bipolar-INT data format that facilitates parallel computing and supports\nsymmetric quantization, effectively reducing data redundancy. Building on this,\nwe implement an arbitrary precision matrix multiplication scheme that\ndecomposes and recovers matrices at the bit level, enabling flexible precision\nwhile maximizing GPU Tensor Core utilization. Furthermore, we develop an\nefficient matrix preprocessing method that optimizes data layout for subsequent\ncomputations. Finally, we design a data recovery-oriented memory management\nsystem that strategically utilizes fast shared memory, significantly enhancing\nkernel execution speed and minimizing memory access latency. Experimental\nresults demonstrate our approach's effectiveness, with up to 13\\times speedup\nin matrix multiplication compared to NVIDIA's CUTLASS. When integrated into\nLLMs, we achieve up to 6.7\\times inference acceleration. These improvements\nsignificantly enhance LLM inference efficiency, enabling broader and more\nresponsive applications of LLMs.\n","authors":["Shaobo Ma","Chao Fang","Haikuo Shao","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05208v3","updated":"2024-09-26T14:16:01Z","published":"2023-10-08T15:49:36Z","title":"ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot\n Coordination","summary":" Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement\nlearning (MARL) challenge that aims to train an ego agent to work with diverse,\nunseen partners during deployment. The significant difference between the\ndeployment-time partners' distribution and the training partners' distribution\ndetermined by the training algorithm makes ZSC a unique out-of-distribution\n(OOD) generalization challenge. The potential distribution gap between\nevaluation and deployment-time partners leads to inadequate evaluation, which\nis exacerbated by the lack of appropriate evaluation metrics. In this paper, we\npresent ZSC-Eval, the first evaluation toolkit and benchmark for ZSC\nalgorithms. ZSC-Eval consists of: 1) Generation of evaluation partner\ncandidates through behavior-preferring rewards to approximate deployment-time\npartners' distribution; 2) Selection of evaluation partners by Best-Response\nDiversity (BR-Div); 3) Measurement of generalization performance with various\nevaluation partners via the Best-Response Proximity (BR-Prox) metric. We use\nZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football\nenvironments and get novel empirical findings. We also conduct a human\nexperiment of current ZSC algorithms to verify the ZSC-Eval's consistency with\nhuman evaluation. ZSC-Eval is now available at\nhttps://github.com/sjtu-marl/ZSC-Eval.\n","authors":["Xihuai Wang","Shao Zhang","Wenhao Zhang","Wentao Dong","Jingxiao Chen","Ying Wen","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.05208v3.pdf","comment":"Accepted in NeurIPS 2024 Dataset and Benchmark Track"},{"id":"http://arxiv.org/abs/2409.17865v1","updated":"2024-09-26T14:15:54Z","published":"2024-09-26T14:15:54Z","title":"Implementing a Nordic-Baltic Federated Health Data Network: a case\n report","summary":" Background: Centralized collection and processing of healthcare data across\nnational borders pose significant challenges, including privacy concerns, data\nheterogeneity and legal barriers. To address some of these challenges, we\nformed an interdisciplinary consortium to develop a feder-ated health data\nnetwork, comprised of six institutions across five countries, to facilitate\nNordic-Baltic cooperation on secondary use of health data. The objective of\nthis report is to offer early insights into our experiences developing this\nnetwork. Methods: We used a mixed-method ap-proach, combining both experimental\ndesign and implementation science to evaluate the factors affecting the\nimplementation of our network. Results: Technically, our experiments indicate\nthat the network functions without significant performance degradation compared\nto centralized simu-lation. Conclusion: While use of interdisciplinary\napproaches holds a potential to solve challeng-es associated with establishing\nsuch collaborative networks, our findings turn the spotlight on the uncertain\nregulatory landscape playing catch up and the significant operational costs.\n","authors":["Taridzo Chomutare","Aleksandar Babic","Laura-Maria Peltonen","Silja Elunurm","Peter Lundberg","Arne Jönsson","Emma Eneling","Ciprian-Virgil Gerstenberger","Troels Siggaard","Raivo Kolde","Oskar Jerdhaf","Martin Hansson","Alexandra Makhlysheva","Miroslav Muzny","Erik Ylipää","Søren Brunak","Hercules Dalianis"],"pdf_url":"https://arxiv.org/pdf/2409.17865v1.pdf","comment":"24 pages (including appendices), 1 figure"},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17858v1","updated":"2024-09-26T14:05:32Z","published":"2024-09-26T14:05:32Z","title":"How Feature Learning Can Improve Neural Scaling Laws","summary":" We develop a solvable model of neural scaling laws beyond the kernel limit.\nTheoretical analysis of this model shows how performance scales with model\nsize, training time, and the total amount of available data. We identify three\nscaling regimes corresponding to varying task difficulties: hard, easy, and\nsuper easy tasks. For easy and super-easy target functions, which lie in the\nreproducing kernel Hilbert space (RKHS) defined by the initial infinite-width\nNeural Tangent Kernel (NTK), the scaling exponents remain unchanged between\nfeature learning and kernel regime models. For hard tasks, defined as those\noutside the RKHS of the initial NTK, we demonstrate both analytically and\nempirically that feature learning can improve scaling with training time and\ncompute, nearly doubling the exponent for hard tasks. This leads to a different\ncompute optimal strategy to scale parameters and training time in the feature\nlearning regime. We support our finding that feature learning improves the\nscaling law for hard tasks but not for easy and super-easy tasks with\nexperiments of nonlinear MLPs fitting functions with power-law Fourier spectra\non the circle and CNNs learning vision tasks.\n","authors":["Blake Bordelon","Alexander Atanasov","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2409.17858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17852v1","updated":"2024-09-26T13:58:06Z","published":"2024-09-26T13:58:06Z","title":"AMARO: All Heavy-Atom Transferable Neural Network Potentials of Protein\n Thermodynamics","summary":" All-atom molecular simulations offer detailed insights into macromolecular\nphenomena, but their substantial computational cost hinders the exploration of\ncomplex biological processes. We introduce Advanced Machine-learning Atomic\nRepresentation Omni-force-field (AMARO), a new neural network potential (NNP)\nthat combines an O(3)-equivariant message-passing neural network architecture,\nTensorNet, with a coarse-graining map that excludes hydrogen atoms. AMARO\ndemonstrates the feasibility of training coarser NNP, without prior energy\nterms, to run stable protein dynamics with scalability and generalization\ncapabilities.\n","authors":["Antonio Mirarchi","Raul P. Pelaez","Guillem Simeon","Gianni De Fabritiis"],"pdf_url":"https://arxiv.org/pdf/2409.17852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17841v1","updated":"2024-09-26T13:45:36Z","published":"2024-09-26T13:45:36Z","title":"Machine Learning-based vs Deep Learning-based Anomaly Detection in\n Multivariate Time Series for Spacecraft Attitude Sensors","summary":" In the framework of Failure Detection, Isolation and Recovery (FDIR) on\nspacecraft, new AI-based approaches are emerging in the state of the art to\novercome the limitations commonly imposed by traditional threshold checking.\n The present research aims at characterizing two different approaches to the\nproblem of stuck values detection in multivariate time series coming from\nspacecraft attitude sensors. The analysis reveals the performance differences\nin the two approaches, while commenting on their interpretability and\ngeneralization to different scenarios.\n","authors":["R. Gallon","F. Schiemenz","A. Krstova","A. Menicucci","E. Gill"],"pdf_url":"https://arxiv.org/pdf/2409.17841v1.pdf","comment":"Accepted for the ESA SPAICE Conference 2024"},{"id":"http://arxiv.org/abs/2409.17836v1","updated":"2024-09-26T13:38:33Z","published":"2024-09-26T13:38:33Z","title":"Language Models as Zero-shot Lossless Gradient Compressors: Towards\n General Neural Parameter Prior Models","summary":" Despite the widespread use of statistical prior models in various fields,\nsuch models for neural network gradients have long been overlooked. The\ninherent challenge stems from their high-dimensional structures and complex\ninterdependencies, which complicate effective modeling. In this work, we\ndemonstrate the potential of large language models (LLMs) to act as gradient\npriors in a zero-shot setting. We examine the property by considering lossless\ngradient compression -- a critical application in distributed learning -- that\ndepends heavily on precise probability modeling. To achieve this, we introduce\nLM-GC, a novel method that integrates LLMs with arithmetic coding. Our\ntechnique converts plain gradients into text-like formats, enhancing token\nefficiency by up to 38 times compared to their plain representations. We ensure\nthat this data conversion maintains a close alignment with the structure of\nplain gradients and the symbols commonly recognized by LLMs. Our experiments\nindicate that LM-GC surpasses existing state-of-the-art lossless compression\nmethods, improving compression rates by 10\\% up to 17.2\\% across various\ndatasets and architectures. Additionally, our approach shows promising\ncompatibility with lossy compression techniques such as quantization and\nsparsification. These findings highlight the significant potential of LLMs as a\nmodel for effectively handling gradients. We will release the source code upon\npublication.\n","authors":["Hui-Po Wang","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2409.17836v1.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17833v1","updated":"2024-09-26T13:35:42Z","published":"2024-09-26T13:35:42Z","title":"Ordinary Differential Equations for Enhanced 12-Lead ECG Generation","summary":" In the realm of artificial intelligence, the generation of realistic training\ndata for supervised learning tasks presents a significant challenge. This is\nparticularly true in the synthesis of electrocardiograms (ECGs), where the\nobjective is to develop a synthetic 12-lead ECG model. The primary complexity\nof this task stems from accurately modeling the intricate biological and\nphysiological interactions among different ECG leads. Although mathematical\nprocess simulators have shed light on these dynamics, effectively incorporating\nthis understanding into generative models is not straightforward. In this work,\nwe introduce an innovative method that employs ordinary differential equations\n(ODEs) to enhance the fidelity of generating 12-lead ECG data. This approach\nintegrates a system of ODEs that represent cardiac dynamics directly into the\ngenerative model's optimization process, allowing for the production of\nbiologically plausible ECG training data that authentically reflects real-world\nvariability and inter-lead dependencies. We conducted an empirical analysis of\nthousands of ECGs and found that incorporating cardiac simulation insights into\nthe data generation process significantly improves the accuracy of heart\nabnormality classifiers trained on this synthetic 12-lead ECG data.\n","authors":["Yakir Yehuda","Kira Radinsky"],"pdf_url":"https://arxiv.org/pdf/2409.17833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00551v2","updated":"2024-09-26T13:34:41Z","published":"2024-06-01T20:46:40Z","title":"Strategic Linear Contextual Bandits","summary":" Motivated by the phenomenon of strategic agents gaming a recommender system\nto maximize the number of times they are recommended to users, we study a\nstrategic variant of the linear contextual bandit problem, where the arms can\nstrategically misreport privately observed contexts to the learner. We treat\nthe algorithm design problem as one of mechanism design under uncertainty and\npropose the Optimistic Grim Trigger Mechanism (OptGTM) that incentivizes the\nagents (i.e., arms) to report their contexts truthfully while simultaneously\nminimizing regret. We also show that failing to account for the strategic\nnature of the agents results in linear regret. However, a trade-off between\nmechanism design and regret minimization appears to be unavoidable. More\nbroadly, this work aims to provide insight into the intersection of online\nlearning and mechanism design.\n","authors":["Thomas Kleine Buening","Aadirupa Saha","Christos Dimitrakakis","Haifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.00551v2.pdf","comment":"To appear at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.04769v2","updated":"2024-09-26T13:31:40Z","published":"2024-06-07T09:15:29Z","title":"Diffusion-based Generative Image Outpainting for Recovery of\n FOV-Truncated CT Images","summary":" Field-of-view (FOV) recovery of truncated chest CT scans is crucial for\naccurate body composition analysis, which involves quantifying skeletal muscle\nand subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables\ndisease prognostication. Here, we present a method for recovering truncated CT\nslices using generative image outpainting. We train a diffusion model and apply\nit to truncated CT slices generated by simulating a small FOV. Our model\nreliably recovers the truncated anatomy and outperforms the previous\nstate-of-the-art despite being trained on 87% less data.\n","authors":["Michelle Espranita Liman","Daniel Rueckert","Florian J. Fintelmann","Philip Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04769v2.pdf","comment":"Shared last authorship: Florian J. Fintelmann and Philip M\\\"uller"},{"id":"http://arxiv.org/abs/2409.17113v2","updated":"2024-09-26T13:30:51Z","published":"2024-09-25T17:27:02Z","title":"Characterizing stable regions in the residual stream of LLMs","summary":" We identify \"stable regions\" in the residual stream of Transformers, where\nthe model's output remains insensitive to small activation changes, but\nexhibits high sensitivity at region boundaries. These regions emerge during\ntraining and become more defined as training progresses or model size\nincreases. The regions appear to be much larger than previously studied\npolytopes. Our analysis suggests that these stable regions align with semantic\ndistinctions, where similar prompts cluster within regions, and activations\nfrom the same region lead to similar next token predictions. This work provides\na promising research direction for understanding the complexity of neural\nnetworks, shedding light on training dynamics, and advancing interpretability.\n","authors":["Jett Janiak","Jacek Karwowski","Chatrik Singh Mangat","Giorgi Giglemiani","Nora Petrova","Stefan Heimersheim"],"pdf_url":"https://arxiv.org/pdf/2409.17113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14372v2","updated":"2024-09-26T13:23:54Z","published":"2024-05-23T09:48:48Z","title":"Learning Constrained Markov Decision Processes With Non-stationary\n Rewards and Constraints","summary":" In constrained Markov decision processes (CMDPs) with adversarial rewards and\nconstraints, a well-known impossibility result prevents any algorithm from\nattaining both sublinear regret and sublinear constraint violation, when\ncompeting against a best-in-hindsight policy that satisfies constraints on\naverage. In this paper, we show that this negative result can be eased in CMDPs\nwith non-stationary rewards and constraints, by providing algorithms whose\nperformances smoothly degrade as non-stationarity increases. Specifically, we\npropose algorithms attaining $\\tilde{\\mathcal{O}} (\\sqrt{T} + C)$ regret and\npositive constraint violation under bandit feedback, where $C$ is a corruption\nvalue measuring the environment non-stationarity. This can be $\\Theta(T)$ in\nthe worst case, coherently with the impossibility result for adversarial CMDPs.\nFirst, we design an algorithm with the desired guarantees when $C$ is known.\nThen, in the case $C$ is unknown, we show how to obtain the same results by\nembedding such an algorithm in a general meta-procedure. This is of independent\ninterest, as it can be applied to any non-stationary constrained online\nlearning setting.\n","authors":["Francesco Emanuele Stradi","Anna Lunghi","Matteo Castiglioni","Alberto Marchesi","Nicola Gatti"],"pdf_url":"https://arxiv.org/pdf/2405.14372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17825v1","updated":"2024-09-26T13:22:22Z","published":"2024-09-26T13:22:22Z","title":"Physics-aligned Schrödinger bridge","summary":" The reconstruction of physical fields from sparse measurements is pivotal in\nboth scientific research and engineering applications. Traditional methods are\nincreasingly supplemented by deep learning models due to their efficacy in\nextracting features from data. However, except for the low accuracy on complex\nphysical systems, these models often fail to comply with essential physical\nconstraints, such as governing equations and boundary conditions. To overcome\nthis limitation, we introduce a novel data-driven field reconstruction\nframework, termed the Physics-aligned Schr\\\"{o}dinger Bridge (PalSB). This\nframework leverages a diffusion Schr\\\"{o}dinger bridge mechanism that is\nspecifically tailored to align with physical constraints. The PalSB approach\nincorporates a dual-stage training process designed to address both local\nreconstruction mapping and global physical principles. Additionally, a\nboundary-aware sampling technique is implemented to ensure adherence to\nphysical boundary conditions. We demonstrate the effectiveness of PalSB through\nits application to three complex nonlinear systems: cylinder flow from Particle\nImage Velocimetry experiments, two-dimensional turbulence, and a\nreaction-diffusion system. The results reveal that PalSB not only achieves\nhigher accuracy but also exhibits enhanced compliance with physical constraints\ncompared to existing methods. This highlights PalSB's capability to generate\nhigh-quality representations of intricate physical interactions, showcasing its\npotential for advancing field reconstruction techniques.\n","authors":["Zeyu Li","Hongkun Dou","Shen Fang","Wang Han","Yue Deng","Lijun Yang"],"pdf_url":"https://arxiv.org/pdf/2409.17825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17808v1","updated":"2024-09-26T13:02:28Z","published":"2024-09-26T13:02:28Z","title":"Generative Modeling of Molecular Dynamics Trajectories","summary":" Molecular dynamics (MD) is a powerful technique for studying microscopic\nphenomena, but its computational cost has driven significant interest in the\ndevelopment of deep learning-based surrogate models. We introduce generative\nmodeling of molecular trajectories as a paradigm for learning flexible\nmulti-task surrogate models of MD from data. By conditioning on appropriately\nchosen frames of the trajectory, we show such generative models can be adapted\nto diverse tasks such as forward simulation, transition path sampling, and\ntrajectory upsampling. By alternatively conditioning on part of the molecular\nsystem and inpainting the rest, we also demonstrate the first steps towards\ndynamics-conditioned molecular design. We validate the full set of these\ncapabilities on tetrapeptide simulations and show that our model can produce\nreasonable ensembles of protein monomers. Altogether, our work illustrates how\ngenerative modeling can unlock value from MD data towards diverse downstream\ntasks that are not straightforward to address with existing methods or even MD\nitself. Code is available at https://github.com/bjing2016/mdgen.\n","authors":["Bowen Jing","Hannes Stärk","Tommi Jaakkola","Bonnie Berger"],"pdf_url":"https://arxiv.org/pdf/2409.17808v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17806v1","updated":"2024-09-26T12:59:09Z","published":"2024-09-26T12:59:09Z","title":"Continual learning with task specialist","summary":" Continual learning (CL) adapt the deep learning scenarios with timely updated\ndatasets. However, existing CL models suffer from the catastrophic forgetting\nissue, where new knowledge replaces past learning. In this paper, we propose\nContinual Learning with Task Specialists (CLTS) to address the issues of\ncatastrophic forgetting and limited labelled data in real-world datasets by\nperforming class incremental learning of the incoming stream of data. The model\nconsists of Task Specialists (T S) and Task Predictor (T P ) with pre-trained\nStable Diffusion (SD) module. Here, we introduce a new specialist to handle a\nnew task sequence and each T S has three blocks; i) a variational autoencoder\n(V AE) to learn the task distribution in a low dimensional latent space, ii) a\nK-Means block to perform data clustering and iii) Bootstrapping Language-Image\nPre-training (BLIP ) model to generate a small batch of captions from the input\ndata. These captions are fed as input to the pre-trained stable diffusion model\n(SD) for the generation of task samples. The proposed model does not store any\ntask samples for replay, instead uses generated samples from SD to train the T\nP module. A comparison study with four SOTA models conducted on three\nreal-world datasets shows that the proposed model outperforms all the selected\nbaselines\n","authors":["Indu Solomon","Aye Phyu Phyu Aung","Uttam Kumar","Senthilnath Jayavelu"],"pdf_url":"https://arxiv.org/pdf/2409.17806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17804v1","updated":"2024-09-26T12:57:47Z","published":"2024-09-26T12:57:47Z","title":"Enriched Functional Tree-Based Classifiers: A Novel Approach Leveraging\n Derivatives and Geometric Features","summary":" The positioning of this research falls within the scalar-on-function\nclassification literature, a field of significant interest across various\ndomains, particularly in statistics, mathematics, and computer science. This\nstudy introduces an advanced methodology for supervised classification by\nintegrating Functional Data Analysis (FDA) with tree-based ensemble techniques\nfor classifying high-dimensional time series. The proposed framework, Enriched\nFunctional Tree-Based Classifiers (EFTCs), leverages derivative and geometric\nfeatures, benefiting from the diversity inherent in ensemble methods to further\nenhance predictive performance and reduce variance. While our approach has been\ntested on the enrichment of Functional Classification Trees (FCTs), Functional\nK-NN (FKNN), Functional Random Forest (FRF), Functional XGBoost (FXGB), and\nFunctional LightGBM (FLGBM), it could be extended to other tree-based and\nnon-tree-based classifiers, with appropriate considerations emerging from this\ninvestigation. Through extensive experimental evaluations on seven real-world\ndatasets and six simulated scenarios, this proposal demonstrates fascinating\nimprovements over traditional approaches, providing new insights into the\napplication of FDA in complex, high-dimensional learning problems.\n","authors":["Fabrizio Maturo","Annamaria Porreca"],"pdf_url":"https://arxiv.org/pdf/2409.17804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10615v2","updated":"2024-09-26T12:55:43Z","published":"2024-06-15T12:27:35Z","title":"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation","summary":" Given the high cost of collecting robotic data in the real world, sample\nefficiency is a consistently compelling pursuit in robotics. In this paper, we\nintroduce SGRv2, an imitation learning framework that enhances sample\nefficiency through improved visual and action representations. Central to the\ndesign of SGRv2 is the incorporation of a critical inductive bias-action\nlocality, which posits that robot's actions are predominantly influenced by the\ntarget object and its interactions with the local environment. Extensive\nexperiments in both simulated and real-world settings demonstrate that action\nlocality is essential for boosting sample efficiency. SGRv2 excels in RLBench\ntasks with keyframe control using merely 5 demonstrations and surpasses the RVT\nbaseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and\nMimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR.\nIn real-world environments, with only eight demonstrations, SGRv2 can perform a\nvariety of tasks at a markedly higher success rate compared to baseline models.\nProject website: http://sgrv2-robot.github.io\n","authors":["Tong Zhang","Yingdong Hu","Jiacheng You","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10615v2.pdf","comment":"CoRL 2024. Project website: http://sgrv2-robot.github.io"},{"id":"http://arxiv.org/abs/2409.17790v1","updated":"2024-09-26T12:37:22Z","published":"2024-09-26T12:37:22Z","title":"CASPFormer: Trajectory Prediction from BEV Images with Deformable\n Attention","summary":" Motion prediction is an important aspect for Autonomous Driving (AD) and\nAdvance Driver Assistance Systems (ADAS). Current state-of-the-art motion\nprediction methods rely on High Definition (HD) maps for capturing the\nsurrounding context of the ego vehicle. Such systems lack scalability in\nreal-world deployment as HD maps are expensive to produce and update in\nreal-time. To overcome this issue, we propose Context Aware Scene Prediction\nTransformer (CASPFormer), which can perform multi-modal motion prediction from\nrasterized Bird-Eye-View (BEV) images. Our system can be integrated with any\nupstream perception module that is capable of generating BEV images. Moreover,\nCASPFormer directly decodes vectorized trajectories without any postprocessing.\nTrajectories are decoded recurrently using deformable attention, as it is\ncomputationally efficient and provides the network with the ability to focus\nits attention on the important spatial locations of the BEV images. In\naddition, we also address the issue of mode collapse for generating multiple\nscene-consistent trajectories by incorporating learnable mode queries. We\nevaluate our model on the nuScenes dataset and show that it reaches\nstate-of-the-art across multiple metrics\n","authors":["Harsh Yadav","Maximilian Schaefer","Kun Zhao","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.17790v1.pdf","comment":"Under Review at ICPR 2024, Kolkata"},{"id":"http://arxiv.org/abs/2409.06364v2","updated":"2024-09-26T12:33:46Z","published":"2024-09-10T09:42:58Z","title":"What happens to diffusion model likelihood when your model is\n conditional?","summary":" Diffusion Models (DMs) iteratively denoise random samples to produce\nhigh-quality data. The iterative sampling process is derived from Stochastic\nDifferential Equations (SDEs), allowing a speed-quality trade-off chosen at\ninference. Another advantage of sampling with differential equations is exact\nlikelihood computation. These likelihoods have been used to rank unconditional\nDMs and for out-of-domain classification. Despite the many existing and\npossible uses of DM likelihoods, the distinct properties captured are unknown,\nespecially in conditional contexts such as Text-To-Image (TTI) or\nText-To-Speech synthesis (TTS). Surprisingly, we find that TTS DM likelihoods\nare agnostic to the text input. TTI likelihood is more expressive but cannot\ndiscern confounding prompts. Our results show that applying DMs to conditional\ntasks reveals inconsistencies and strengthens claims that the properties of DM\nlikelihood are unknown. This impact sheds light on the previously unknown\nnature of DM likelihoods. Although conditional DMs maximise likelihood, the\nlikelihood in question is not as sensitive to the conditioning input as one\nexpects. This investigation provides a new point-of-view on diffusion\nlikelihoods.\n","authors":["Mattias Cross","Anton Ragni"],"pdf_url":"https://arxiv.org/pdf/2409.06364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14590v2","updated":"2024-09-26T12:29:45Z","published":"2024-09-22T20:47:04Z","title":"Explainable AI needs formal notions of explanation correctness","summary":" The use of machine learning (ML) in critical domains such as medicine poses\nrisks and requires regulation. One requirement is that decisions of ML systems\nin high-risk applications should be human-understandable. The field of\n\"explainable artificial intelligence\" (XAI) seemingly addresses this need.\nHowever, in its current form, XAI is unfit to provide quality control for ML;\nit itself needs scrutiny. Popular XAI methods cannot reliably answer important\nquestions about ML models, their training data, or a given test input. We\nrecapitulate results demonstrating that popular XAI methods systematically\nattribute importance to input features that are independent of the prediction\ntarget. This limits their utility for purposes such as model and data\n(in)validation, model improvement, and scientific discovery. We argue that the\nfundamental reason for this limitation is that current XAI methods do not\naddress well-defined problems and are not evaluated against objective criteria\nof explanation correctness. Researchers should formally define the problems\nthey intend to solve first and then design methods accordingly. This will lead\nto notions of explanation correctness that can be theoretically verified and\nobjective metrics of explanation performance that can be assessed using\nground-truth data.\n","authors":["Stefan Haufe","Rick Wilming","Benedict Clark","Rustam Zhumagambetov","Danny Panknin","Ahcène Boubekki"],"pdf_url":"https://arxiv.org/pdf/2409.14590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17786v1","updated":"2024-09-26T12:29:13Z","published":"2024-09-26T12:29:13Z","title":"Predicting the Stay Length of Patients in Hospitals using Convolutional\n Gated Recurrent Deep Learning Model","summary":" Predicting hospital length of stay (LoS) stands as a critical factor in\nshaping public health strategies. This data serves as a cornerstone for\ngovernments to discern trends, patterns, and avenues for enhancing healthcare\ndelivery. In this study, we introduce a robust hybrid deep learning model, a\ncombination of Multi-layer Convolutional (CNNs) deep learning, Gated Recurrent\nUnits (GRU), and Dense neural networks, that outperforms 11 conventional and\nstate-of-the-art Machine Learning (ML) and Deep Learning (DL) methodologies in\naccurately forecasting inpatient hospital stay duration. Our investigation\ndelves into the implementation of this hybrid model, scrutinising variables\nlike geographic indicators tied to caregiving institutions, demographic markers\nencompassing patient ethnicity, race, and age, as well as medical attributes\nsuch as the CCS diagnosis code, APR DRG code, illness severity metrics, and\nhospital stay duration. Statistical evaluations reveal the pinnacle LoS\naccuracy achieved by our proposed model (CNN-GRU-DNN), which averages at 89%\nacross a 10-fold cross-validation test, surpassing LSTM, BiLSTM, GRU, and\nConvolutional Neural Networks (CNNs) by 19%, 18.2%, 18.6%, and 7%,\nrespectively. Accurate LoS predictions not only empower hospitals to optimise\nresource allocation and curb expenses associated with prolonged stays but also\npave the way for novel strategies in hospital stay management. This avenue\nholds promise for catalysing advancements in healthcare research and\ninnovation, inspiring a new era of precision-driven healthcare practices.\n","authors":["Mehdi Neshat","Michael Phipps","Chris A. Browne","Nicole T. Vargas","Seyedali Mirjalili"],"pdf_url":"https://arxiv.org/pdf/2409.17786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08757v4","updated":"2024-09-26T12:17:24Z","published":"2024-03-13T17:55:34Z","title":"Efficient Combinatorial Optimization via Heat Diffusion","summary":" Combinatorial optimization problems are widespread but inherently challenging\ndue to their discrete nature. The primary limitation of existing methods is\nthat they can only access a small fraction of the solution space at each\niteration, resulting in limited efficiency for searching the global optimal. To\novercome this challenge, diverging from conventional efforts of expanding the\nsolver's search scope, we focus on enabling information to actively propagate\nto the solver through heat diffusion. By transforming the target function while\npreserving its optima, heat diffusion facilitates information flow from distant\nregions to the solver, providing more efficient navigation. Utilizing heat\ndiffusion, we propose a framework for solving general combinatorial\noptimization problems. The proposed methodology demonstrates superior\nperformance across a range of the most challenging and widely encountered\ncombinatorial optimizations. Echoing recent advancements in harnessing\nthermodynamics for generative artificial intelligence, our study further\nreveals its significant potential in advancing combinatorial optimization.\n","authors":["Hengyuan Ma","Wenlian Lu","Jianfeng Feng"],"pdf_url":"https://arxiv.org/pdf/2403.08757v4.pdf","comment":"After the rebuttal version for NeurIPS 2024 (poster). Code is\n available in https://github.com/AwakerMhy/HeO"},{"id":"http://arxiv.org/abs/2309.16928v3","updated":"2024-09-26T12:09:22Z","published":"2023-09-29T02:04:24Z","title":"Learning to Receive Help: Intervention-Aware Concept Embedding Models","summary":" Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures\nby constructing and explaining their predictions using a set of high-level\nconcepts. A special property of these models is that they permit concept\ninterventions, wherein users can correct mispredicted concepts and thus improve\nthe model's performance. Recent work, however, has shown that intervention\nefficacy can be highly dependent on the order in which concepts are intervened\non and on the model's architecture and training hyperparameters. We argue that\nthis is rooted in a CBM's lack of train-time incentives for the model to be\nappropriately receptive to concept interventions. To address this, we propose\nIntervention-aware Concept Embedding models (IntCEMs), a novel CBM-based\narchitecture and training paradigm that improves a model's receptiveness to\ntest-time interventions. Our model learns a concept intervention policy in an\nend-to-end fashion from where it can sample meaningful intervention\ntrajectories at train-time. This conditions IntCEMs to effectively select and\nreceive concept interventions when deployed at test-time. Our experiments show\nthat IntCEMs significantly outperform state-of-the-art concept-interpretable\nmodels when provided with test-time concept interventions, demonstrating the\neffectiveness of our approach.\n","authors":["Mateo Espinosa Zarlenga","Katherine M. Collins","Krishnamurthy Dvijotham","Adrian Weller","Zohreh Shams","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2309.16928v3.pdf","comment":"Accepted as a spotlight at the Thirty-seventh Conference on Neural\n Information Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2409.17763v1","updated":"2024-09-26T11:58:41Z","published":"2024-09-26T11:58:41Z","title":"Confidence intervals uncovered: Are we ready for real-world medical\n imaging AI?","summary":" Medical imaging is spearheading the AI transformation of healthcare.\nPerformance reporting is key to determine which methods should be translated\ninto clinical practice. Frequently, broad conclusions are simply derived from\nmean performance values. In this paper, we argue that this common practice is\noften a misleading simplification as it ignores performance variability. Our\ncontribution is threefold. (1) Analyzing all MICCAI segmentation papers (n =\n221) published in 2023, we first observe that more than 50\\% of papers do not\nassess performance variability at all. Moreover, only one (0.5\\%) paper\nreported confidence intervals (CIs) for model performance. (2) To address the\nreporting bottleneck, we show that the unreported standard deviation (SD) in\nsegmentation papers can be approximated by a second-order polynomial function\nof the mean Dice similarity coefficient (DSC). Based on external validation\ndata from 56 previous MICCAI challenges, we demonstrate that this approximation\ncan accurately reconstruct the CI of a method using information provided in\npublications. (3) Finally, we reconstructed 95\\% CIs around the mean DSC of\nMICCAI 2023 segmentation papers. The median CI width was 0.03 which is three\ntimes larger than the median performance gap between the first and second\nranked method. For more than 60\\% of papers, the mean performance of the\nsecond-ranked method was within the CI of the first-ranked method. We conclude\nthat current publications typically do not provide sufficient evidence to\nsupport which models could potentially be translated into clinical practice.\n","authors":["Evangelia Christodoulou","Annika Reinke","Rola Houhou","Piotr Kalinowski","Selen Erkan","Carole H. Sudre","Ninon Burgos","Sofiène Boutaj","Sophie Loizillon","Maëlys Solal","Nicola Rieke","Veronika Cheplygina","Michela Antonelli","Leon D. Mayer","Minu D. Tizabi","M. Jorge Cardoso","Amber Simpson","Paul F. Jäger","Annette Kopp-Schneider","Gaël Varoquaux","Olivier Colliot","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2409.17763v1.pdf","comment":"Paper accepted at MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2408.10672v2","updated":"2024-09-26T11:42:31Z","published":"2024-08-20T09:17:11Z","title":"Neural Exploratory Landscape Analysis","summary":" Recent research in Meta-Black-Box Optimization (MetaBBO) have shown that\nmeta-trained neural networks can effectively guide the design of black-box\noptimizers, significantly reducing the need for expert tuning and delivering\nrobust performance across complex problem distributions. Despite their success,\na paradox remains: MetaBBO still rely on human-crafted Exploratory Landscape\nAnalysis features to inform the meta-level agent about the low-level\noptimization progress. To address the gap, this paper proposes Neural\nExploratory Landscape Analysis (NeurELA), a novel framework that dynamically\nprofiles landscape features through a two-stage, attention-based neural\nnetwork, executed in an entirely end-to-end fashion. NeurELA is pre-trained\nover a variety of MetaBBO algorithms using a multi-task neuroevolution\nstrategy. Extensive experiments show that NeurELA achieves consistently\nsuperior performance when integrated into different and even unseen MetaBBO\ntasks and can be efficiently fine-tuned for further performance boost. This\nadvancement marks a pivotal step in making MetaBBO algorithms more autonomous\nand broadly applicable.The source code of NeurELA can be accessed at\nhttps://anonymous.4open.science/r/Neur-ELA-303C.\n","authors":["Zeyuan Ma","Jiacheng Chen","Hongshu Guo","Yue-Jiao Gong"],"pdf_url":"https://arxiv.org/pdf/2408.10672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2001.07495v5","updated":"2024-09-26T11:42:25Z","published":"2020-01-21T13:05:31Z","title":"Unsupervisedly Learned Representations: Should the Quest be Over?","summary":" After four decades of research there still exists a Classification accuracy\ngap of about 20% between our best Unsupervisedly Learned Representations\nmethods and the accuracy rates achieved by intelligent animals. It thus may\nwell be that we are looking in the wrong direction. A possible solution to this\npuzzle is presented. We demonstrate that Reinforcement Learning can learn\nrepresentations which achieve the same accuracy as that of animals. Our main\nmodest contribution lies in the observations that: a. when applied to a real\nworld environment Reinforcement Learning does not require labels, and thus may\nbe legitimately considered as Unsupervised Learning, and b. in contrast, when\nReinforcement Learning is applied in a simulated environment it does inherently\nrequire labels and should thus be generally be considered as Supervised\nLearning. The corollary of these observations is that further search for\nUnsupervised Learning competitive paradigms which may be trained in simulated\nenvironments may be futile.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2001.07495v5.pdf","comment":"To be published at The 6th International Conference on Machine\n Learning, Optimization and Data Science - LOD 2020"},{"id":"http://arxiv.org/abs/2409.17754v1","updated":"2024-09-26T11:36:08Z","published":"2024-09-26T11:36:08Z","title":"Byzantine-Robust Aggregation for Securing Decentralized Federated\n Learning","summary":" Federated Learning (FL) emerges as a distributed machine learning approach\nthat addresses privacy concerns by training AI models locally on devices.\nDecentralized Federated Learning (DFL) extends the FL paradigm by eliminating\nthe central server, thereby enhancing scalability and robustness through the\navoidance of a single point of failure. However, DFL faces significant\nchallenges in optimizing security, as most Byzantine-robust algorithms proposed\nin the literature are designed for centralized scenarios. In this paper, we\npresent a novel Byzantine-robust aggregation algorithm to enhance the security\nof Decentralized Federated Learning environments, coined WFAgg. This proposal\nhandles the adverse conditions and strength robustness of dynamic decentralized\ntopologies at the same time by employing multiple filters to identify and\nmitigate Byzantine attacks. Experimental results demonstrate the effectiveness\nof the proposed algorithm in maintaining model accuracy and convergence in the\npresence of various Byzantine attack scenarios, outperforming state-of-the-art\ncentralized Byzantine-robust aggregation schemes (such as Multi-Krum or\nClustering). These algorithms are evaluated on an IID image classification\nproblem in both centralized and decentralized scenarios.\n","authors":["Diego Cajaraville-Aboy","Ana Fernández-Vilas","Rebeca P. Díaz-Redondo","Manuel Fernández-Veiga"],"pdf_url":"https://arxiv.org/pdf/2409.17754v1.pdf","comment":"18 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.17745v1","updated":"2024-09-26T11:19:09Z","published":"2024-09-26T11:19:09Z","title":"Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval\n Model","summary":" A supervised ranking model, despite its advantage of being effective, usually\ninvolves complex processing - typically multiple stages of task-specific\npre-training and fine-tuning. This has motivated researchers to explore simpler\npipelines leveraging large language models (LLMs) that are capable of working\nin a zero-shot manner. However, since zero-shot inference does not make use of\na training set of pairs of queries and their relevant documents, its\nperformance is mostly worse than that of supervised models, which are trained\non such example pairs. Motivated by the existing findings that training\nexamples generally improve zero-shot performance, in our work, we explore if\nthis also applies to ranking models. More specifically, given a query and a\npair of documents, the preference prediction task is improved by augmenting\nexamples of preferences for similar queries from a training set. Our proposed\npairwise few-shot ranker demonstrates consistent improvements over the\nzero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset)\nretrieval benchmarks. Our method also achieves a close performance to that of a\nsupervised model without requiring any complex training pipeline.\n","authors":["Nilanjan Sinhababu","Andrew Parry","Debasis Ganguly","Debasis Samanta","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.17745v1.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.17730v1","updated":"2024-09-26T11:00:19Z","published":"2024-09-26T11:00:19Z","title":"Autoregressive Generation Strategies for Top-K Sequential\n Recommendations","summary":" The goal of modern sequential recommender systems is often formulated in\nterms of next-item prediction. In this paper, we explore the applicability of\ngenerative transformer-based models for the Top-K sequential recommendation\ntask, where the goal is to predict items a user is likely to interact with in\nthe \"near future\".\n We explore commonly used autoregressive generation strategies, including\ngreedy decoding, beam search, and temperature sampling, to evaluate their\nperformance for the Top-K sequential recommendation task. In addition, we\npropose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA)\ngeneration strategies based on multi-sequence generation with temperature\nsampling and subsequent aggregation.\n Experiments on diverse datasets give valuable insights regarding commonly\nused strategies' applicability and show that suggested approaches improve\nperformance on longer time horizons compared to widely-used Top-K prediction\napproach and single-sequence autoregressive generation strategies.\n","authors":["Anna Volodkevich","Danil Gusak","Anton Klenitskiy","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2409.17730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17726v1","updated":"2024-09-26T10:56:27Z","published":"2024-09-26T10:56:27Z","title":"Recent advances in interpretable machine learning using structure-based\n protein representations","summary":" Recent advancements in machine learning (ML) are transforming the field of\nstructural biology. For example, AlphaFold, a groundbreaking neural network for\nprotein structure prediction, has been widely adopted by researchers. The\navailability of easy-to-use interfaces and interpretable outcomes from the\nneural network architecture, such as the confidence scores used to color the\npredicted structures, have made AlphaFold accessible even to non-ML experts. In\nthis paper, we present various methods for representing protein 3D structures\nfrom low- to high-resolution, and show how interpretable ML methods can support\ntasks such as predicting protein structures, protein function, and\nprotein-protein interactions. This survey also emphasizes the significance of\ninterpreting and visualizing ML-based inference for structure-based protein\nrepresentations that enhance interpretability and knowledge discovery.\nDeveloping such interpretable approaches promises to further accelerate fields\nincluding drug development and protein design.\n","authors":["Luiz Felipe Vecchietti","Minji Lee","Begench Hangeldiyev","Hyunkyu Jung","Hahnbeom Park","Tae-Kyun Kim","Meeyoung Cha","Ho Min Kim"],"pdf_url":"https://arxiv.org/pdf/2409.17726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17716v1","updated":"2024-09-26T10:38:35Z","published":"2024-09-26T10:38:35Z","title":"QuForge: A Library for Qudits Simulation","summary":" Quantum computing with qudits, an extension of qubits to multiple levels, is\na research field less mature than qubit-based quantum computing. However,\nqudits can offer some advantages over qubits, by representing information with\nfewer separated components. In this article, we present QuForge, a Python-based\nlibrary designed to simulate quantum circuits with qudits. This library\nprovides the necessary quantum gates for implementing quantum algorithms,\ntailored to any chosen qudit dimension. Built on top of differentiable\nframeworks, QuForge supports execution on accelerating devices such as GPUs and\nTPUs, significantly speeding up simulations. It also supports sparse\noperations, leading to a reduction in memory consumption compared to other\nlibraries. Additionally, by constructing quantum circuits as differentiable\ngraphs, QuForge facilitates the implementation of quantum machine learning\nalgorithms, enhancing the capabilities and flexibility of quantum computing\nresearch.\n","authors":["Tiago de Souza Farias","Lucas Friedrich","Jonas Maziero"],"pdf_url":"https://arxiv.org/pdf/2409.17716v1.pdf","comment":"18 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.17711v1","updated":"2024-09-26T10:27:19Z","published":"2024-09-26T10:27:19Z","title":"Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation","summary":" News recommendation is a challenging task that involves personalization based\non the interaction history and preferences of each user. Recent works have\nleveraged the power of pretrained language models (PLMs) to directly rank news\nitems by using inference approaches that predominately fall into three\ncategories: pointwise, pairwise, and listwise learning-to-rank. While pointwise\nmethods offer linear inference complexity, they fail to capture crucial\ncomparative information between items that is more effective for ranking tasks.\nConversely, pairwise and listwise approaches excel at incorporating these\ncomparisons but suffer from practical limitations: pairwise approaches are\neither computationally expensive or lack theoretical guarantees, and listwise\nmethods often perform poorly in practice. In this paper, we propose a novel\nframework for PLM-based news recommendation that integrates both pointwise\nrelevance prediction and pairwise comparisons in a scalable manner. We present\na rigorous theoretical analysis of our framework, establishing conditions under\nwhich our approach guarantees improved performance. Extensive experiments show\nthat our approach outperforms the state-of-the-art methods on the MIND and\nAdressa news recommendation datasets.\n","authors":["Nithish Kannen","Yao Ma","Gerrit J. J. van den Burg","Jean Baptiste Faddoul"],"pdf_url":"https://arxiv.org/pdf/2409.17711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15600v2","updated":"2024-09-26T10:26:18Z","published":"2024-08-28T07:48:39Z","title":"Exploring Selective Layer Fine-Tuning in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for fine-tuning\nfoundation models using distributed data in a privacy-preserving manner. Under\nlimited computational resources, clients often find it more practical to\nfine-tune a selected subset of layers, rather than the entire model, based on\ntheir task-specific data. In this study, we provide a thorough theoretical\nexploration of selective layer fine-tuning in FL, emphasizing a flexible\napproach that allows the clients to adjust their selected layers according to\ntheir local data and resources. We theoretically demonstrate that the layer\nselection strategy has a significant impact on model convergence in two\ncritical aspects: the importance of selected layers and the heterogeneous\nchoices across clients. Drawing from these insights, we further propose a\nstrategic layer selection method that utilizes local gradients and regulates\nlayer selections across clients. The extensive experiments on both image and\ntext datasets demonstrate the effectiveness of the proposed strategy compared\nwith several baselines, highlighting its advances in identifying critical\nlayers that adapt to the client heterogeneity and training dynamics in FL.\n","authors":["Yuchang Sun","Yuexiang Xie","Bolin Ding","Yaliang Li","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15953v2","updated":"2024-09-26T10:22:34Z","published":"2024-08-28T17:12:01Z","title":"Modeling and Analyzing the Influence of Non-Item Pages on Sequential\n Next-Item Prediction","summary":" Analyzing sequences of interactions between users and items, sequential\nrecommendation models can learn user intent and make predictions about the next\nitem. Next to item interactions, most systems also have interactions with what\nwe call non-item pages: these pages are not related to specific items but still\ncan provide insights of the user's interests, as, for example, navigation\npages.\n We therefore propose a general way to include these non-item pages in\nsequential recommendation models to enhance next-item prediction. First, we\ndemonstrate the influence of non-item pages on following interactions with the\nhypotheses testing framework HypTrails and propose methods for representing\nnon-item pages in sequential recommendation models. Subsequently, we adapt\npopular sequential recommender models to integrate non-item pages and\ninvestigate their performance with different item representation strategies as\nwell as their ability to handle noisy data. To show the general capabilities of\nthe models to integrate non-item pages, we create a synthetic dataset for a\ncontrolled setting and then evaluate the improvements from including non-item\npages on two real-world datasets.\n Our results show that non-item pages are a valuable source of information,\nand incorporating them in sequential recommendation models increases the\nperformance of next-item prediction across all analyzed model architectures.\n","authors":["Elisabeth Fischer","Albin Zehe","Andreas Hotho","Daniel Schlör"],"pdf_url":"https://arxiv.org/pdf/2408.15953v2.pdf","comment":"37 pages, 19 figures; Submitted to ACM TORS"},{"id":"http://arxiv.org/abs/2407.14788v2","updated":"2024-09-26T10:21:33Z","published":"2024-07-20T07:39:07Z","title":"On the Design and Analysis of LLM-Based Algorithms","summary":" We initiate a formal investigation into the design and analysis of LLM-based\nalgorithms, i.e. algorithms that contain one or multiple calls of large\nlanguage models (LLMs) as sub-routines and critically rely on the capabilities\nof LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt\nengineering to complicated LLM-powered agent systems and compound AI systems,\nhave achieved remarkable empirical success, the design and optimization of them\nhave mostly relied on heuristics and trial-and-errors, which is largely due to\na lack of formal and analytical study for these algorithms. To fill this gap,\nwe start by identifying the computational-graph representation of LLM-based\nalgorithms, the design principle of task decomposition, and some key\nabstractions, which then facilitate our formal analysis for the accuracy and\nefficiency of LLM-based algorithms, despite the black-box nature of LLMs.\nThrough extensive analytical and empirical investigation in a series of case\nstudies, we demonstrate that the proposed framework is broadly applicable to a\nwide range of scenarios and diverse patterns of LLM-based algorithms, such as\nparallel, hierarchical and recursive task decomposition. Our proposed framework\nholds promise for advancing LLM-based algorithms, by revealing the reasons\nbehind curious empirical phenomena, guiding the choices of hyperparameters,\npredicting the empirical performance of algorithms, and inspiring new algorithm\ndesign. To promote further study of LLM-based algorithms, we release our source\ncode at\nhttps://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm.\n","authors":["Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.14788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17704v1","updated":"2024-09-26T10:20:59Z","published":"2024-09-26T10:20:59Z","title":"Transfer Learning in $\\ell_1$ Regularized Regression: Hyperparameter\n Selection Strategy based on Sharp Asymptotic Analysis","summary":" Transfer learning techniques aim to leverage information from multiple\nrelated datasets to enhance prediction quality against a target dataset. Such\nmethods have been adopted in the context of high-dimensional sparse regression,\nand some Lasso-based algorithms have been invented: Trans-Lasso and Pretraining\nLasso are such examples. These algorithms require the statistician to select\nhyperparameters that control the extent and type of information transfer from\nrelated datasets. However, selection strategies for these hyperparameters, as\nwell as the impact of these choices on the algorithm's performance, have been\nlargely unexplored. To address this, we conduct a thorough, precise study of\nthe algorithm in a high-dimensional setting via an asymptotic analysis using\nthe replica method. Our approach reveals a surprisingly simple behavior of the\nalgorithm: Ignoring one of the two types of information transferred to the\nfine-tuning stage has little effect on generalization performance, implying\nthat efforts for hyperparameter selection can be significantly reduced. Our\ntheoretical findings are also empirically supported by real-world applications\non the IMDb dataset.\n","authors":["Koki Okajima","Tomoyuki Obuchi"],"pdf_url":"https://arxiv.org/pdf/2409.17704v1.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.17703v1","updated":"2024-09-26T10:20:25Z","published":"2024-09-26T10:20:25Z","title":"PGN: The RNN's New Successor is Effective for Long-Range Time Series\n Forecasting","summary":" Due to the recurrent structure of RNN, the long information propagation path\nposes limitations in capturing long-term dependencies, gradient\nexplosion/vanishing issues, and inefficient sequential execution. Based on\nthis, we propose a novel paradigm called Parallel Gated Network (PGN) as the\nnew successor to RNN. PGN directly captures information from previous time\nsteps through the designed Historical Information Extraction (HIE) layer and\nleverages gated mechanisms to select and fuse it with the current time step\ninformation. This reduces the information propagation path to $\\mathcal{O}(1)$,\neffectively addressing the limitations of RNN. To enhance PGN's performance in\nlong-range time series forecasting tasks, we propose a novel temporal modeling\nframework called Temporal PGN (TPGN). TPGN incorporates two branches to\ncomprehensively capture the semantic information of time series. One branch\nutilizes PGN to capture long-term periodic patterns while preserving their\nlocal characteristics. The other branch employs patches to capture short-term\ninformation and aggregate the global representation of the series. TPGN\nachieves a theoretical complexity of $\\mathcal{O}(\\sqrt{L})$, ensuring\nefficiency in its operations. Experimental results on five benchmark datasets\ndemonstrate the state-of-the-art (SOTA) performance and high efficiency of\nTPGN, further confirming the effectiveness of PGN as the new successor to RNN\nin long-range time series forecasting. The code is available in this\nrepository: \\url{https://github.com/Water2sea/TPGN}.\n","authors":["Yuxin Jia","Youfang Lin","Jing Yu","Shuo Wang","Tianhao Liu","Huaiyu Wan"],"pdf_url":"https://arxiv.org/pdf/2409.17703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17699v1","updated":"2024-09-26T10:12:19Z","published":"2024-09-26T10:12:19Z","title":"MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard\n for Prompt Attacks","summary":" The proliferation of Large Language Models (LLMs) in diverse applications\nunderscores the pressing need for robust security measures to thwart potential\njailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger\ndata integrity and user privacy. Guardrails serve as crucial protective\nmechanisms against such threats, but existing models often fall short in terms\nof both detection accuracy, and computational efficiency. This paper advocates\nfor the significance of jailbreak attack prevention on LLMs, and emphasises the\nrole of input guardrails in safeguarding these models. We introduce MoJE\n(Mixture of Jailbreak Expert), a novel guardrail architecture designed to\nsurpass current limitations in existing state-of-the-art guardrails. By\nemploying simple linguistic statistical techniques, MoJE excels in detecting\njailbreak attacks while maintaining minimal computational overhead during model\ninference. Through rigorous experimentation, MoJE demonstrates superior\nperformance capable of detecting 90% of the attacks without compromising benign\nprompts, enhancing LLMs security against jailbreak attacks.\n","authors":["Giandomenico Cornacchia","Giulio Zizzo","Kieran Fraser","Muhammad Zaid Hamed","Ambrish Rawat","Mark Purcell"],"pdf_url":"https://arxiv.org/pdf/2409.17699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17163v2","updated":"2024-09-26T10:07:06Z","published":"2024-07-24T11:07:20Z","title":"dlordinal: a Python package for deep ordinal classification","summary":" dlordinal is a new Python library that unifies many recent deep ordinal\nclassification methodologies available in the literature. Developed using\nPyTorch as underlying framework, it implements the top performing\nstate-of-the-art deep learning techniques for ordinal classification problems.\nOrdinal approaches are designed to leverage the ordering information present in\nthe target variable. Specifically, it includes loss functions, various output\nlayers, dropout techniques, soft labelling methodologies, and other\nclassification strategies, all of which are appropriately designed to\nincorporate the ordinal information. Furthermore, as the performance metrics to\nassess novel proposals in ordinal classification depend on the distance between\ntarget and predicted classes in the ordinal scale, suitable ordinal evaluation\nmetrics are also included. dlordinal is distributed under the BSD-3-Clause\nlicense and is available at https://github.com/ayrna/dlordinal.\n","authors":["Francisco Bérchez-Moreno","Víctor M. Vargas","Rafael Ayllón-Gavilán","David Guijo-Rubio","César Hervás-Martínez","Juan C. Fernández","Pedro A. Gutiérrez"],"pdf_url":"https://arxiv.org/pdf/2407.17163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17692v1","updated":"2024-09-26T09:57:16Z","published":"2024-09-26T09:57:16Z","title":"MIO: A Foundation Model on Multimodal Tokens","summary":" In this paper, we introduce MIO, a novel foundation model built on multimodal\ntokens, capable of understanding and generating speech, text, images, and\nvideos in an end-to-end, autoregressive manner. While the emergence of large\nlanguage models (LLMs) and multimodal large language models (MM-LLMs) propels\nadvancements in artificial general intelligence through their versatile\ncapabilities, they still lack true any-to-any understanding and generation.\nRecently, the release of GPT-4o has showcased the remarkable potential of\nany-to-any LLMs for complex real-world tasks, enabling omnidirectional input\nand output across images, speech, and text. However, it is closed-source and\ndoes not support the generation of multimodal interleaved sequences. To address\nthis gap, we present MIO, which is trained on a mixture of discrete tokens\nacross four modalities using causal multimodal modeling. MIO undergoes a\nfour-stage training process: (1) alignment pre-training, (2) interleaved\npre-training, (3) speech-enhanced pre-training, and (4) comprehensive\nsupervised fine-tuning on diverse textual, visual, and speech tasks. Our\nexperimental results indicate that MIO exhibits competitive, and in some cases\nsuperior, performance compared to previous dual-modal baselines, any-to-any\nmodel baselines, and even modality-specific baselines. Moreover, MIO\ndemonstrates advanced capabilities inherent to its any-to-any feature, such as\ninterleaved video-text generation, chain-of-visual-thought reasoning, visual\nguideline generation, instructional image editing, etc.\n","authors":["Zekun Wang","King Zhu","Chunpu Xu","Wangchunshu Zhou","Jiaheng Liu","Yibo Zhang","Jiashuo Wang","Ning Shi","Siyu Li","Yizhi Li","Haoran Que","Zhaoxiang Zhang","Yuanxing Zhang","Ge Zhang","Ke Xu","Jie Fu","Wenhao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17692v1.pdf","comment":"Technical Report. Codes and models will be available soon"},{"id":"http://arxiv.org/abs/2409.17691v1","updated":"2024-09-26T09:56:13Z","published":"2024-09-26T09:56:13Z","title":"Efficient Bias Mitigation Without Privileged Information","summary":" Deep neural networks trained via empirical risk minimisation often exhibit\nsignificant performance disparities across groups, particularly when group and\ntask labels are spuriously correlated (e.g., \"grassy background\" and \"cows\").\nExisting bias mitigation methods that aim to address this issue often either\nrely on group labels for training or validation, or require an extensive\nhyperparameter search. Such data and computational requirements hinder the\npractical deployment of these methods, especially when datasets are too large\nto be group-annotated, computational resources are limited, and models are\ntrained through already complex pipelines. In this paper, we propose Targeted\nAugmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework\nthat leverages the entire training history of a helper model to identify\nspurious samples, and generate a group-balanced training set from which a\nrobust model can be trained. We show that TAB improves worst-group performance\nwithout any group information or model selection, outperforming existing\nmethods while maintaining overall accuracy.\n","authors":["Mateo Espinosa Zarlenga","Swami Sankaranarayanan","Jerone T. A. Andrews","Zohreh Shams","Mateja Jamnik","Alice Xiang"],"pdf_url":"https://arxiv.org/pdf/2409.17691v1.pdf","comment":"Accepted at the 18th European Conference on Computer Vision (ECCV\n 2024) as an Oral presentation"},{"id":"http://arxiv.org/abs/2312.05181v3","updated":"2024-09-26T09:52:13Z","published":"2023-12-08T17:08:03Z","title":"Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable\n Tensor Collections","summary":" Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining\ndata, model, and pipeline parallelism, to use large GPU clusters efficiently.\nLong-running jobs may experience changes to their GPU allocation: (i) resource\nelasticity during training adds or removes GPUs; (ii) hardware maintenance may\nrequire redeployment on different GPUs; and (iii) GPU failures force jobs to\nrun with fewer devices. Current DL frameworks tie jobs to a set of GPUs and\nthus lack support for these scenarios. In particular, they cannot change the\nmulti-dimensional parallelism of an already-running job in an efficient and\nmodel-independent way.\n We describe Scalai, a state management library for DL systems that enables\njobs to change their parallelism dynamically after the GPU allocation is\nupdated at runtime. Scalai achieves this through a new abstraction, a\nparallelizable tensor collection (PTC), that externalizes the job state during\ntraining. After a GPU change, Scalai uses the PTC to transform the job state:\nthe PTC repartitions the dataset state under data parallelism and exposes it to\nDL workers through a virtual file system; and the PTC obtains the model state\nas partitioned checkpoints and transforms them to reflect the new\nparallelization configuration. For efficiency, Scalai executes PTC\ntransformations in parallel with minimum data movement between workers. Our\nexperiments show that Scalai enables DL jobs to support dynamic parallelization\nwith low overhead.\n","authors":["Marcel Wagenländer","Guo Li","Bo Zhao","Luo Mai","Peter Pietzuch"],"pdf_url":"https://arxiv.org/pdf/2312.05181v3.pdf","comment":"The 30th Symposium on Operating Systems Principles (SOSP24)"},{"id":"http://arxiv.org/abs/2409.17687v1","updated":"2024-09-26T09:51:29Z","published":"2024-09-26T09:51:29Z","title":"Graph Edit Distance with General Costs Using Neural Set Divergence","summary":" Graph Edit Distance (GED) measures the (dis-)similarity between two given\ngraphs, in terms of the minimum-cost edit sequence that transforms one graph to\nthe other. However, the exact computation of GED is NP-Hard, which has recently\nmotivated the design of neural methods for GED estimation. However, they do not\nexplicitly account for edit operations with different costs. In response, we\npropose GRAPHEDX, a neural GED estimator that can work with general costs\nspecified for the four edit operations, viz., edge deletion, edge addition,\nnode deletion and node addition. We first present GED as a quadratic assignment\nproblem (QAP) that incorporates these four costs. Then, we represent each graph\nas a set of node and edge embeddings and use them to design a family of neural\nset divergence surrogates. We replace the QAP terms corresponding to each\noperation with their surrogates. Computing such neural set divergence require\naligning nodes and edges of the two graphs. We learn these alignments using a\nGumbel-Sinkhorn permutation generator, additionally ensuring that the node and\nedge alignments are consistent with each other. Moreover, these alignments are\ncognizant of both the presence and absence of edges between node-pairs.\nExperiments on several datasets, under a variety of edit cost settings, show\nthat GRAPHEDX consistently outperforms state-of-the-art methods and heuristics\nin terms of prediction error.\n","authors":["Eeshaan Jain","Indradyumna Roy","Saswat Meher","Soumen Chakrabarti","Abir De"],"pdf_url":"https://arxiv.org/pdf/2409.17687v1.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17685v1","updated":"2024-09-26T09:51:08Z","published":"2024-09-26T09:51:08Z","title":"Artificial Data Point Generation in Clustered Latent Space for Small\n Medical Datasets","summary":" One of the growing trends in machine learning is the use of data generation\ntechniques, since the performance of machine learning models is dependent on\nthe quantity of the training dataset. However, in many medical applications,\ncollecting large datasets is challenging due to resource constraints, which\nleads to overfitting and poor generalization. This paper introduces a novel\nmethod, Artificial Data Point Generation in Clustered Latent Space (AGCL),\ndesigned to enhance classification performance on small medical datasets\nthrough synthetic data generation. The AGCL framework involves feature\nextraction, K-means clustering, cluster evaluation based on a class separation\nmetric, and the generation of synthetic data points from clusters with distinct\nclass representations. This method was applied to Parkinson's disease\nscreening, utilizing facial expression data, and evaluated across multiple\nmachine learning classifiers. Experimental results demonstrate that AGCL\nsignificantly improves classification accuracy compared to baseline, GN and\nkNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and\ncross-validation accuracy of 90.90% in majority voting over different emotions,\nconfirming its effectiveness in augmenting small datasets.\n","authors":["Yasaman Haghbin","Hadi Moradi","Reshad Hosseini"],"pdf_url":"https://arxiv.org/pdf/2409.17685v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17684v1","updated":"2024-09-26T09:51:07Z","published":"2024-09-26T09:51:07Z","title":"Preserving logical and functional dependencies in synthetic tabular data","summary":" Dependencies among attributes are a common aspect of tabular data. However,\nwhether existing tabular data generation algorithms preserve these dependencies\nwhile generating synthetic data is yet to be explored. In addition to the\nexisting notion of functional dependencies, we introduce the notion of logical\ndependencies among the attributes in this article. Moreover, we provide a\nmeasure to quantify logical dependencies among attributes in tabular data.\nUtilizing this measure, we compare several state-of-the-art synthetic data\ngeneration algorithms and test their capability to preserve logical and\nfunctional dependencies on several publicly available datasets. We demonstrate\nthat currently available synthetic tabular data generation algorithms do not\nfully preserve functional dependencies when they generate synthetic datasets.\nIn addition, we also showed that some tabular synthetic data generation models\ncan preserve inter-attribute logical dependencies. Our review and comparison of\nthe state-of-the-art reveal research needs and opportunities to develop\ntask-specific synthetic tabular data generation models.\n","authors":["Chaithra Umesh","Kristian Schultz","Manjunath Mahendra","Saparshi Bej","Olaf Wolkenhauer"],"pdf_url":"https://arxiv.org/pdf/2409.17684v1.pdf","comment":"Submitted to Pattern Recognition Journal"},{"id":"http://arxiv.org/abs/2208.13197v2","updated":"2024-09-26T09:40:31Z","published":"2022-08-28T10:47:32Z","title":"IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided\n Feature Extraction","summary":" Disruption prediction has made rapid progress in recent years, especially in\nmachine learning (ML)-based methods. Understanding why a predictor makes a\ncertain prediction can be as crucial as the prediction's accuracy for future\ntokamak disruption predictors. The purpose of most disruption predictors is\naccuracy or cross-machine capability. However, if a disruption prediction model\ncan be interpreted, it can tell why certain samples are classified as\ndisruption precursors. This allows us to tell the types of incoming disruption\nand gives us insight into the mechanism of disruption. This paper designs a\ndisruption predictor called Interpretable Disruption Predictor based On\nPhysics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction\nperformance of the model is effectively improved by extracting physics-guided\nfeatures. A high-performance model is required to ensure the validity of the\ninterpretation results. The interpretability study of IDP-PGFE provides an\nunderstanding of J-TEXT disruption and is generally consistent with existing\ncomprehension of disruption. IDP-PGFE has been applied to the disruption due to\ncontinuously increasing density towards density limit experiments on J-TEXT.\nThe time evolution of the PGFE features contribution demonstrates that the\napplication of ECRH triggers radiation-caused disruption, which lowers the\ndensity at disruption. While the application of RMP indeed raises the density\nlimit in J-TEXT. The interpretability study guides intuition on the physical\nmechanisms of density limit disruption that RMPs affect not only the MHD\ninstabilities but also the radiation profile, which delays density limit\ndisruption.\n","authors":["Chengshuo Shen","Wei Zheng","Yonghua Ding","Xinkun Ai","Fengming Xue","Yu Zhong","Nengchao Wang","Li Gao","Zhipeng Chen","Zhoujun Yang","Zhongyong Chen","Yuan Pan","J-TEXT team"],"pdf_url":"https://arxiv.org/pdf/2208.13197v2.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2409.17677v1","updated":"2024-09-26T09:36:47Z","published":"2024-09-26T09:36:47Z","title":"Optimal Memorization Capacity of Transformers","summary":" Recent research in the field of machine learning has increasingly focused on\nthe memorization capacity of Transformers, but how efficient they are is not\nyet well understood. We demonstrate that Transformers can memorize labels with\n$\\tilde{O}(\\sqrt{N})$ parameters in a next-token prediction setting for $N$\ninput sequences of length $n$, which is proved to be optimal up to logarithmic\nfactors. This indicates that Transformers can efficiently perform memorization\nwith little influence from the input length $n$ owing to the benefit of\nparameter sharing. We also analyze the memorization capacity in the\nsequence-to-sequence setting, and find that $\\tilde{O}(\\sqrt{nN})$ parameters\nare not only sufficient, but also necessary at least for Transformers with\nhardmax. These results suggest that while self-attention mechanisms can\nefficiently identify input sequences, the feed-forward network becomes a\nbottleneck when associating a label to each token.\n","authors":["Tokio Kajitsuka","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13503v2","updated":"2024-09-26T09:26:05Z","published":"2024-09-20T13:44:00Z","title":"SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous\n Federated Learning Framework","summary":" Traditional federated learning (FL) frameworks rely heavily on terrestrial\nnetworks, where coverage limitations and increasing bandwidth congestion\nsignificantly hinder model convergence. Fortunately, the advancement of\nlow-Earth orbit (LEO) satellite networks offers promising new communication\navenues to augment traditional terrestrial FL. Despite this potential, the\nlimited satellite-ground communication bandwidth and the heterogeneous\noperating environments of ground devices-including variations in data,\nbandwidth, and computing power-pose substantial challenges for effective and\nrobust satellite-assisted FL. To address these challenges, we propose SatFed, a\nresource-efficient satellite-assisted heterogeneous FL framework. SatFed\nimplements freshness-based model prioritization queues to optimize the use of\nhighly constrained satellite-ground bandwidth, ensuring the transmission of the\nmost critical models. Additionally, a multigraph is constructed to capture\nreal-time heterogeneous relationships between devices, including data\ndistribution, terrestrial bandwidth, and computing capability. This multigraph\nenables SatFed to aggregate satellite-transmitted models into peer guidance,\nenhancing local training in heterogeneous environments. Extensive experiments\nwith real-world LEO satellite networks demonstrate that SatFed achieves\nsuperior performance and robustness compared to state-of-the-art benchmarks.\n","authors":["Yuxin Zhang","Zheng Lin","Zhe Chen","Zihan Fang","Wenjun Zhu","Xianhao Chen","Jin Zhao","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2409.13503v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.17663v1","updated":"2024-09-26T09:21:48Z","published":"2024-09-26T09:21:48Z","title":"Explanation Bottleneck Models","summary":" Recent concept-based interpretable models have succeeded in providing\nmeaningful explanations by pre-defined concept sets. However, the dependency on\nthe pre-defined concepts restricts the application because of the limited\nnumber of concepts for explanations. This paper proposes a novel interpretable\ndeep neural network called explanation bottleneck models (XBMs). XBMs generate\na text explanation from the input without pre-defined concepts and then predict\na final task prediction based on the generated explanation by leveraging\npre-trained vision-language encoder-decoder models. To achieve both the target\ntask performance and the explanation quality, we train XBMs through the target\ntask loss with the regularization penalizing the explanation decoder via the\ndistillation from the frozen pre-trained decoder. Our experiments, including a\ncomparison to state-of-the-art concept bottleneck models, confirm that XBMs\nprovide accurate and fluent natural language explanations without pre-defined\nconcept sets. Code will be available at https://github.com/yshinya6/xbm/.\n","authors":["Shin'ya Yamaguchi","Kosuke Nishida"],"pdf_url":"https://arxiv.org/pdf/2409.17663v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.14816v2","updated":"2024-09-26T09:11:28Z","published":"2024-09-23T08:46:15Z","title":"VARADE: a Variational-based AutoRegressive model for Anomaly Detection\n on the Edge","summary":" Detecting complex anomalies on massive amounts of data is a crucial task in\nIndustry 4.0, best addressed by deep learning. However, available solutions are\ncomputationally demanding, requiring cloud architectures prone to latency and\nbandwidth issues. This work presents VARADE, a novel solution implementing a\nlight autoregressive framework based on variational inference, which is best\nsuited for real-time execution on the edge. The proposed approach was validated\non a robotic arm, part of a pilot production line, and compared with several\nstate-of-the-art algorithms, obtaining the best trade-off between anomaly\ndetection accuracy, power consumption and inference frequency on two different\nedge platforms.\n","authors":["Alessio Mascolini","Sebastiano Gaiardelli","Francesco Ponzio","Nicola Dall'Ora","Enrico Macii","Sara Vinco","Santa Di Cataldo","Franco Fummi"],"pdf_url":"https://arxiv.org/pdf/2409.14816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15897v2","updated":"2024-09-26T09:04:58Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v2.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2409.15246v2","updated":"2024-09-26T08:48:03Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems play a crucial role in achieving Sustainable\nDevelopment Goals by collecting and analyzing vital global data through\nsatellite networks. These systems are essential for tasks like mapping,\ndisaster monitoring, and resource management, but they face challenges in\nprocessing and transmitting large volumes of EO data, especially in specialized\nfields such as agriculture and real-time disaster response. Domain-adapted\nLarge Language Models (LLMs) provide a promising solution by facilitating data\nfusion between extensive EO data and semantic EO data. By improving integration\nand interpretation of diverse datasets, LLMs address the challenges of\nprocessing specialized information in agriculture and disaster response\napplications. This fusion enhances the accuracy and relevance of transmitted\ndata. This paper presents a framework for semantic communication in EO\nsatellite networks, aimed at improving data transmission efficiency and overall\nsystem performance through cognitive processing techniques. The proposed system\nemploys Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic\nData Augmentation (SA) to focus on relevant information while minimizing\ncommunication overhead. By integrating cognitive semantic processing and\ninter-satellite links, the framework enhances the analysis and transmission of\nmultispectral satellite imagery, improving object detection, pattern\nrecognition, and real-time decision-making. The introduction of Cognitive\nSemantic Augmentation (CSA) allows satellites to process and transmit semantic\ninformation, boosting adaptability to changing environments and application\nneeds. This end-to-end architecture is tailored for next-generation satellite\nnetworks, such as those supporting 6G, and demonstrates significant\nimprovements in efficiency and accuracy.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v2.pdf","comment":"18 pages, 10 figures, magazine"},{"id":"http://arxiv.org/abs/2409.17643v1","updated":"2024-09-26T08:46:48Z","published":"2024-09-26T08:46:48Z","title":"Efficient Fairness-Performance Pareto Front Computation","summary":" There is a well known intrinsic trade-off between the fairness of a\nrepresentation and the performance of classifiers derived from the\nrepresentation. Due to the complexity of optimisation algorithms in most modern\nrepresentation learning approaches, for a given method it may be non-trivial to\ndecide whether the obtained fairness-performance curve of the method is\noptimal, i.e., whether it is close to the true Pareto front for these\nquantities for the underlying data distribution.\n In this paper we propose a new method to compute the optimal Pareto front,\nwhich does not require the training of complex representation models. We show\nthat optimal fair representations possess several useful structural properties,\nand that these properties enable a reduction of the computation of the Pareto\nFront to a compact discrete problem. We then also show that these compact\napproximating problems can be efficiently solved via off-the shelf\nconcave-convex programming methods.\n Since our approach is independent of the specific model of representations,\nit may be used as the benchmark to which representation learning algorithms may\nbe compared. We experimentally evaluate the approach on a number of real world\nbenchmark datasets.\n","authors":["Mark Kozdoba","Binyamin Perets","Shie Mannor"],"pdf_url":"https://arxiv.org/pdf/2409.17643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02733v3","updated":"2024-09-26T08:45:22Z","published":"2023-06-05T09:29:46Z","title":"Realising Synthetic Active Inference Agents, Part II: Variational\n Message Updates","summary":" The Free Energy Principle (FEP) describes (biological) agents as minimising a\nvariational Free Energy (FE) with respect to a generative model of their\nenvironment. Active Inference (AIF) is a corollary of the FEP that describes\nhow agents explore and exploit their environment by minimising an expected FE\nobjective. In two related papers, we describe a scalable, epistemic approach to\nsynthetic AIF, by message passing on free-form Forney-style Factor Graphs\n(FFGs). A companion paper (part I) introduces a Constrained FFG (CFFG) notation\nthat visually represents (generalised) FE objectives for AIF. The current paper\n(part II) derives message passing algorithms that minimise (generalised) FE\nobjectives on a CFFG by variational calculus. A comparison between simulated\nBethe and generalised FE agents illustrates how the message passing approach to\nsynthetic AIF induces epistemic behaviour on a T-maze navigation task.\nExtension of the T-maze simulation to 1) learning goal statistics, and 2) a\nmulti-agent bargaining setting, illustrate how this approach encourages reuse\nof nodes and updates in alternative settings. With a full message passing\naccount of synthetic AIF agents, it becomes possible to derive and reuse\nmessage updates across models and move closer to industrial applications of\nsynthetic AIF.\n","authors":["Thijs van de Laar","Magnus Koudahl","Bert de Vries"],"pdf_url":"https://arxiv.org/pdf/2306.02733v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17635v1","updated":"2024-09-26T08:32:31Z","published":"2024-09-26T08:32:31Z","title":"FlowMAC: Conditional Flow Matching for Audio Coding at Low Bit Rates","summary":" This paper introduces FlowMAC, a novel neural audio codec for high-quality\ngeneral audio compression at low bit rates based on conditional flow matching\n(CFM). FlowMAC jointly learns a mel spectrogram encoder, quantizer and decoder.\nAt inference time the decoder integrates a continuous normalizing flow via an\nODE solver to generate a high-quality mel spectrogram. This is the first time\nthat a CFM-based approach is applied to general audio coding, enabling a\nscalable, simple and memory efficient training. Our subjective evaluations show\nthat FlowMAC at 3 kbps achieves similar quality as state-of-the-art GAN-based\nand DDPM-based neural audio codecs at double the bit rate. Moreover, FlowMAC\noffers a tunable inference pipeline, which permits to trade off complexity and\nquality. This enables real-time coding on CPU, while maintaining high\nperceptual quality.\n","authors":["Nicola Pia","Martin Strauss","Markus Multrus","Bernd Edler"],"pdf_url":"https://arxiv.org/pdf/2409.17635v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.17632v1","updated":"2024-09-26T08:28:14Z","published":"2024-09-26T08:28:14Z","title":"Model-Free Stochastic Process Modeling and Optimization using\n Normalizing Flows","summary":" Real-world chemical processes often exhibit stochastic dynamics with\nnon-trivial correlations and state-dependent fluctuations. However, most\nprocess models simply add stationary noise terms to a deterministic prediction,\nwhich can lead to inaccurate predictions. This work proposes using conditional\nnormalizing flows as discrete-time models (DTMs) to learn the stochastic\ndynamics of chemical processes. Normalizing flows learn an explicit expression\nof the system states' probability density function (PDF) given prior states and\ncontrol inputs. The resulting model naturally allows for formulating stochastic\nand probabilistic setpoint-tracking objectives and chance constraints. In\napplications to a continuous reactor and a reactor cascade, the normalizing\nflow yields stable simulations over long time horizons and high-quality results\nin stochastic and probabilistic MPC formulation for open-loop control.\nFurthermore, a chance-constrained optimization finds reliable startup controls\nfor the reactor cascade with stochastic reactions. In conclusion, the\nconditional normalizing flow presents an excellent choice for modeling\nnonlinear stochastic dynamics.\n","authors":["Eike Cramer"],"pdf_url":"https://arxiv.org/pdf/2409.17632v1.pdf","comment":"13 pages, 7 Figures, 5 Tables"},{"id":"http://arxiv.org/abs/2409.17628v1","updated":"2024-09-26T08:22:09Z","published":"2024-09-26T08:22:09Z","title":"Convolutional Signal Propagation: A Simple Scalable Algorithm for\n Hypergraphs","summary":" Last decade has seen the emergence of numerous methods for learning on\ngraphs, particularly Graph Neural Networks (GNNs). These methods, however, are\noften not directly applicable to more complex structures like bipartite graphs\n(equivalent to hypergraphs), which represent interactions among two entity\ntypes (e.g. a user liking a movie). This paper proposes Convolutional Signal\nPropagation (CSP), a non-parametric simple and scalable method that natively\noperates on bipartite graphs (hypergraphs) and can be implemented with just a\nfew lines of code. After defining CSP, we demonstrate its relationship with\nwell-established methods like label propagation, Naive Bayes, and Hypergraph\nConvolutional Networks. We evaluate CSP against several reference methods on\nreal-world datasets from multiple domains, focusing on retrieval and\nclassification tasks. Our results show that CSP offers competitive performance\nwhile maintaining low computational complexity, making it an ideal first choice\nas a baseline for hypergraph node classification and retrieval. Moreover,\ndespite operating on hypergraphs, CSP achieves good results in tasks typically\nnot associated with hypergraphs, such as natural language processing.\n","authors":["Pavel Procházka","Marek Dědič","Lukáš Bajer"],"pdf_url":"https://arxiv.org/pdf/2409.17628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11531v2","updated":"2024-09-26T08:20:59Z","published":"2023-09-20T10:50:28Z","title":"EPTQ: Enhanced Post-Training Quantization via Hessian-guided\n Network-wise Optimization","summary":" Quantization is a key method for deploying deep neural networks on edge\ndevices with limited memory and computation resources. Recent improvements in\nPost-Training Quantization (PTQ) methods were achieved by an additional local\noptimization process for learning the weight quantization rounding policy.\nHowever, a gap exists when employing network-wise optimization with small\nrepresentative datasets. In this paper, we propose a new method for enhanced\nPTQ (EPTQ) that employs a network-wise quantization optimization process, which\nbenefits from considering cross-layer dependencies during optimization. EPTQ\nenables network-wise optimization with a small representative dataset using a\nnovel sample-layer attention score based on a label-free Hessian matrix upper\nbound. The label-free approach makes our method suitable for the PTQ scheme. We\ngive a theoretical analysis for the said bound and use it to construct a\nknowledge distillation loss that guides the optimization to focus on the more\nsensitive layers and samples. In addition, we leverage the Hessian upper bound\nto improve the weight quantization parameters selection by focusing on the more\nsensitive elements in the weight tensors. Empirically, by employing EPTQ we\nachieve state-of-the-art results on various models, tasks, and datasets,\nincluding ImageNet classification, COCO object detection, and Pascal-VOC for\nsemantic segmentation.\n","authors":["Ofir Gordon","Elad Cohen","Hai Victor Habi","Arnon Netzer"],"pdf_url":"https://arxiv.org/pdf/2309.11531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17625v1","updated":"2024-09-26T08:20:05Z","published":"2024-09-26T08:20:05Z","title":"Benign or Not-Benign Overfitting in Token Selection of Attention\n Mechanism","summary":" Modern over-parameterized neural networks can be trained to fit the training\ndata perfectly while still maintaining a high generalization performance. This\n\"benign overfitting\" phenomenon has been studied in a surge of recent\ntheoretical work; however, most of these studies have been limited to linear\nmodels or two-layer neural networks. In this work, we analyze benign\noverfitting in the token selection mechanism of the attention architecture,\nwhich characterizes the success of transformer models. We first show the\nexistence of a benign overfitting solution and explain its mechanism in the\nattention architecture. Next, we discuss whether the model converges to such a\nsolution, raising the difficulties specific to the attention architecture. We\nthen present benign overfitting cases and not-benign overfitting cases by\nconditioning different scenarios based on the behavior of attention\nprobabilities during training. To the best of our knowledge, this is the first\nstudy to characterize benign overfitting for the attention mechanism.\n","authors":["Keitaro Sakamoto","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17622v1","updated":"2024-09-26T08:16:59Z","published":"2024-09-26T08:16:59Z","title":"Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric\n GNNs","summary":" Geometric graph neural networks (GNNs) have emerged as powerful tools for\nmodeling molecular geometry. However, they encounter limitations in effectively\ncapturing long-range interactions in large molecular systems. To address this\nchallenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs\nto expand the scope of their capabilities by incorporating mesh points\nalongside atoms and reimaging traditional mathematical operations in a\ntrainable manner. Neural P$^3$M exhibits flexibility across a wide range of\nmolecular systems and demonstrates remarkable accuracy in predicting energies\nand forces, outperforming on benchmarks such as the MD22 dataset. It also\nachieves an average improvement of 22% on the OE62 dataset while integrating\nwith various architectures.\n","authors":["Yusong Wang","Chaoran Cheng","Shaoning Li","Yuxuan Ren","Bin Shao","Ge Liu","Pheng-Ann Heng","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.17622v1.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.16959v2","updated":"2024-09-26T08:12:59Z","published":"2024-06-21T03:21:22Z","title":"Recurrent Stochastic Configuration Networks for Temporal Data Analytics","summary":" Temporal data modelling techniques with neural networks are useful in many\ndomain applications, including time-series forecasting and control engineering.\nThis paper aims at developing a recurrent version of stochastic configuration\nnetworks (RSCNs) for problem solving, where we have no underlying assumption on\nthe dynamic orders of the input variables. Given a collection of historical\ndata, we first build an initial RSCN model in the light of a supervisory\nmechanism, followed by an online update of the output weights by using a\nprojection algorithm. Some theoretical results are established, including the\necho state property, the universal approximation property of RSCNs for both the\noffline and online learnings, and the convergence of the output weights. The\nproposed RSCN model is remarkably distinguished from the well-known echo state\nnetworks (ESNs) in terms of the way of assigning the input random weight matrix\nand a special structure of the random feedback matrix. A comprehensive\ncomparison study among the long short-term memory (LSTM) network, the original\nESN, and several state-of-the-art ESN methods such as the simple cycle\nreservoir (SCR), the polynomial ESN (PESN), the leaky-integrator ESN (LIESN)\nand RSCN is carried out. Numerical results clearly indicate that the proposed\nRSCN performs favourably over all of the datasets.\n","authors":["Dianhui Wang","Gang Dang"],"pdf_url":"https://arxiv.org/pdf/2406.16959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17612v1","updated":"2024-09-26T08:03:19Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic dataset that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15254v3","updated":"2024-09-26T08:01:39Z","published":"2024-09-23T17:53:42Z","title":"Archon: An Architecture Search Framework for Inference-Time Techniques","summary":" Inference-time techniques are emerging as highly effective tools to increase\nlarge language model (LLM) capabilities. However, there is still limited\nunderstanding of the best practices for developing systems that combine\ninference-time techniques with one or more LLMs, with challenges including: (1)\neffectively allocating inference compute budget, (2) understanding the\ninteractions between different combinations of inference-time techniques and\ntheir impact on downstream performance, and 3) efficiently searching over the\nlarge space of model choices, inference-time techniques, and their\ncompositions. To address these challenges, we introduce Archon, an automated\nframework for designing inference-time architectures. Archon defines an\nextensible design space, encompassing methods such as generation ensembling,\nmulti-sampling, ranking, fusion, critiquing, verification, and unit testing. It\nthen transforms the problem of selecting and combining LLMs and inference-time\ntechniques into a hyperparameter optimization objective. To optimize this\nobjective, we introduce automated Inference-Time Architecture Search (ITAS)\nalgorithms. Given target benchmark(s), an inference compute budget, and\navailable LLMs, ITAS outputs optimized architectures. We evaluate Archon\narchitectures across a wide range of instruction-following and reasoning\nbenchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval,\nMixEval Hard, MATH, and CodeContests. We show that automatically designed\ninference-time architectures by Archon outperform strong models such as GPT-4o\nand Claude 3.5 Sonnet on these benchmarks, achieving an average increase of\n15.1 and 11.2 percentage points with all-source models and open-source models,\nrespectively. We make our code and datasets available publicly on Github:\nhttps://github.com/ScalingIntelligence/Archon.\n","authors":["Jon Saad-Falcon","Adrian Gamarra Lafuente","Shlok Natarajan","Nahum Maru","Hristo Todorov","Etash Guha","E. Kelly Buchanan","Mayee Chen","Neel Guha","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2409.15254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16341v2","updated":"2024-09-26T07:54:10Z","published":"2024-09-24T17:20:02Z","title":"Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs","summary":" Training large language models (LLMs) for external tool usage is a rapidly\nexpanding field, with recent research focusing on generating synthetic data to\naddress the shortage of available data. However, the absence of systematic data\nquality checks poses complications for properly training and testing models. To\nthat end, we propose two approaches for assessing the reliability of data for\ntraining LLMs to use external tools. The first approach uses intuitive,\nhuman-defined correctness criteria. The second approach uses a model-driven\nassessment with in-context evaluation. We conduct a thorough evaluation of data\nquality on two popular benchmarks, followed by an extrinsic evaluation that\nshowcases the impact of data quality on model performance. Our results\ndemonstrate that models trained on high-quality data outperform those trained\non unvalidated data, even when trained with a smaller quantity of data. These\nfindings empirically support the significance of assessing and ensuring the\nreliability of training data for tool-using LLMs.\n","authors":["Shadi Iskander","Nachshon Cohen","Zohar Karnin","Ori Shapira","Sofia Tolmach"],"pdf_url":"https://arxiv.org/pdf/2409.16341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04428v2","updated":"2024-09-26T07:53:04Z","published":"2024-09-06T17:48:44Z","title":"Hybrid Spiking Neural Networks for Low-Power Intra-Cortical\n Brain-Machine Interfaces","summary":" Intra-cortical brain-machine interfaces (iBMIs) have the potential to\ndramatically improve the lives of people with paraplegia by restoring their\nability to perform daily activities. However, current iBMIs suffer from\nscalability and mobility limitations due to bulky hardware and wiring. Wireless\niBMIs offer a solution but are constrained by a limited data rate. To overcome\nthis challenge, we are investigating hybrid spiking neural networks for\nembedded neural decoding in wireless iBMIs. The networks consist of a temporal\nconvolution-based compression followed by recurrent processing and a final\ninterpolation back to the original sequence length. As recurrent units, we\nexplore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons,\nand a combination of both - spiking GRUs (sGRUs) and analyze their differences\nin terms of accuracy, footprint, and activation sparsity. To that end, we train\ndecoders on the \"Nonhuman Primate Reaching with Multichannel Sensorimotor\nCortex Electrophysiology\" dataset and evaluate it using the NeuroBench\nframework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural\nDecoding. Our approach achieves high accuracy in predicting velocities of\nprimate reaching movements from multichannel primary motor cortex recordings\nwhile maintaining a low number of synaptic operations, surpassing the current\nbaseline models in the NeuroBench framework. This work highlights the potential\nof hybrid neural networks to facilitate wireless iBMIs with high decoding\nprecision and a substantial increase in the number of monitored neurons, paving\nthe way toward more advanced neuroprosthetic technologies.\n","authors":["Alexandru Vasilache","Jann Krausse","Klaus Knobloch","Juergen Becker"],"pdf_url":"https://arxiv.org/pdf/2409.04428v2.pdf","comment":"This work has been accepted at the 2024 IEEE Biomedical Circuits and\n Systems Conference"},{"id":"http://arxiv.org/abs/2408.03944v2","updated":"2024-09-26T07:47:50Z","published":"2024-07-22T03:56:27Z","title":"Improving Fast Adversarial Training Paradigm: An Example Taxonomy\n Perspective","summary":" While adversarial training is an effective defense method against adversarial\nattacks, it notably increases the training cost. To this end, fast adversarial\ntraining (FAT) is presented for efficient training and has become a hot\nresearch topic. However, FAT suffers from catastrophic overfitting, which leads\nto a performance drop compared with multi-step adversarial training. However,\nthe cause of catastrophic overfitting remains unclear and lacks exploration. In\nthis paper, we present an example taxonomy in FAT, which identifies that\ncatastrophic overfitting is caused by the imbalance between the inner and outer\noptimization in FAT. Furthermore, we investigated the impact of varying degrees\nof training loss, revealing a correlation between training loss and\ncatastrophic overfitting. Based on these observations, we redesign the loss\nfunction in FAT with the proposed dynamic label relaxation to concentrate the\nloss range and reduce the impact of misclassified examples. Meanwhile, we\nintroduce batch momentum initialization to enhance the diversity to prevent\ncatastrophic overfitting in an efficient manner. Furthermore, we also propose\nCatastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate\ntraining strategy for examples based on their loss degree. Our proposed method,\nnamed example taxonomy aware FAT (ETA), establishes an improved paradigm for\nFAT. Experiment results demonstrate our ETA achieves state-of-the-art\nperformance. Comprehensive experiments on four standard datasets demonstrate\nthe competitiveness of our proposed method.\n","authors":["Jie Gui","Chengze Jiang","Minjing Dong","Kun Tong","Xinli Shi","Yuan Yan Tang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.03944v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.17605v1","updated":"2024-09-26T07:43:12Z","published":"2024-09-26T07:43:12Z","title":"Good Data Is All Imitation Learning Needs","summary":" In this paper, we address the limitations of traditional teacher-student\nmodels, imitation learning, and behaviour cloning in the context of\nAutonomous/Automated Driving Systems (ADS), where these methods often struggle\nwith incomplete coverage of real-world scenarios. To enhance the robustness of\nsuch models, we introduce the use of Counterfactual Explanations (CFEs) as a\nnovel data augmentation technique for end-to-end ADS. CFEs, by generating\ntraining samples near decision boundaries through minimal input modifications,\nlead to a more comprehensive representation of expert driver strategies,\nparticularly in safety-critical scenarios. This approach can therefore help\nimprove the model's ability to handle rare and challenging driving events, such\nas anticipating darting out pedestrians, ultimately leading to safer and more\ntrustworthy decision-making for ADS. Our experiments in the CARLA simulator\ndemonstrate that CF-Driver outperforms the current state-of-the-art method,\nachieving a higher driving score and lower infraction rates. Specifically,\nCF-Driver attains a driving score of 84.2, surpassing the previous best model\nby 15.02 percentage points. These results highlight the effectiveness of\nincorporating CFEs in training end-to-end ADS. To foster further research, the\nCF-Driver code is made publicly available.\n","authors":["Amir Samadi","Konstantinos Koufos","Kurt Debattista","Mehrdad Dianati"],"pdf_url":"https://arxiv.org/pdf/2409.17605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17604v1","updated":"2024-09-26T07:40:47Z","published":"2024-09-26T07:40:47Z","title":"RmGPT: Rotating Machinery Generative Pretrained Model","summary":" In industry, the reliability of rotating machinery is critical for production\nefficiency and safety. Current methods of Prognostics and Health Management\n(PHM) often rely on task-specific models, which face significant challenges in\nhandling diverse datasets with varying signal characteristics, fault modes and\noperating conditions. Inspired by advancements in generative pretrained models,\nwe propose RmGPT, a unified model for diagnosis and prognosis tasks. RmGPT\nintroduces a novel token-based framework, incorporating Signal Tokens, Prompt\nTokens, Time-Frequency Task Tokens and Fault Tokens to handle heterogeneous\ndata within a unified model architecture. We leverage self-supervised learning\nfor robust feature extraction and introduce a next signal token prediction\npretraining strategy, alongside efficient prompt learning for task-specific\nadaptation. Extensive experiments demonstrate that RmGPT significantly\noutperforms state-of-the-art algorithms, achieving near-perfect accuracy in\ndiagnosis tasks and exceptionally low errors in prognosis tasks. Notably, RmGPT\nexcels in few-shot learning scenarios, achieving 92% accuracy in 16-class\none-shot experiments, highlighting its adaptability and robustness. This work\nestablishes RmGPT as a powerful PHM foundation model for rotating machinery,\nadvancing the scalability and generalizability of PHM solutions.\n","authors":["Yilin Wang","Yifei Yu","Kong Sun","Peixuan Lei","Yuxuan Zhang","Enrico Zio","Aiguo Xia","Yuanxiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.17604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16206v2","updated":"2024-09-26T07:32:09Z","published":"2024-05-25T12:35:31Z","title":"GlycanML: A Multi-Task and Multi-Structure Benchmark for Glycan Machine\n Learning","summary":" Glycans are basic biomolecules and perform essential functions within living\norganisms. The rapid increase of functional glycan data provides a good\nopportunity for machine learning solutions to glycan understanding. However,\nthere still lacks a standard machine learning benchmark for glycan function\nprediction. In this work, we fill this blank by building a comprehensive\nbenchmark for Glycan Machine Learning (GlycanML). The GlycanML benchmark\nconsists of diverse types of tasks including glycan taxonomy prediction, glycan\nimmunogenicity prediction, glycosylation type prediction, and protein-glycan\ninteraction prediction. Glycans can be represented by both sequences and graphs\nin GlycanML, which enables us to extensively evaluate sequence-based models and\ngraph neural networks (GNNs) on benchmark tasks. Furthermore, by concurrently\nperforming eight glycan taxonomy prediction tasks, we introduce the\nGlycanML-MTL testbed for multi-task learning (MTL) algorithms. Experimental\nresults show the superiority of modeling glycans with multi-relational GNNs,\nand suitable MTL methods can further boost model performance. We provide all\ndatasets and source codes at https://github.com/GlycanML/GlycanML and maintain\na leaderboard at https://GlycanML.github.io/project\n","authors":["Minghao Xu","Yunteng Geng","Yihang Zhang","Ling Yang","Jian Tang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.16206v2.pdf","comment":"Research project paper. All code and data are released"},{"id":"http://arxiv.org/abs/2409.17592v1","updated":"2024-09-26T07:19:12Z","published":"2024-09-26T07:19:12Z","title":"Deep Manifold Part 1: Anatomy of Neural Network Manifold","summary":" Based on the numerical manifold method principle, we developed a mathematical\nframework of a neural network manifold: Deep Manifold and discovered that\nneural networks: 1) is numerical computation combining forward and inverse; 2)\nhave near infinite degrees of freedom; 3) exponential learning capacity with\ndepth; 4) have self-progressing boundary conditions; 5) has training hidden\nbottleneck. We also define two concepts: neural network learning space and deep\nmanifold space and introduce two concepts: neural network intrinsic pathway and\nfixed point. We raise three fundamental questions: 1). What is the training\ncompletion definition; 2). where is the deep learning convergence point (neural\nnetwork fixed point); 3). How important is token timestamp in training data\ngiven negative time is critical in inverse problem.\n","authors":["Max Y. Ma","Gen-Hua Shi"],"pdf_url":"https://arxiv.org/pdf/2409.17592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17591v1","updated":"2024-09-26T07:16:38Z","published":"2024-09-26T07:16:38Z","title":"Conjugate Bayesian Two-step Change Point Detection for Hawkes Process","summary":" The Bayesian two-step change point detection method is popular for the Hawkes\nprocess due to its simplicity and intuitiveness. However, the non-conjugacy\nbetween the point process likelihood and the prior requires most existing\nBayesian two-step change point detection methods to rely on non-conjugate\ninference methods. These methods lack analytical expressions, leading to low\ncomputational efficiency and impeding timely change point detection. To address\nthis issue, this work employs data augmentation to propose a conjugate Bayesian\ntwo-step change point detection method for the Hawkes process, which proves to\nbe more accurate and efficient. Extensive experiments on both synthetic and\nreal data demonstrate the superior effectiveness and efficiency of our method\ncompared to baseline methods. Additionally, we conduct ablation studies to\nexplore the robustness of our method concerning various hyperparameters. Our\ncode is publicly available at https://github.com/Aurora2050/CoBay-CPD.\n","authors":["Zeyue Zhang","Xiaoling Lu","Feng Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17591v1.pdf","comment":"10 pages, accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17587v1","updated":"2024-09-26T07:07:08Z","published":"2024-09-26T07:07:08Z","title":"Multimodal Banking Dataset: Understanding Client Needs through Event\n Sequences","summary":" Financial organizations collect a huge amount of data about clients that\ntypically has a temporal (sequential) structure and is collected from various\nsources (modalities). Due to privacy issues, there are no large-scale\nopen-source multimodal datasets of event sequences, which significantly limits\nthe research in this area. In this paper, we present the industrial-scale\npublicly available multimodal banking dataset, MBD, that contains more than\n1.5M corporate clients with several modalities: 950M bank transactions, 1B geo\nposition events, 5M embeddings of dialogues with technical support and monthly\naggregated purchases of four bank's products. All entries are properly\nanonymized from real proprietary bank data. Using this dataset, we introduce a\nnovel benchmark with two business tasks: campaigning (purchase prediction in\nthe next month) and matching of clients. We provide numerical results that\ndemonstrate the superiority of our multi-modal baselines over single-modal\ntechniques for each task. As a result, the proposed dataset can open new\nperspectives and facilitate the future development of practically important\nlarge-scale multimodal algorithms for event sequences.\n HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD\n Github Link: https://github.com/Dzhambo/MBD\n","authors":["Mollaev Dzhambulat","Alexander Kostin","Postnova Maria","Ivan Karpukhin","Ivan A Kireev","Gleb Gusev","Andrey Savchenko"],"pdf_url":"https://arxiv.org/pdf/2409.17587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06379v3","updated":"2024-09-26T07:05:47Z","published":"2023-10-10T07:43:41Z","title":"Understanding the Expressivity and Trainability of Fourier Neural\n Operator: A Mean-Field Perspective","summary":" In this paper, we explores the expressivity and trainability of the Fourier\nNeural Operator (FNO). We establish a mean-field theory for the FNO, analyzing\nthe behavior of the random FNO from an edge of chaos perspective. Our\ninvestigation into the expressivity of a random FNO involves examining the\nordered-chaos phase transition of the network based on the weight distribution.\nThis phase transition demonstrates characteristics unique to the FNO, induced\nby mode truncation, while also showcasing similarities to those of densely\nconnected networks. Furthermore, we identify a connection between expressivity\nand trainability: the ordered and chaotic phases correspond to regions of\nvanishing and exploding gradients, respectively. This finding provides a\npractical prerequisite for the stable training of the FNO. Our experimental\nresults corroborate our theoretical findings.\n","authors":["Takeshi Koshizuka","Masahiro Fujisawa","Yusuke Tanaka","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2310.06379v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17583v1","updated":"2024-09-26T07:01:29Z","published":"2024-09-26T07:01:29Z","title":"Let the Quantum Creep In: Designing Quantum Neural Network Models by\n Gradually Swapping Out Classical Components","summary":" Artificial Intelligence (AI), with its multiplier effect and wide\napplications in multiple areas, could potentially be an important application\nof quantum computing. Since modern AI systems are often built on neural\nnetworks, the design of quantum neural networks becomes a key challenge in\nintegrating quantum computing into AI. To provide a more fine-grained\ncharacterisation of the impact of quantum components on the performance of\nneural networks, we propose a framework where classical neural network layers\nare gradually replaced by quantum layers that have the same type of input and\noutput while keeping the flow of information between layers unchanged,\ndifferent from most current research in quantum neural network, which favours\nan end-to-end quantum model. We start with a simple three-layer classical\nneural network without any normalisation layers or activation functions, and\ngradually change the classical layers to the corresponding quantum versions. We\nconduct numerical experiments on image classification datasets such as the\nMNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of\nperformance brought by the systematic introduction of quantum components.\nThrough this framework, our research sheds new light on the design of future\nquantum neural network models where it could be more favourable to search for\nmethods and frameworks that harness the advantages from both the classical and\nquantum worlds.\n","authors":["Peiyong Wang","Casey. R. Myers","Lloyd C. L. Hollenberg","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2409.17583v1.pdf","comment":"50 pages (including Appendix), many figures, accepted as a poster on\n QTML2024. Code available at\n https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In"},{"id":"http://arxiv.org/abs/2409.17582v1","updated":"2024-09-26T07:01:06Z","published":"2024-09-26T07:01:06Z","title":"Multiplicative Logit Adjustment Approximates Neural-Collapse-Aware\n Decision Boundary Adjustment","summary":" Real-world data distributions are often highly skewed. This has spurred a\ngrowing body of research on long-tailed recognition to address this imbalance\nin training classification models. Among the methods studied, multiplicative\nlogit adjustment (MLA) stands out as a simple and effective method. However, it\nlacks theoretical guarantees, which raises concerns about the optimality of its\nadjustment method. We provide a theoretical justification for the effectiveness\nof MLA with the following two-step theory. First, we develop a theory that\nadjusts optimal decision boundaries by estimating feature spread on the basis\nof neural collapse. Then, we demonstrate that MLA approximates this optimal\nmethod. Additionally, through experiments on long-tailed datasets, we\nillustrate the practical usefulness of MLA under more realistic conditions. We\nalso offer experimental insights to guide the tuning of MLA's hyperparameters.\n","authors":["Naoya Hasegawa","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2409.17582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17567v1","updated":"2024-09-26T06:28:56Z","published":"2024-09-26T06:28:56Z","title":"Derandomizing Multi-Distribution Learning","summary":" Multi-distribution or collaborative learning involves learning a single\npredictor that works well across multiple data distributions, using samples\nfrom each during training. Recent research on multi-distribution learning,\nfocusing on binary loss and finite VC dimension classes, has shown near-optimal\nsample complexity that is achieved with oracle efficient algorithms. That is,\nthese algorithms are computationally efficient given an efficient ERM for the\nclass. Unlike in classical PAC learning, where the optimal sample complexity is\nachieved with deterministic predictors, current multi-distribution learning\nalgorithms output randomized predictors. This raises the question: can these\nalgorithms be derandomized to produce a deterministic predictor for multiple\ndistributions? Through a reduction to discrepancy minimization, we show that\nderandomizing multi-distribution learning is computationally hard, even when\nERM is computationally efficient. On the positive side, we identify a\nstructural condition enabling an efficient black-box reduction, converting\nexisting randomized multi-distribution predictors into deterministic ones.\n","authors":["Kasper Green Larsen","Omar Montasser","Nikita Zhivotovskiy"],"pdf_url":"https://arxiv.org/pdf/2409.17567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17565v1","updated":"2024-09-26T06:27:26Z","published":"2024-09-26T06:27:26Z","title":"Pixel-Space Post-Training of Latent Diffusion Models","summary":" Latent diffusion models (LDMs) have made significant advancements in the\nfield of image generation in recent years. One major advantage of LDMs is their\nability to operate in a compressed latent space, allowing for more efficient\ntraining and deployment. However, despite these advantages, challenges with\nLDMs still remain. For example, it has been observed that LDMs often generate\nhigh-frequency details and complex compositions imperfectly. We hypothesize\nthat one reason for these flaws is due to the fact that all pre- and\npost-training of LDMs are done in latent space, which is typically $8 \\times 8$\nlower spatial-resolution than the output images. To address this issue, we\npropose adding pixel-space supervision in the post-training process to better\npreserve high-frequency details. Experimentally, we show that adding a\npixel-space objective significantly improves both supervised quality\nfine-tuning and preference-based post-training by a large margin on a\nstate-of-the-art DiT transformer and U-Net diffusion models in both visual\nquality and visual flaw metrics, while maintaining the same text alignment\nquality.\n","authors":["Christina Zhang","Simran Motwani","Matthew Yu","Ji Hou","Felix Juefei-Xu","Sam Tsai","Peter Vajda","Zijian He","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16997v2","updated":"2024-09-26T06:13:04Z","published":"2024-09-25T15:02:25Z","title":"INT-FlashAttention: Enabling Flash Attention for INT8 Quantization","summary":" As the foundation of large language models (LLMs), self-attention module\nfaces the challenge of quadratic time and memory complexity with respect to\nsequence length. FlashAttention accelerates attention computation and reduces\nits memory usage by leveraging the GPU memory hierarchy. A promising research\ndirection is to integrate FlashAttention with quantization methods. This paper\nintroduces INT-FlashAttention, the first INT8 quantization architecture\ncompatible with the forward workflow of FlashAttention, which significantly\nimproves the inference speed of FlashAttention on Ampere GPUs. We implement our\nINT-FlashAttention prototype with fully INT8 activations and general\nmatrix-multiplication (GEMM) kernels, making it the first attention operator\nwith fully INT8 input. As a general token-level post-training quantization\nframework, INT-FlashAttention is also compatible with other data formats like\nINT4, etc. Experimental results show INT-FlashAttention achieves 72% faster\ninference speed and 82% smaller quantization error compared to standard\nFlashAttention with FP16 and FP8 data format.\n","authors":["Shimao Chen","Zirui Liu","Zhiying Wu","Ce Zheng","Peizhuang Cong","Zihan Jiang","Yuhan Wu","Lei Su","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2409.16997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17557v1","updated":"2024-09-26T06:10:29Z","published":"2024-09-26T06:10:29Z","title":"Joint Source-Channel Coding: Fundamentals and Recent Progress in\n Practical Designs","summary":" Semantic- and task-oriented communication has emerged as a promising approach\nto reducing the latency and bandwidth requirements of next-generation mobile\nnetworks by transmitting only the most relevant information needed to complete\na specific task at the receiver. This is particularly advantageous for\nmachine-oriented communication of high data rate content, such as images and\nvideos, where the goal is rapid and accurate inference, rather than perfect\nsignal reconstruction. While semantic- and task-oriented compression can be\nimplemented in conventional communication systems, joint source-channel coding\n(JSCC) offers an alternative end-to-end approach by optimizing compression and\nchannel coding together, or even directly mapping the source signal to the\nmodulated waveform. Although all digital communication systems today rely on\nseparation, thanks to its modularity, JSCC is known to achieve higher\nperformance in finite blocklength scenarios, and to avoid cliff and the\nlevelling-off effects in time-varying channel scenarios. This article provides\nan overview of the information theoretic foundations of JSCC, surveys practical\nJSCC designs over the decades, and discusses the reasons for their limited\nadoption in practical systems. We then examine the recent resurgence of JSCC,\ndriven by the integration of deep learning techniques, particularly through\nDeepJSCC, highlighting its many surprising advantages in various scenarios.\nFinally, we discuss why it may be time to reconsider today's strictly separate\narchitectures, and reintroduce JSCC to enable high-fidelity, low-latency\ncommunications in critical applications such as autonomous driving, drone\nsurveillance, or wearable systems.\n","authors":["Deniz Gündüz","Michèle A. Wigger","Tze-Yang Tung","Ping Zhang","Yong Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.17557v1.pdf","comment":"Under review for possible publication"},{"id":"http://arxiv.org/abs/2307.08038v2","updated":"2024-09-26T06:02:24Z","published":"2023-07-16T13:34:44Z","title":"Bivariate DeepKriging for Large-scale Spatial Interpolation of Wind\n Fields","summary":" High spatial resolution wind data are essential for a wide range of\napplications in climate, oceanographic and meteorological studies. Large-scale\nspatial interpolation or downscaling of bivariate wind fields having velocity\nin two dimensions is a challenging task because wind data tend to be\nnon-Gaussian with high spatial variability and heterogeneity. In spatial\nstatistics, cokriging is commonly used for predicting bivariate spatial fields.\nHowever, the cokriging predictor is not optimal except for Gaussian processes.\nAdditionally, cokriging is computationally prohibitive for large datasets. In\nthis paper, we propose a method, called bivariate DeepKriging, which is a\nspatially dependent deep neural network (DNN) with an embedding layer\nconstructed by spatial radial basis functions for bivariate spatial data\nprediction. We then develop a distribution-free uncertainty quantification\nmethod based on bootstrap and ensemble DNN. Our proposed approach outperforms\nthe traditional cokriging predictor with commonly used covariance functions,\nsuch as the linear model of co-regionalization and flexible bivariate Mat\\'ern\ncovariance. We demonstrate the computational efficiency and scalability of the\nproposed DNN model, with computations that are, on average, 20 times faster\nthan those of conventional techniques. We apply the bivariate DeepKriging\nmethod to the wind data over the Middle East region at 506,771 locations. The\nprediction performance of the proposed method is superior over the cokriging\npredictors and dramatically reduces computation time.\n","authors":["Pratik Nag","Ying Sun","Brian J Reich"],"pdf_url":"https://arxiv.org/pdf/2307.08038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17555v1","updated":"2024-09-26T05:57:35Z","published":"2024-09-26T05:57:35Z","title":"Advancing Open-Set Domain Generalization Using Evidential Bi-Level\n Hardest Domain Scheduler","summary":" In Open-Set Domain Generalization (OSDG), the model is exposed to both new\nvariations of data appearance (domains) and open-set conditions, where both\nknown and novel categories are present at test time. The challenges of this\ntask arise from the dual need to generalize across diverse domains and\naccurately quantify category novelty, which is critical for applications in\ndynamic environments. Recently, meta-learning techniques have demonstrated\nsuperior results in OSDG, effectively orchestrating the meta-train and -test\ntasks by employing varied random categories and predefined domain partition\nstrategies. These approaches prioritize a well-designed training schedule over\ntraditional methods that focus primarily on data augmentation and the\nenhancement of discriminative feature learning. The prevailing meta-learning\nmodels in OSDG typically utilize a predefined sequential domain scheduler to\nstructure data partitions. However, a crucial aspect that remains inadequately\nexplored is the influence brought by strategies of domain schedulers during\ntraining. In this paper, we observe that an adaptive domain scheduler benefits\nmore in OSDG compared with prefixed sequential and random domain schedulers. We\npropose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve\nan adaptive domain scheduler. This method strategically sequences domains by\nassessing their reliabilities in utilizing a follower network, trained with\nconfidence scores learned in an evidential manner, regularized by max rebiasing\ndiscrepancy, and optimized in a bi-level manner. The results show that our\nmethod substantially improves OSDG performance and achieves more discriminative\nembeddings for both the seen and unseen categories. The source code will be\navailable at https://github.com/KPeng9510/EBiL-HaDS.\n","authors":["Kunyu Peng","Di Wen","Kailun Yang","Ao Luo","Yufan Chen","Jia Fu","M. Saquib Sarfraz","Alina Roitberg","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2409.17555v1.pdf","comment":"Accepted to NeurIPS 2024. The source code will be available at\n https://github.com/KPeng9510/EBiL-HaDS"},{"id":"http://arxiv.org/abs/2406.14990v2","updated":"2024-09-26T05:51:20Z","published":"2024-06-21T09:03:37Z","title":"Learning Variable Compliance Control From a Few Demonstrations for\n Bimanual Robot with Haptic Feedback Teleoperation System","summary":" Automating dexterous, contact-rich manipulation tasks using rigid robots is a\nsignificant challenge in robotics. Rigid robots, defined by their actuation\nthrough position commands, face issues of excessive contact forces due to their\ninability to adapt to contact with the environment, potentially causing damage.\nWhile compliance control schemes have been introduced to mitigate these issues\nby controlling forces via external sensors, they are hampered by the need for\nfine-tuning task-specific controller parameters. Learning from Demonstrations\n(LfD) offers an intuitive alternative, allowing robots to learn manipulations\nthrough observed actions. In this work, we introduce a novel system to enhance\nthe teaching of dexterous, contact-rich manipulations to rigid robots. Our\nsystem is twofold: firstly, it incorporates a teleoperation interface utilizing\nVirtual Reality (VR) controllers, designed to provide an intuitive and\ncost-effective method for task demonstration with haptic feedback. Secondly, we\npresent Comp-ACT (Compliance Control via Action Chunking with Transformers), a\nmethod that leverages the demonstrations to learn variable compliance control\nfrom a few demonstrations. Our methods have been validated across various\ncomplex contact-rich manipulation tasks using single-arm and bimanual robot\nsetups in simulated and real-world environments, demonstrating the\neffectiveness of our system in teaching robots dexterous manipulations with\nenhanced adaptability and safety. Code available at:\nhttps://github.com/omron-sinicx/CompACT\n","authors":["Tatsuya Kamijo","Cristian C. Beltran-Hernandez","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2406.14990v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2409.17550v1","updated":"2024-09-26T05:39:52Z","published":"2024-09-26T05:39:52Z","title":"A Simple but Strong Baseline for Sounding Video Generation: Effective\n Adaptation of Audio and Video Diffusion Models for Joint Generation","summary":" In this work, we build a simple but strong baseline for sounding video\ngeneration. Given base diffusion models for audio and video, we integrate them\nwith additional modules into a single model and train it to make the model\njointly generate audio and video. To enhance alignment between audio-video\npairs, we introduce two novel mechanisms in our model. The first one is\ntimestep adjustment, which provides different timestep information to each base\nmodel. It is designed to align how samples are generated along with timesteps\nacross modalities. The second one is a new design of the additional modules,\ntermed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE,\ncross-modal information is embedded as if it represents temporal position\ninformation, and the embeddings are fed into the model like positional\nencoding. Compared with the popular cross-attention mechanism, CMC-PE provides\na better inductive bias for temporal alignment in the generated data.\nExperimental results validate the effectiveness of the two newly introduced\nmechanisms and also demonstrate that our method outperforms existing methods.\n","authors":["Masato Ishii","Akio Hayakawa","Takashi Shibuya","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.17550v1.pdf","comment":"The source code will be released soon"},{"id":"http://arxiv.org/abs/2409.17546v1","updated":"2024-09-26T05:25:25Z","published":"2024-09-26T05:25:25Z","title":"MASSFormer: Mobility-Aware Spectrum Sensing using Transformer-Driven\n Tiered Structure","summary":" In this paper, we develop a novel mobility-aware transformer-driven tiered\nstructure (MASSFormer) based cooperative spectrum sensing method that\neffectively models the spatio-temporal dynamics of user movements. Unlike\nexisting methods, our method considers a dynamic scenario involving mobile\nprimary users (PUs) and secondary users (SUs)and addresses the complexities\nintroduced by user mobility. The transformer architecture utilizes an attention\nmechanism, enabling the proposed method to adeptly model the temporal dynamics\nof user mobility by effectively capturing long-range dependencies within the\ninput data. The proposed method first computes tokens from the sequence of\ncovariance matrices (CMs) for each SU and processes them in parallel using the\nSUtransformer network to learn the spatio-temporal features at SUlevel.\nSubsequently, the collaborative transformer network learns the group-level PU\nstate from all SU-level feature representations. The attention-based sequence\npooling method followed by the transformer encoder adjusts the contributions of\nall tokens. The main goal of predicting the PU states at each SU-level and\ngroup-level is to improve detection performance even more. We conducted a\nsufficient amount of simulations and compared the detection performance of\ndifferent SS methods. The proposed method is tested under imperfect reporting\nchannel scenarios to show robustness. The efficacy of our method is validated\nwith the simulation results demonstrating its higher performance compared with\nexisting methods in terms of detection probability, sensing error, and\nclassification accuracy.\n","authors":["Dimpal Janu","Sandeep Mandia","Kuldeep Singh","Sandeep Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.17546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17545v1","updated":"2024-09-26T05:24:14Z","published":"2024-09-26T05:24:14Z","title":"Modulated Intervention Preference Optimization (MIPO): Keey the Easy,\n Refine the Difficult","summary":" Preference optimization methods typically begin training with a well-trained\nSFT model as a reference model. In RLHF and DPO, a regularization term is used\nduring the preference optimization process to prevent the policy model from\ndeviating too far from the reference model's distribution, thereby avoiding the\ngeneration of anomalous responses. When the reference model is already\nwell-aligned with the given data or only requires slight adjustments, this\napproach can produce a well-aligned model. However, if the reference model is\nnot aligned with the given data and requires significant deviation from its\ncurrent state, a regularization term may actually hinder the model alignment.\nIn this study, we propose \\textbf{Modulated Intervention Preference\nOptimization (MIPO)} to address this issue. MIPO modulates the degree of\nintervention from the reference model based on how well the given data is\naligned with it. If the data is well-aligned, the intervention is increased to\nprevent the policy model from diverging significantly from reference model.\nConversely, if the alignment is poor, the interference is reduced to facilitate\nmore extensive training. We compare the performance of MIPO and DPO using\nMistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental\nresults demonstrate that MIPO consistently outperforms DPO across various\nevaluation scenarios.\n","authors":["Cheolhun Jang"],"pdf_url":"https://arxiv.org/pdf/2409.17545v1.pdf","comment":"8pages, submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.17544v1","updated":"2024-09-26T05:22:16Z","published":"2024-09-26T05:22:16Z","title":"Optimizing the Induced Correlation in Omnibus Joint Graph Embeddings","summary":" Theoretical and empirical evidence suggests that joint graph embedding\nalgorithms induce correlation across the networks in the embedding space. In\nthe Omnibus joint graph embedding framework, previous results explicitly\ndelineated the dual effects of the algorithm-induced and model-inherent\ncorrelations on the correlation across the embedded networks. Accounting for\nand mitigating the algorithm-induced correlation is key to subsequent\ninference, as sub-optimal Omnibus matrix constructions have been demonstrated\nto lead to loss in inference fidelity. This work presents the first efforts to\nautomate the Omnibus construction in order to address two key questions in this\njoint embedding framework: the correlation-to-OMNI problem and the flat\ncorrelation problem. In the flat correlation problem, we seek to understand the\nminimum algorithm-induced flat correlation (i.e., the same across all graph\npairs) produced by a generalized Omnibus embedding. Working in a subspace of\nthe fully general Omnibus matrices, we prove both a lower bound for this flat\ncorrelation and that the classical Omnibus construction induces the maximal\nflat correlation. In the correlation-to-OMNI problem, we present an algorithm\n-- named corr2Omni -- that, from a given matrix of estimated pairwise graph\ncorrelations, estimates the matrix of generalized Omnibus weights that induces\noptimal correlation in the embedding space. Moreover, in both simulated and\nreal data settings, we demonstrate the increased effectiveness of our corr2Omni\nalgorithm versus the classical Omnibus construction.\n","authors":["Konstantinos Pantazis","Michael Trosset","William N. Frost","Carey E. Priebe","Vince Lyzinski"],"pdf_url":"https://arxiv.org/pdf/2409.17544v1.pdf","comment":"34 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.17538v1","updated":"2024-09-26T04:56:49Z","published":"2024-09-26T04:56:49Z","title":"On the Implicit Relation Between Low-Rank Adaptation and Differential\n Privacy","summary":" A significant approach in natural language processing involves large-scale\npre-training on general domain data followed by adaptation to specific tasks or\ndomains. As models grow in size, full fine-tuning all parameters becomes\nincreasingly impractical. To address this, some methods for low-rank task\nadaptation of language models have been proposed, e.g. LoRA and FLoRA. These\nmethods keep the pre-trained model weights fixed and incorporate trainable\nlow-rank decomposition matrices into some layers of the transformer\narchitecture, called adapters. This approach significantly reduces the number\nof trainable parameters required for downstream tasks compared to full\nfine-tuning all parameters. In this work, we look at low-rank adaptation from\nthe lens of data privacy. We show theoretically that the low-rank adaptation\nused in LoRA and FLoRA is equivalent to injecting some random noise into the\nbatch gradients w.r.t the adapter parameters coming from their full\nfine-tuning, and we quantify the variance of the injected noise. By\nestablishing a Berry-Esseen type bound on the total variation distance between\nthe noise distribution and a Gaussian distribution with the same variance, we\nshow that the dynamics of LoRA and FLoRA are very close to differentially\nprivate full fine-tuning the adapters, which suggests that low-rank adaptation\nimplicitly provides privacy w.r.t the fine-tuning data. Finally, using\nJohnson-Lindenstrauss lemma, we show that when augmented with gradient\nclipping, low-rank adaptation is almost equivalent to differentially private\nfull fine-tuning adapters with a fixed noise scale.\n","authors":["Saber Malekmohammadi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2409.17538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13689v3","updated":"2024-09-26T04:54:43Z","published":"2024-08-24T23:20:38Z","title":"Decentralised Variational Inference Frameworks for Multi-object Tracking\n on Sensor Network","summary":" This paper tackles the challenge of multi-sensor multi-object tracking by\nproposing various decentralised Variational Inference (VI) schemes that match\nthe tracking performance of centralised sensor fusion with only local message\nexchanges among neighboring sensors. We first establish a centralised VI sensor\nfusion scheme as a benchmark and analyse the limitations of its decentralised\ncounterpart, which requires sensors to await consensus at each VI iteration.\nTherefore, we propose a decentralised gradient-based VI framework that\noptimises the Locally Maximised Evidence Lower Bound (LM-ELBO) instead of the\nstandard ELBO, which reduces the parameter search space and enables faster\nconvergence, making it particularly beneficial for decentralised tracking.This\nproposed framework is inherently self-evolving, improving with advancements in\ndecentralised optimisation techniques for convergence guarantees and\nefficiency. Further, we enhance the convergence speed of proposed decentralised\nschemes using natural gradients and gradient tracking strategies. Results\nverify that our decentralised VI schemes are empirically equivalent to\ncentralised fusion in tracking performance. Notably, the decentralised natural\ngradient VI method is the most communication-efficient, with communication\ncosts comparable to suboptimal decentralised strategies while delivering\nnotably higher tracking accuracy.\n","authors":["Qing Li","Runze Gan","Simon Godsill"],"pdf_url":"https://arxiv.org/pdf/2408.13689v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15734v2","updated":"2024-09-26T04:37:48Z","published":"2024-09-24T04:39:47Z","title":"Trust-Region Sequential Quadratic Programming for Stochastic\n Optimization with Random Models","summary":" In this work, we consider solving optimization problems with a stochastic\nobjective and deterministic equality constraints. We propose a Trust-Region\nSequential Quadratic Programming method to find both first- and second-order\nstationary points. Our method utilizes a random model to represent the\nobjective function, which is constructed from stochastic observations of the\nobjective and is designed to satisfy proper adaptive accuracy conditions with a\nhigh but fixed probability. To converge to first-order stationary points, our\nmethod computes a gradient step in each iteration defined by minimizing a\nquadratic approximation of the objective subject to a (relaxed) linear\napproximation of the problem constraints and a trust-region constraint. To\nconverge to second-order stationary points, our method additionally computes an\neigen step to explore the negative curvature of the reduced Hessian matrix, as\nwell as a second-order correction step to address the potential Maratos effect,\nwhich arises due to the nonlinearity of the problem constraints. Such an effect\nmay impede the method from moving away from saddle points. Both gradient and\neigen step computations leverage a novel parameter-free decomposition of the\nstep and the trust-region radius, accounting for the proportions among the\nfeasibility residual, optimality residual, and negative curvature. We establish\nglobal almost sure first- and second-order convergence guarantees for our\nmethod, and present computational results on CUTEst problems, regression\nproblems, and saddle-point problems to demonstrate its superiority over\nexisting line-search-based stochastic methods.\n","authors":["Yuchen Fang","Sen Na","Michael W. Mahoney","Mladen Kolar"],"pdf_url":"https://arxiv.org/pdf/2409.15734v2.pdf","comment":"41 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.05316v3","updated":"2024-09-26T03:54:15Z","published":"2024-06-08T01:32:44Z","title":"CMamba: Channel Correlation Enhanced State Space Models for Multivariate\n Time Series Forecasting","summary":" Recent advancements in multivariate time series forecasting have been\npropelled by Linear-based, Transformer-based, and Convolution-based models,\nwith Transformer-based architectures gaining prominence for their efficacy in\ntemporal and cross-channel mixing. More recently, Mamba, a state space model,\nhas emerged with robust sequence and feature mixing capabilities. However, the\nsuitability of the vanilla Mamba design for time series forecasting remains an\nopen question, particularly due to its inadequate handling of cross-channel\ndependencies. Capturing cross-channel dependencies is critical in enhancing the\nperformance of multivariate time series prediction. Recent findings show that\nself-attention excels in capturing cross-channel dependencies, whereas other\nsimpler mechanisms, such as MLP, may degrade model performance. This is\ncounterintuitive, as MLP, being a learnable architecture, should theoretically\ncapture both correlations and irrelevances, potentially leading to neutral or\nimproved performance. Diving into the self-attention mechanism, we attribute\nthe observed degradation in MLP performance to its lack of data dependence and\nglobal receptive field, which result in MLP's lack of generalization ability.\nBased on the above insights, we introduce a refined Mamba variant tailored for\ntime series forecasting. Our proposed model, \\textbf{CMamba}, incorporates a\nmodified Mamba (M-Mamba) module for temporal dependencies modeling, a global\ndata-dependent MLP (GDD-MLP) to effectively capture cross-channel dependencies,\nand a Channel Mixup mechanism to mitigate overfitting. Comprehensive\nexperiments conducted on seven real-world datasets demonstrate the efficacy of\nour model in improving forecasting performance.\n","authors":["Chaolv Zeng","Zhanyu Liu","Guanjie Zheng","Linghe Kong"],"pdf_url":"https://arxiv.org/pdf/2406.05316v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17517v1","updated":"2024-09-26T03:52:41Z","published":"2024-09-26T03:52:41Z","title":"Dataset Distillation-based Hybrid Federated Learning on Non-IID Data","summary":" In federated learning, the heterogeneity of client data has a great impact on\nthe performance of model training. Many heterogeneity issues in this process\nare raised by non-independently and identically distributed (Non-IID) data.\nThis study focuses on the issue of label distribution skew. To address it, we\npropose a hybrid federated learning framework called HFLDD, which integrates\ndataset distillation to generate approximately independent and equally\ndistributed (IID) data, thereby improving the performance of model training.\nParticularly, we partition the clients into heterogeneous clusters, where the\ndata labels among different clients within a cluster are unbalanced while the\ndata labels among different clusters are balanced. The cluster headers collect\ndistilled data from the corresponding cluster members, and conduct model\ntraining in collaboration with the server. This training process is like\ntraditional federated learning on IID data, and hence effectively alleviates\nthe impact of Non-IID data on model training. Furthermore, we compare our\nproposed method with typical baseline methods on public datasets. Experimental\nresults demonstrate that when the data labels are severely imbalanced, the\nproposed HFLDD outperforms the baseline methods in terms of both test accuracy\nand communication cost.\n","authors":["Xiufang Shi","Wei Zhang","Mincheng Wu","Guangyi Liu","Zhenyu Wen","Shibo He","Tejal Shah","Rajiv Ranjan"],"pdf_url":"https://arxiv.org/pdf/2409.17517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11337v3","updated":"2024-09-26T03:51:19Z","published":"2023-02-18T07:40:03Z","title":"Bayesian Matrix Decomposition and Applications","summary":" The sole aim of this book is to give a self-contained introduction to\nconcepts and mathematical tools in Bayesian matrix decomposition in order to\nseamlessly introduce matrix decomposition techniques and their applications in\nsubsequent sections. However, we clearly realize our inability to cover all the\nuseful and interesting results concerning Bayesian matrix decomposition and\ngiven the paucity of scope to present this discussion, e.g., the separated\nanalysis of variational inference for conducting the optimization. We refer the\nreader to literature in the field of Bayesian analysis for a more detailed\nintroduction to the related fields.\n This book is primarily a summary of purpose, significance of important\nBayesian matrix decomposition methods, e.g., real-valued decomposition,\nnonnegative matrix factorization, Bayesian interpolative decomposition, and the\norigin and complexity of the methods which shed light on their applications.\nThe mathematical prerequisite is a first course in statistics and linear\nalgebra. Other than this modest background, the development is self-contained,\nwith rigorous proof provided throughout.\n","authors":["Jun Lu"],"pdf_url":"https://arxiv.org/pdf/2302.11337v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17516v1","updated":"2024-09-26T03:50:55Z","published":"2024-09-26T03:50:55Z","title":"Functional Classification of Spiking Signal Data Using Artificial\n Intelligence Techniques: A Review","summary":" Human brain neuron activities are incredibly significant nowadays. Neuronal\nbehavior is assessed by analyzing signal data such as electroencephalography\n(EEG), which can offer scientists valuable information about diseases and\nhuman-computer interaction. One of the difficulties researchers confront while\nevaluating these signals is the existence of large volumes of spike data.\nSpikes are some considerable parts of signal data that can happen as a\nconsequence of vital biomarkers or physical issues such as electrode movements.\nHence, distinguishing types of spikes is important. From this spot, the spike\nclassification concept commences. Previously, researchers classified spikes\nmanually. The manual classification was not precise enough as it involves\nextensive analysis. Consequently, Artificial Intelligence (AI) was introduced\ninto neuroscience to assist clinicians in classifying spikes correctly. This\nreview discusses the importance and use of AI in spike classification, focusing\non the recognition of neural activity noises. The task is divided into three\nmain components: preprocessing, classification, and evaluation. Existing\nmethods are introduced and their importance is determined. The review also\nhighlights the need for more efficient algorithms. The primary goal is to\nprovide a perspective on spike classification for future research and provide a\ncomprehensive understanding of the methodologies and issues involved. The\nreview organizes materials in the spike classification field for future\nstudies. In this work, numerous studies were extracted from different\ndatabases. The PRISMA-related research guidelines were then used to choose\npapers. Then, research studies based on spike classification using machine\nlearning and deep learning approaches with effective preprocessing were\nselected.\n","authors":["Danial Sharifrazi","Nouman Javed","Javad Hassannataj Joloudari","Roohallah Alizadehsani","Prasad N. Paradkar","Ru-San Tan","U. Rajendra Acharya","Asim Bhatti"],"pdf_url":"https://arxiv.org/pdf/2409.17516v1.pdf","comment":"8 figures, 32 pages"},{"id":"http://arxiv.org/abs/2409.17513v1","updated":"2024-09-26T03:48:47Z","published":"2024-09-26T03:48:47Z","title":"Comparing Unidirectional, Bidirectional, and Word2vec Models for\n Discovering Vulnerabilities in Compiled Lifted Code","summary":" Ransomware and other forms of malware cause significant financial and\noperational damage to organizations by exploiting long-standing and often\ndifficult-to-detect software vulnerabilities. To detect vulnerabilities such as\nbuffer overflows in compiled code, this research investigates the application\nof unidirectional transformer-based embeddings, specifically GPT-2. Using a\ndataset of LLVM functions, we trained a GPT-2 model to generate embeddings,\nwhich were subsequently used to build LSTM neural networks to differentiate\nbetween vulnerable and non-vulnerable code. Our study reveals that embeddings\nfrom the GPT-2 model significantly outperform those from bidirectional models\nof BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%.\nLSTM neural networks were developed with both frozen and unfrozen embedding\nmodel layers. The model with the highest performance was achieved when the\nembedding layers were unfrozen. Further, the research finds that, in exploring\nthe impact of different optimizers within this domain, the SGD optimizer\ndemonstrates superior performance over Adam. Overall, these findings reveal\nimportant insights into the potential of unidirectional transformer-based\napproaches in enhancing cybersecurity defenses.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2409.17513v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.17510v1","updated":"2024-09-26T03:40:12Z","published":"2024-09-26T03:40:12Z","title":"NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human\n Connectomes","summary":" Although modern imaging technologies allow us to study connectivity between\ntwo distinct brain regions in-vivo, an in-depth understanding of how anatomical\nstructure supports brain function and how spontaneous functional fluctuations\nemerge remarkable cognition is still elusive. Meanwhile, tremendous efforts\nhave been made in the realm of machine learning to establish the nonlinear\nmapping between neuroimaging data and phenotypic traits. However, the absence\nof neuroscience insight in the current approaches poses significant challenges\nin understanding cognitive behavior from transient neural activities. To\naddress this challenge, we put the spotlight on the coupling mechanism of\nstructural connectivity (SC) and functional connectivity (FC) by formulating\nsuch network neuroscience question into an expressive graph representation\nlearning problem for high-order topology. Specifically, we introduce the\nconcept of topological detour to characterize how a ubiquitous instance of FC\n(direct link) is supported by neural pathways (detour) physically wired by SC,\nwhich forms a cyclic loop interacted by brain structure and function. In the\nclich\\'e of machine learning, the multi-hop detour pathway underlying SC-FC\ncoupling allows us to devise a novel multi-head self-attention mechanism within\nTransformer to capture multi-modal feature representation from paired graphs of\nSC and FC. Taken together, we propose a biological-inspired deep model, coined\nas NeuroPath, to find putative connectomic feature representations from the\nunprecedented amount of neuroimages, which can be plugged into various\ndownstream applications such as task recognition and disease diagnosis. We have\nevaluated NeuroPath on large-scale public datasets including HCP and UK Biobank\nunder supervised and zero-shot learning, where the state-of-the-art performance\nby our NeuroPath indicates great potential in network neuroscience.\n","authors":["Ziquan Wei","Tingting Dan","Jiaqi Ding","Paul J Laurienti","Guorong Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17510v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17508v1","updated":"2024-09-26T03:33:26Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization, recent advances primarily focus on improving the LLM\ncomponents, while neglecting the connector that bridges the gap between\nmodalities. In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector. Extensive ablation experiments\nvalidate the effectiveness of introducing CMoE under any configuration, with up\nto an average 8% performance gains. We further provide interpretation analysis\nof the tug-of-war problem from the perspective of gradient optimization and\nparameter statistics. Compared to previous state-of-the-art medical MLLMs,\nUni-Med achieves competitive or superior evaluation metrics on diverse tasks.\nCode, data and model will be soon available at GitHub.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17505v1","updated":"2024-09-26T03:24:59Z","published":"2024-09-26T03:24:59Z","title":"Sequential Kernelized Stein Discrepancy","summary":" We present a sequential version of the kernelized Stein discrepancy, which\nallows for conducting goodness-of-fit tests for unnormalized densities that are\ncontinuously monitored and adaptively stopped. That is, the sample size need\nnot be fixed prior to data collection; the practitioner can choose whether to\nstop the test or continue to gather evidence at any time while controlling the\nfalse discovery rate. In stark contrast to related literature, we do not impose\nuniform boundedness on the Stein kernel. Instead, we exploit the potential\nboundedness of the Stein kernel at arbitrary point evaluations to define test\nmartingales, that give way to the subsequent novel sequential tests. We prove\nthe validity of the test, as well as an asymptotic lower bound for the\nlogarithmic growth of the wealth process under the alternative. We further\nillustrate the empirical performance of the test with a variety of\ndistributions, including restricted Boltzmann machines.\n","authors":["Diego Martinez-Taboada","Aaditya Ramdas"],"pdf_url":"https://arxiv.org/pdf/2409.17505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17504v1","updated":"2024-09-26T03:22:09Z","published":"2024-09-26T03:22:09Z","title":"HaloScope: Harnessing Unlabeled LLM Generations for Hallucination\n Detection","summary":" The surge in applications of large language models (LLMs) has prompted\nconcerns about the generation of misleading or fabricated information, known as\nhallucinations. Therefore, detecting hallucinations has become critical to\nmaintaining trust in LLM-generated content. A primary challenge in learning a\ntruthfulness classifier is the lack of a large amount of labeled truthful and\nhallucinated data. To address the challenge, we introduce HaloScope, a novel\nlearning framework that leverages the unlabeled LLM generations in the wild for\nhallucination detection. Such unlabeled data arises freely upon deploying LLMs\nin the open world, and consists of both truthful and hallucinated information.\nTo harness the unlabeled data, we present an automated membership estimation\nscore for distinguishing between truthful and untruthful generations within\nunlabeled mixture data, thereby enabling the training of a binary truthfulness\nclassifier on top. Importantly, our framework does not require extra data\ncollection and human annotations, offering strong flexibility and practicality\nfor real-world applications. Extensive experiments show that HaloScope can\nachieve superior hallucination detection performance, outperforming the\ncompetitive rivals by a significant margin. Code is available at\nhttps://github.com/deeplearningwisc/haloscope.\n","authors":["Xuefeng Du","Chaowei Xiao","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.17504v1.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2409.17502v1","updated":"2024-09-26T03:20:09Z","published":"2024-09-26T03:20:09Z","title":"Broadcast Product: Shape-aligned Element-wise Multiplication and Beyond","summary":" We propose a new operator defined between two tensors, the broadcast product.\nThe broadcast product calculates the Hadamard product after duplicating\nelements to align the shapes of the two tensors. Complex tensor operations in\nlibraries like \\texttt{numpy} can be succinctly represented as mathematical\nexpressions using the broadcast product. Finally, we propose a novel tensor\ndecomposition using the broadcast product, highlighting its potential\napplications in dimensionality reduction.\n","authors":["Yusuke Matsui","Tatsuya Yokota"],"pdf_url":"https://arxiv.org/pdf/2409.17502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17499v1","updated":"2024-09-26T03:12:20Z","published":"2024-09-26T03:12:20Z","title":"Does Worst-Performing Agent Lead the Pack? Analyzing Agent Dynamics in\n Unified Distributed SGD","summary":" Distributed learning is essential to train machine learning algorithms across\nheterogeneous agents while maintaining data privacy. We conduct an asymptotic\nanalysis of Unified Distributed SGD (UD-SGD), exploring a variety of\ncommunication patterns, including decentralized SGD and local SGD within\nFederated Learning (FL), as well as the increasing communication interval in\nthe FL setting. In this study, we assess how different sampling strategies,\nsuch as i.i.d. sampling, shuffling, and Markovian sampling, affect the\nconvergence speed of UD-SGD by considering the impact of agent dynamics on the\nlimiting covariance matrix as described in the Central Limit Theorem (CLT). Our\nfindings not only support existing theories on linear speedup and asymptotic\nnetwork independence, but also theoretically and empirically show how efficient\nsampling strategies employed by individual agents contribute to overall\nconvergence in UD-SGD. Simulations reveal that a few agents using highly\nefficient sampling can achieve or surpass the performance of the majority\nemploying moderately improved strategies, providing new insights beyond\ntraditional analyses focusing on the worst-performing agent.\n","authors":["Jie Hu","Yi-Ting Ma","Do Young Eun"],"pdf_url":"https://arxiv.org/pdf/2409.17499v1.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.14578v4","updated":"2024-09-26T02:59:44Z","published":"2024-05-23T13:52:36Z","title":"Surge Phenomenon in Optimal Learning Rate and Batch Size Scaling","summary":" In current deep learning tasks, Adam style optimizers such as Adam, Adagrad,\nRMSProp, Adafactor, and Lion have been widely used as alternatives to SGD style\noptimizers. These optimizers typically update model parameters using the sign\nof gradients, resulting in more stable convergence curves. The learning rate\nand the batch size are the most critical hyperparameters for optimizers, which\nrequire careful tuning to enable effective convergence. Previous research has\nshown that the optimal learning rate increases linearly or follows similar\nrules with batch size for SGD style optimizers. However, this conclusion is not\napplicable to Adam style optimizers. In this paper, we elucidate the connection\nbetween optimal learning rates and batch sizes for Adam style optimizers\nthrough both theoretical analysis and extensive experiments. First, we raise\nthe scaling law between batch sizes and optimal learning rates in the sign of\ngradient case, in which we prove that the optimal learning rate first rises and\nthen falls as the batch size increases. Moreover, the peak value of the surge\nwill gradually move toward the larger batch size as training progresses.\nSecond, we conducted experiments on various CV and NLP tasks and verified the\ncorrectness of the scaling law.\n","authors":["Shuaipeng Li","Penghao Zhao","Hailin Zhang","Xingwu Sun","Hao Wu","Dian Jiao","Weiyan Wang","Chengjun Liu","Zheng Fang","Jinbao Xue","Yangyu Tao","Bin Cui","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14578v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16663v2","updated":"2024-09-26T02:57:52Z","published":"2024-09-25T06:48:25Z","title":"Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles\n Using Latent Space Generative World Models","summary":" We propose the use of latent space generative world models to address the\ncovariate shift problem in autonomous driving. A world model is a neural\nnetwork capable of predicting an agent's next state given past states and\nactions. By leveraging a world model during training, the driving policy\neffectively mitigates covariate shift without requiring an excessive amount of\ntraining data. During end-to-end training, our policy learns how to recover\nfrom errors by aligning with states observed in human demonstrations, so that\nat runtime it can recover from perturbations outside the training distribution.\nAdditionally, we introduce a novel transformer-based perception encoder that\nemploys multi-view cross-attention and a learned scene query. We present\nqualitative and quantitative results, demonstrating significant improvements\nupon prior state of the art in closed-loop testing in the CARLA simulator, as\nwell as showing the ability to handle perturbations in both CARLA and NVIDIA's\nDRIVE Sim.\n","authors":["Alexander Popov","Alperen Degirmenci","David Wehr","Shashank Hegde","Ryan Oldja","Alexey Kamenev","Bertrand Douillard","David Nistér","Urs Muller","Ruchi Bhargava","Stan Birchfield","Nikolai Smolyanskiy"],"pdf_url":"https://arxiv.org/pdf/2409.16663v2.pdf","comment":"7 pages, 6 figures, for ICRA 2025 conference, for associated video\n file, see https://youtu.be/fO7RZ57gVxk"},{"id":"http://arxiv.org/abs/2409.17490v1","updated":"2024-09-26T02:54:19Z","published":"2024-09-26T02:54:19Z","title":"MathDSL: A Domain-Specific Language for Concise Mathematical Solutions\n Via Program Synthesis","summary":" We present MathDSL, a Domain-Specific Language (DSL) for mathematical\nequation solving, which, when deployed in program synthesis models, outperforms\nstate-of-the-art reinforcement-learning-based methods. We also introduce a\nquantitative metric for measuring the conciseness of a mathematical solution\nand demonstrate the improvement in the quality of generated solutions compared\nto other methods. Our system demonstrates that a program synthesis system\n(DreamCoder) using MathDSL can generate programs that solve linear equations\nwith greater accuracy and conciseness than using reinforcement learning\nsystems. Additionally, we demonstrate that if we use the action spaces of\nprevious reinforcement learning systems as DSLs, MathDSL outperforms the\naction-space-DSLs. We use DreamCoder to store equation-solving strategies as\nlearned abstractions in its program library and demonstrate that by using\nMathDSL, these can be converted into human-interpretable solution strategies\nthat could have applications in mathematical education.\n","authors":["Sagnik Anupam","Maddy Bowers","Omar Costilla-Reyes","Armando Solar-Lezama"],"pdf_url":"https://arxiv.org/pdf/2409.17490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04585v2","updated":"2024-09-26T02:53:15Z","published":"2024-01-09T14:42:49Z","title":"EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization\n of Diffusion Models","summary":" Diffusion models have achieved great success in image generation tasks\nthrough iterative noise estimation. However, the heavy denoising process and\ncomplex neural networks hinder their low-latency applications in real-world\nscenarios. Quantization can effectively reduce model complexity, and\npost-training quantization (PTQ), which does not require fine-tuning, is highly\npromising for compressing and accelerating diffusion models. Unfortunately, we\nfind that due to the highly dynamic distribution of activations in different\ndenoising steps, existing PTQ methods for diffusion models suffer from\ndistribution mismatch issues at both calibration sample level and\nreconstruction output level, which makes the performance far from satisfactory,\nespecially in low-bit cases. In this paper, we propose Enhanced Distribution\nAlignment for Post-Training Quantization of Diffusion Models (EDA-DM) to\naddress the above issues. Specifically, at the calibration sample level, we\nselect calibration samples based on the density and variety in the latent\nspace, thus facilitating the alignment of their distribution with the overall\nsamples; and at the reconstruction output level, we modify the loss of block\nreconstruction with the losses of layers, aligning the outputs of quantized\nmodel and full-precision model at different network granularity. Extensive\nexperiments demonstrate that EDA-DM significantly outperforms the existing PTQ\nmethods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and\ndifferent datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO).\n","authors":["Xuewen Liu","Zhikai Li","Junrui Xiao","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2401.04585v2.pdf","comment":"Code: http://github.com/BienLuky/EDA-DM"}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.04591v4","updated":"2024-09-26T16:35:58Z","published":"2023-11-08T10:45:09Z","title":"Exploring Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nAlthough traditional cameras are commonly applied, their reliability decreases\nin scenarios under high dynamic range or heavy motion blur, where event cameras\noffer a robust solution. Predominant event-based methods accumulate events into\nframes, ignoring the asynchronous and high temporal resolution that is crucial\nfor distinguishing distinct actions. To address this issue and to unlock the 3D\npotential of event information, we introduce two 3D event representations: the\nRasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The\nRasEPC aggregates events within concise temporal slices at identical positions,\npreserving their 3D attributes along with statistical information, thereby\nsignificantly reducing memory and computational demands. Meanwhile, the DEV\nrepresentation discretizes events into voxels and projects them across three\northogonal planes, utilizing decoupled event attention to retrieve 3D cues from\nthe 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic\nevent-based dataset crafted to facilitate training and quantitative analysis in\noutdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD\ndataset, and our EV-3DPW dataset, with further qualitative validation via a\nderived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our\ncode and dataset have been made publicly available at\nhttps://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v4.pdf","comment":"Accepted to Computer Vision and Image Understanding (CVPU). Extended\n version of arXiv:2206.04511. The code and dataset are available at\n https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2409.17899v1","updated":"2024-09-26T14:49:09Z","published":"2024-09-26T14:49:09Z","title":"Revisiting Acoustic Similarity in Emotional Speech and Music via\n Self-Supervised Representations","summary":" Emotion recognition from speech and music shares similarities due to their\nacoustic overlap, which has led to interest in transferring knowledge between\nthese domains. However, the shared acoustic cues between speech and music,\nparticularly those encoded by Self-Supervised Learning (SSL) models, remain\nlargely unexplored, given the fact that SSL models for speech and music have\nrarely been applied in cross-domain research. In this work, we revisit the\nacoustic similarity between emotion speech and music, starting with an analysis\nof the layerwise behavior of SSL models for Speech Emotion Recognition (SER)\nand Music Emotion Recognition (MER). Furthermore, we perform cross-domain\nadaptation by comparing several approaches in a two-stage fine-tuning process,\nexamining effective ways to utilize music for SER and speech for MER. Lastly,\nwe explore the acoustic similarities between emotional speech and music using\nFrechet audio distance for individual emotions, uncovering the issue of emotion\nbias in both speech and music SSL models. Our findings reveal that while speech\nand music SSL models do capture shared acoustic features, their behaviors can\nvary depending on different emotions due to their training strategies and\ndomain-specificities. Additionally, parameter-efficient fine-tuning can enhance\nSER and MER performance by leveraging knowledge from each other. This study\nprovides new insights into the acoustic similarity between emotional speech and\nmusic, and highlights the potential for cross-domain generalization to improve\nSER and MER systems.\n","authors":["Yujia Sun","Zeyu Zhao","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.17899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17864v1","updated":"2024-09-26T14:12:23Z","published":"2024-09-26T14:12:23Z","title":"A Multimodal Single-Branch Embedding Network for Recommendation in\n Cold-Start and Missing Modality Scenarios","summary":" Most recommender systems adopt collaborative filtering (CF) and provide\nrecommendations based on past collective interactions. Therefore, the\nperformance of CF algorithms degrades when few or no interactions are\navailable, a scenario referred to as cold-start. To address this issue,\nprevious work relies on models leveraging both collaborative data and side\ninformation on the users or items. Similar to multimodal learning, these models\naim at combining collaborative and content representations in a shared\nembedding space. In this work we propose a novel technique for multimodal\nrecommendation, relying on a multimodal Single-Branch embedding network for\nRecommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction\ndata as well as multimodal side information using the same single-branch\nembedding network on different modalities. This makes SiBraR effective in\nscenarios of missing modality, including cold start. Our extensive experiments\non large-scale recommendation datasets from three different recommendation\ndomains (music, movie, and e-commerce) and providing multimodal content\ninformation (audio, text, image, labels, and interactions) show that SiBraR\nsignificantly outperforms CF as well as state-of-the-art content-based RSs in\ncold-start scenarios, and is competitive in warm scenarios. We show that\nSiBraR's recommendations are accurate in missing modality scenarios, and that\nthe model is able to map different modalities to the same region of the shared\nembedding space, hence reducing the modality gap.\n","authors":["Christian Ganhör","Marta Moscati","Anna Hausberger","Shah Nawaz","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2409.17864v1.pdf","comment":"Accepted at 18th ACM Conference on Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2409.17678v1","updated":"2024-09-26T09:37:04Z","published":"2024-09-26T09:37:04Z","title":"Modeling the Popularity of Events on Web by Sparsity and\n Mutual-Excitation Guided Graph Neural Network","summary":" The content of a webpage described or posted an event in the cyberspace\ninevitably reflects viewpoints, values and trends of the physical society.\nMapping an event on web to the popularity score plays a pivot role to sense the\nsocial trends from the cyberspace. However, the complex semantic correspondence\nbetween texts and images, as well as the implicit text-image-popularity mapping\nmechanics pose a significant challenge to this non-trivial task. In this paper,\nwe address this problem from a viewpoint of understanding the interpretable\nmapping mechanics. Concretely, we organize the keywords from different events\ninto an unified graph. The unified graph facilitates to model the popularity of\nevents via two-level mappings, i.e., the self excitation and the mutual\nexcitation. The self-excitation assumes that each keyword forms the popularity\nwhile the mutual-excitation models that two keywords would excite each other to\ndetermine the popularity of an event. Specifically, we use Graph Neural Network\n(GNN) as the backbone to model the self-excitation, the mutual excitation and\nthe context of images into a sparse and deep factor model. Besides, to our best\nknowledge, we release a challenge web event dataset for the popularity\nprediction task. The experimental results on three public datasets demonstrate\nthat our method achieves significant improvements and outperforms the\nstate-of-the-art methods. Dataset is publicly available at:\nhttps://github.com/pangjunbiao/Hot-events-dataset.\n","authors":["Jiaxin Deng","Linlin Jia","Junbiao Pang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17596v1","updated":"2024-09-26T07:22:38Z","published":"2024-09-26T07:22:38Z","title":"Subjective and Objective Quality-of-Experience Evaluation Study for Live\n Video Streaming","summary":" In recent years, live video streaming has gained widespread popularity across\nvarious social media platforms. Quality of experience (QoE), which reflects\nend-users' satisfaction and overall experience, plays a critical role for media\nservice providers to optimize large-scale live compression and transmission\nstrategies to achieve perceptually optimal rate-distortion trade-off. Although\nmany QoE metrics for video-on-demand (VoD) have been proposed, there remain\nsignificant challenges in developing QoE metrics for live video streaming. To\nbridge this gap, we conduct a comprehensive study of subjective and objective\nQoE evaluations for live video streaming. For the subjective QoE study, we\nintroduce the first live video streaming QoE dataset, TaoLive QoE, which\nconsists of $42$ source videos collected from real live broadcasts and $1,155$\ncorresponding distorted ones degraded due to a variety of streaming\ndistortions, including conventional streaming distortions such as compression,\nstalling, as well as live streaming-specific distortions like frame skipping,\nvariable frame rate, etc. Subsequently, a human study was conducted to derive\nsubjective QoE scores of videos in the TaoLive QoE dataset. For the objective\nQoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well\nas publicly available QoE datasets for VoD scenarios, highlighting that current\nmodels struggle to accurately assess video QoE, particularly for live content.\nHence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates\nmulti-scale semantic features and optical flow-based motion features to\npredicting a retrospective QoE score, eliminating reliance on statistical\nquality of service (QoS) features.\n","authors":["Zehao Zhu","Wei Sun","Jun Jia","Wei Wu","Sibin Deng","Kai Li","Ying Chen","Xiongkuo Min","Jia Wang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.17596v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17550v1","updated":"2024-09-26T05:39:52Z","published":"2024-09-26T05:39:52Z","title":"A Simple but Strong Baseline for Sounding Video Generation: Effective\n Adaptation of Audio and Video Diffusion Models for Joint Generation","summary":" In this work, we build a simple but strong baseline for sounding video\ngeneration. Given base diffusion models for audio and video, we integrate them\nwith additional modules into a single model and train it to make the model\njointly generate audio and video. To enhance alignment between audio-video\npairs, we introduce two novel mechanisms in our model. The first one is\ntimestep adjustment, which provides different timestep information to each base\nmodel. It is designed to align how samples are generated along with timesteps\nacross modalities. The second one is a new design of the additional modules,\ntermed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE,\ncross-modal information is embedded as if it represents temporal position\ninformation, and the embeddings are fed into the model like positional\nencoding. Compared with the popular cross-attention mechanism, CMC-PE provides\na better inductive bias for temporal alignment in the generated data.\nExperimental results validate the effectiveness of the two newly introduced\nmechanisms and also demonstrate that our method outperforms existing methods.\n","authors":["Masato Ishii","Akio Hayakawa","Takashi Shibuya","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.17550v1.pdf","comment":"The source code will be released soon"},{"id":"http://arxiv.org/abs/2408.00970v2","updated":"2024-09-26T03:56:00Z","published":"2024-08-02T01:30:18Z","title":"Multimodal Fusion via Hypergraph Autoencoder and Contrastive Learning\n for Emotion Recognition in Conversation","summary":" Multimodal emotion recognition in conversation (MERC) seeks to identify the\nspeakers' emotions expressed in each utterance, offering significant potential\nacross diverse fields. The challenge of MERC lies in balancing speaker modeling\nand context modeling, encompassing both long-distance and short-distance\ncontexts, as well as addressing the complexity of multimodal information\nfusion. Recent research adopts graph-based methods to model intricate\nconversational relationships effectively. Nevertheless, the majority of these\nmethods utilize a fixed fully connected structure to link all utterances,\nrelying on convolution to interpret complex context. This approach can\ninherently heighten the redundancy in contextual messages and excessive graph\nnetwork smoothing, particularly in the context of long-distance conversations.\nTo address this issue, we propose a framework that dynamically adjusts\nhypergraph connections by variational hypergraph autoencoder (VHGAE), and\nemploys contrastive learning to mitigate uncertainty factors during the\nreconstruction process. Experimental results demonstrate the effectiveness of\nour proposal against the state-of-the-art methods on IEMOCAP and MELD datasets.\nWe release the code to support the reproducibility of this work at\nhttps://github.com/yzjred/-HAUCL.\n","authors":["Zijian Yi","Ziming Zhao","Zhishu Shen","Tiehua Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.00970v2.pdf","comment":"Accepted by ACM MULTIMEDIA 2024"},{"id":"http://arxiv.org/abs/2404.09245v2","updated":"2024-09-26T01:25:22Z","published":"2024-04-14T13:14:13Z","title":"Arena: A Patch-of-Interest ViT Inference Acceleration System for\n Edge-Assisted Video Analytics","summary":" The advent of edge computing has made real-time intelligent video analytics\nfeasible. Previous works, based on traditional model architecture (e.g., CNN,\nRNN, etc.), employ various strategies to filter out non-region-of-interest\ncontent to minimize bandwidth and computation consumption but show inferior\nperformance in adverse environments. Recently, visual foundation models based\non transformers have shown great performance in adverse environments due to\ntheir amazing generalization capability. However, they require a large amount\nof computation power, which limits their applications in real-time intelligent\nvideo analytics. In this paper, we find visual foundation models like Vision\nTransformer (ViT) also have a dedicated acceleration mechanism for video\nanalytics. To this end, we introduce Arena, an end-to-end edge-assisted video\ninference acceleration system based on ViT. We leverage the capability of ViT\nthat can be accelerated through token pruning by only offloading and feeding\nPatches-of-Interest to the downstream models. Additionally, we design an\nadaptive keyframe inference switching algorithm tailored to different videos,\ncapable of adapting to the current video content to jointly optimize accuracy\nand bandwidth. Through extensive experiments, our findings reveal that Arena\ncan boost inference speeds by up to 1.58\\(\\times\\) and 1.82\\(\\times\\) on\naverage while consuming only 47\\% and 31\\% of the bandwidth, respectively, all\nwith high inference accuracy.\n","authors":["Haosong Peng","Wei Feng","Hao Li","Yufeng Zhan","Ren Jin","Yuanqing Xia"],"pdf_url":"https://arxiv.org/pdf/2404.09245v2.pdf","comment":null}]},"2024-09-25T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.17433v1","updated":"2024-09-25T23:52:17Z","published":"2024-09-25T23:52:17Z","title":"HDFlow: Enhancing LLM Complex Problem-Solving with Hybrid Thinking and\n Dynamic Workflows","summary":" Despite recent advancements in large language models (LLMs), their\nperformance on complex reasoning problems requiring multi-step thinking and\ncombining various skills is still limited. To address this, we propose a novel\nframework HDFlow for complex reasoning with LLMs that combines fast and slow\nthinking modes in an adaptive manner. Our approach consists of two key\ncomponents: 1) a new approach for slow, deliberate reasoning called Dynamic\nWorkflow, which automatically decomposes complex problems into more manageable\nsub-tasks and dynamically designs a workflow to assemble specialized LLM or\nsymbolic reasoning tools to solve sub-tasks; 2) Hybrid Thinking, a general\nframework that dynamically combines fast and slow thinking based on problem\ncomplexity. Finally, we propose an easy-to-scale method for automatically\nsynthesizing a large-scale dataset of 27K challenging reasoning problems for\ncomplex reasoning and a hybrid thinking tuning method that trains smaller LLMs\non this dataset to internalize the fast/slow hybrid reasoning strategies.\nExperiments on four reasoning benchmark datasets demonstrate that our slow\nthinking with dynamic workflows significantly outperforms Chain-of-Thought, and\nhybrid thinking achieves the highest accuracy while providing an effective\nbalance between computational efficiency and performance. Fine-tuning using our\nhybrid thinking approach also significantly boosts the complex reasoning\ncapabilities of open-source language models. The results showcase the promise\nof slow thinking, dynamic workflows, and hybrid thinking in expanding the\nfrontier of complex problem-solving with LLMs\\footnote{Code and data will be\nreleased at \\url{https://github.com/wenlinyao/HDFlow}.}.\n","authors":["Wenlin Yao","Haitao Mi","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2409.17433v1.pdf","comment":"27 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.17431v1","updated":"2024-09-25T23:38:15Z","published":"2024-09-25T23:38:15Z","title":"On Extending Direct Preference Optimization to Accommodate Ties","summary":" We derive and investigate two DPO variants that explicitly model the\npossibility of declaring a tie in pair-wise comparisons. We replace the\nBradley-Terry model in DPO with two well-known modeling extensions, by Rao and\nKupper and by Davidson, that assign probability to ties as alternatives to\nclear preferences. Our experiments in neural machine translation and\nsummarization show that explicitly labeled ties can be added to the datasets\nfor these DPO variants without the degradation in task performance that is\nobserved when the same tied pairs are presented to DPO. We find empirically\nthat the inclusion of ties leads to stronger regularization with respect to the\nreference policy as measured by KL divergence, and we see this even for DPO in\nits original form. These findings motivate and enable the inclusion of tied\npairs in preference optimization as opposed to simply discarding them.\n","authors":["Jinghong Chen","Guangyu Yang","Weizhe Lin","Jingbiao Mei","Bill Byrne"],"pdf_url":"https://arxiv.org/pdf/2409.17431v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2409.17422v1","updated":"2024-09-25T23:14:47Z","published":"2024-09-25T23:14:47Z","title":"Discovering the Gems in Early Layers: Accelerating Long-Context LLMs\n with 1000x Input Token Reduction","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nhandling long context inputs, but this comes at the cost of increased\ncomputational resources and latency. Our research introduces a novel approach\nfor the long context bottleneck to accelerate LLM inference and reduce GPU\nmemory consumption. Our research demonstrates that LLMs can identify relevant\ntokens in the early layers before generating answers to a query. Leveraging\nthis insight, we propose an algorithm that uses early layers of an LLM as\nfilters to select and compress input tokens, significantly reducing the context\nlength for subsequent processing. Our method, GemFilter, demonstrates\nsubstantial improvements in both speed and memory efficiency compared to\nexisting techniques, such as standard attention and SnapKV/H2O. Notably, it\nachieves a 2.4$\\times$ speedup and 30\\% reduction in GPU memory usage compared\nto SOTA methods. Evaluation on the Needle in a Haystack task shows that\nGemFilter significantly outperforms standard attention, SnapKV and demonstrates\ncomparable performance on the LongBench challenge. GemFilter is simple,\ntraining-free, and broadly applicable across different LLMs. Crucially, it\nprovides interpretability by allowing humans to inspect the selected input\nsequence. These findings not only offer practical benefits for LLM deployment,\nbut also enhance our understanding of LLM internal mechanisms, paving the way\nfor further optimizations in LLM design and inference. Our code is available at\n\\url{https://github.com/SalesforceAIResearch/GemFilter}.\n","authors":["Zhenmei Shi","Yifei Ming","Xuan-Phi Nguyen","Yingyu Liang","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2409.17422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17419v1","updated":"2024-09-25T23:06:55Z","published":"2024-09-25T23:06:55Z","title":"Pre-Finetuning with Impact Duration Awareness for Stock Movement\n Prediction","summary":" Understanding the duration of news events' impact on the stock market is\ncrucial for effective time-series forecasting, yet this facet is largely\noverlooked in current research. This paper addresses this research gap by\nintroducing a novel dataset, the Impact Duration Estimation Dataset (IDED),\nspecifically designed to estimate impact duration based on investor opinions.\nOur research establishes that pre-finetuning language models with IDED can\nenhance performance in text-based stock movement predictions. In addition, we\njuxtapose our proposed pre-finetuning task with sentiment analysis\npre-finetuning, further affirming the significance of learning impact duration.\nOur findings highlight the promise of this novel research direction in stock\nmovement prediction, offering a new avenue for financial forecasting. We also\nprovide the IDED and pre-finetuned language models under the CC BY-NC-SA 4.0\nlicense for academic use, fostering further exploration in this field.\n","authors":["Chr-Jr Chiu","Chung-Chi Chen","Hen-Hsen Huang","Hsin-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17419v1.pdf","comment":"NTCIR-18 FinArg-2 Dataset"},{"id":"http://arxiv.org/abs/2409.17417v1","updated":"2024-09-25T23:00:20Z","published":"2024-09-25T23:00:20Z","title":"Enhancing Investment Opinion Ranking through Argument-Based Sentiment\n Analysis","summary":" In the era of rapid Internet and social media platform development,\nindividuals readily share their viewpoints online. The overwhelming quantity of\nthese posts renders comprehensive analysis impractical. This necessitates an\nefficient recommendation system to filter and present significant, relevant\nopinions. Our research introduces a dual-pronged argument mining technique to\nimprove recommendation system effectiveness, considering both professional and\namateur investor perspectives. Our first strategy involves using the\ndiscrepancy between target and closing prices as an opinion indicator. The\nsecond strategy applies argument mining principles to score investors'\nopinions, subsequently ranking them by these scores. Experimental results\nconfirm the effectiveness of our approach, demonstrating its ability to\nidentify opinions with higher profit potential. Beyond profitability, our\nresearch extends to risk analysis, examining the relationship between\nrecommended opinions and investor behaviors. This offers a holistic view of\npotential outcomes following the adoption of these recommended opinions.\n","authors":["Chung-Chi Chen","Hen-Hsen Huang","Hsin-Hsi Chen","Hiroya Takamura","Ichiro Kobayashi","Yusuke Miyao"],"pdf_url":"https://arxiv.org/pdf/2409.17417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17416v1","updated":"2024-09-25T22:57:29Z","published":"2024-09-25T22:57:29Z","title":"From Deception to Detection: The Dual Roles of Large Language Models in\n Fake News","summary":" Fake news poses a significant threat to the integrity of information\necosystems and public trust. The advent of Large Language Models (LLMs) holds\nconsiderable promise for transforming the battle against fake news. Generally,\nLLMs represent a double-edged sword in this struggle. One major concern is that\nLLMs can be readily used to craft and disseminate misleading information on a\nlarge scale. This raises the pressing questions: Can LLMs easily generate\nbiased fake news? Do all LLMs have this capability? Conversely, LLMs offer\nvaluable prospects for countering fake news, thanks to their extensive\nknowledge of the world and robust reasoning capabilities. This leads to other\ncritical inquiries: Can we use LLMs to detect fake news, and do they outperform\ntypical detection models? In this paper, we aim to address these pivotal\nquestions by exploring the performance of various LLMs. Our objective is to\nexplore the capability of various LLMs in effectively combating fake news,\nmarking this as the first investigation to analyze seven such models. Our\nresults reveal that while some models adhere strictly to safety protocols,\nrefusing to generate biased or misleading content, other models can readily\nproduce fake news across a spectrum of biases. Additionally, our results show\nthat larger models generally exhibit superior detection abilities and that\nLLM-generated fake news are less likely to be detected than human-written ones.\nFinally, our findings demonstrate that users can benefit from LLM-generated\nexplanations in identifying fake news.\n","authors":["Dorsaf Sallami","Yuan-Chen Chang","Esma Aïmeur"],"pdf_url":"https://arxiv.org/pdf/2409.17416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17407v1","updated":"2024-09-25T22:30:42Z","published":"2024-09-25T22:30:42Z","title":"Post-hoc Reward Calibration: A Case Study on Length Bias","summary":" Reinforcement Learning from Human Feedback aligns the outputs of Large\nLanguage Models with human values and preferences. Central to this process is\nthe reward model (RM), which translates human feedback into training signals\nfor optimising LLM behaviour. However, RMs can develop biases by exploiting\nspurious correlations in their training data, such as favouring outputs based\non length or style rather than true quality. These biases can lead to incorrect\noutput rankings, sub-optimal model evaluations, and the amplification of\nundesirable behaviours in LLMs alignment. This paper addresses the challenge of\ncorrecting such biases without additional data and training, introducing the\nconcept of Post-hoc Reward Calibration. We first propose an intuitive approach\nto estimate the bias term and, thus, remove it to approximate the underlying\ntrue reward. We then extend the approach to a more general and robust form with\nthe Locally Weighted Regression. Focusing on the prevalent length bias, we\nvalidate our proposed approaches across three experimental settings,\ndemonstrating consistent improvements: (1) a 3.11 average performance gain\nacross 33 reward models on the RewardBench dataset; (2) enhanced alignment of\nRM rankings with GPT-4 evaluations and human preferences based on the\nAlpacaEval benchmark; and (3) improved Length-Controlled win rate of the RLHF\nprocess in multiple LLM--RM combinations. Our method is computationally\nefficient and generalisable to other types of bias and RMs, offering a scalable\nand robust solution for mitigating biases in LLM alignment. Our code and\nresults are available at https://github.com/ZeroYuHuang/Reward-Calibration.\n","authors":["Zeyu Huang","Zihan Qiu","Zili Wang","Edoardo M. Ponti","Ivan Titov"],"pdf_url":"https://arxiv.org/pdf/2409.17407v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2409.13221v2","updated":"2024-09-25T22:28:06Z","published":"2024-09-20T05:15:38Z","title":"RLHFuse: Efficient RLHF Training for Large Language Models with Inter-\n and Intra-Stage Fusion","summary":" Reinforcement Learning from Human Feedback (RLHF) enhances the alignment\nbetween LLMs and human preference. The workflow of RLHF typically involves\nseveral models and tasks in a series of distinct stages. Existing RLHF training\nsystems view each task as the smallest execution unit thus overlooking the\nopportunities for subtask-level optimizations. Due to the intrinsic nature of\nRLHF training, i.e., the data skewness in the generation stage, and the\npipeline bubbles in the training stage, existing RLHF systems suffer from low\nGPU utilization in production deployments.\n RLHFuse breaks the traditional view of RLHF workflow as a composition of\nindividual tasks, splitting each task into finer-grained subtasks, and\nperforming stage fusion to improve GPU utilization. RLHFuse contains two key\nideas. First, for generation and inference tasks, RLHFuse splits them into\nsample-level subtasks, enabling efficient inter-stage fusion to mitigate the\noriginal generation bottleneck dominated by long-tailed samples. Second, for\ntraining tasks, RLHFuse breaks them into subtasks of micro-batches. By\nleveraging the intuition that pipeline execution can be essentially\ncomplemented by another pipeline, RLHFuse performs intra-stage fusion to\nconcurrently execute these subtasks in the training stage with a fused pipeline\nschedule, resulting in fewer pipeline bubbles. In addition, RLHFuse\nincorporates a series of system optimizations tailored for each stage of RLHF,\nmaking it efficient and scalable for our internal product usage. We evaluate\nRLHFuse on various popular LLMs and the results show that RLHFuse increases the\ntraining throughput by up to 3.7x, compared to existing state-of-the-art\nsystems.\n","authors":["Yinmin Zhong","Zili Zhang","Bingyang Wu","Shengyu Liu","Yukun Chen","Changyi Wan","Hanpeng Hu","Lei Xia","Ranchen Ming","Yibo Zhu","Xin Jin"],"pdf_url":"https://arxiv.org/pdf/2409.13221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17397v1","updated":"2024-09-25T22:14:34Z","published":"2024-09-25T22:14:34Z","title":"Severity Prediction in Mental Health: LLM-based Creation, Analysis,\n Evaluation of a Novel Multilingual Dataset","summary":" Large Language Models (LLMs) are increasingly integrated into various medical\nfields, including mental health support systems. However, there is a gap in\nresearch regarding the effectiveness of LLMs in non-English mental health\nsupport applications. To address this problem, we present a novel multilingual\nadaptation of widely-used mental health datasets, translated from English into\nsix languages (Greek, Turkish, French, Portuguese, German, and Finnish). This\ndataset enables a comprehensive evaluation of LLM performance in detecting\nmental health conditions and assessing their severity across multiple\nlanguages. By experimenting with GPT and Llama, we observe considerable\nvariability in performance across languages, despite being evaluated on the\nsame translated dataset. This inconsistency underscores the complexities\ninherent in multilingual mental health support, where language-specific nuances\nand mental health data coverage can affect the accuracy of the models. Through\ncomprehensive error analysis, we emphasize the risks of relying exclusively on\nlarge language models (LLMs) in medical settings (e.g., their potential to\ncontribute to misdiagnoses). Moreover, our proposed approach offers significant\ncost savings for multilingual tasks, presenting a major advantage for\nbroad-scale implementation.\n","authors":["Konstantinos Skianis","John Pavlopoulos","A. Seza Doğruöz"],"pdf_url":"https://arxiv.org/pdf/2409.17397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17391v1","updated":"2024-09-25T22:08:31Z","published":"2024-09-25T22:08:31Z","title":"Scaling Behavior for Large Language Models regarding Numeral Systems: An\n Example using Pythia","summary":" Though Large Language Models (LLMs) have shown remarkable abilities in\nmathematics reasoning, they are still struggling with performing numeric\noperations accurately, such as addition and multiplication. Numbers can be\ntokenized into tokens in various ways by different LLMs and affect the numeric\noperations performance. Currently, there are two representatives: 1) Tokenize\ninto $1$-digit, and 2) Tokenize into $1\\sim 3$ digit. The difference is roughly\nequivalent to using different numeral systems (namely base $10$ or base\n$10^{3}$). In light of this, we study the scaling behavior of different numeral\nsystems in the context of transformer-based large language models. We\nempirically show that a base $10$ system is consistently more data-efficient\nthan a base $10^{2}$ or $10^{3}$ system across training data scale, model sizes\nunder from-scratch training settings, while different number systems have very\nsimilar fine-tuning performances. We attribute this to higher token frequencies\nof a base $10$ system. Additionally, we reveal extrapolation behavior patterns\non addition and multiplication. We identify that base $100$ and base $1000$\nsystems struggle on token-level discernment and token-level operations. We also\nsheds light on the mechanism learnt by the models.\n","authors":["Zhejian Zhou","Jiayu Wang","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2409.17391v1.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17373v1","updated":"2024-09-25T21:32:57Z","published":"2024-09-25T21:32:57Z","title":"data2lang2vec: Data Driven Typological Features Completion","summary":" Language typology databases enhance multi-lingual Natural Language Processing\n(NLP) by improving model adaptability to diverse linguistic structures. The\nwidely-used lang2vec toolkit integrates several such databases, but its\ncoverage remains limited at 28.9\\%. Previous work on automatically increasing\ncoverage predicts missing values based on features from other languages or\nfocuses on single features, we propose to use textual data for better-informed\nfeature prediction. To this end, we introduce a multi-lingual Part-of-Speech\n(POS) tagger, achieving over 70\\% accuracy across 1,749 languages, and\nexperiment with external statistical features and a variety of machine learning\nalgorithms. We also introduce a more realistic evaluation setup, focusing on\nlikely to be missing typology features, and show that our approach outperforms\nprevious work in both setups.\n","authors":["Hamidreza Amirzadeh","Sadegh Jafari","Anika Harju","Rob van der Goot"],"pdf_url":"https://arxiv.org/pdf/2409.17373v1.pdf","comment":"9 pages, 11 figures"},{"id":"http://arxiv.org/abs/2409.17353v1","updated":"2024-09-25T20:59:12Z","published":"2024-09-25T20:59:12Z","title":"Internalizing ASR with Implicit Chain of Thought for Efficient\n Speech-to-Speech Conversational LLM","summary":" Current speech-based LLMs are predominantly trained on extensive ASR and TTS\ndatasets, excelling in tasks related to these domains. However, their ability\nto handle direct speech-to-speech conversations remains notably constrained.\nThese models often rely on an ASR-to-TTS chain-of-thought pipeline, converting\nspeech into text for processing before generating audio responses, which\nintroduces latency and loses audio features. We propose a method that\nimplicitly internalizes ASR chain of thought into a speech LLM, enhancing its\nnative speech understanding capabilities. Our approach reduces latency and\nimproves the model's native understanding of speech, paving the way for more\nefficient and natural real-time audio interactions. We also release a\nlarge-scale synthetic conversational dataset to facilitate further research.\n","authors":["Robin Shing-Hei Yuen","Timothy Tin-Long Tse","Jian Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.17353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05840v3","updated":"2024-09-25T20:50:00Z","published":"2024-08-11T18:22:12Z","title":"Iterative Improvement of an Additively Regularized Topic Model","summary":" Topic modelling is fundamentally a soft clustering problem (of known objects\n-- documents, over unknown clusters -- topics). That is, the task is\nincorrectly posed. In particular, the topic models are unstable and incomplete.\nAll this leads to the fact that the process of finding a good topic model\n(repeated hyperparameter selection, model training, and topic quality\nassessment) can be particularly long and labor-intensive. We aim to simplify\nthe process, to make it more deterministic and provable. To this end, we\npresent a method for iterative training of a topic model. The essence of the\nmethod is that a series of related topic models are trained so that each\nsubsequent model is at least as good as the previous one, i.e., that it retains\nall the good topics found earlier. The connection between the models is\nachieved by additive regularization. The result of this iterative training is\nthe last topic model in the series, which we call the iteratively updated\nadditively regularized topic model (ITAR). Experiments conducted on several\ncollections of natural language texts show that the proposed ITAR model\nperforms better than other popular topic models (LDA, ARTM, BERTopic), its\ntopics are diverse, and its perplexity (ability to \"explain\" the underlying\ndata) is moderate.\n","authors":["Alex Gorbulev","Vasiliy Alekseev","Konstantin Vorontsov"],"pdf_url":"https://arxiv.org/pdf/2408.05840v3.pdf","comment":"Make the last little additions to the draft"},{"id":"http://arxiv.org/abs/2409.17326v1","updated":"2024-09-25T20:05:45Z","published":"2024-09-25T20:05:45Z","title":"How Transliterations Improve Crosslingual Alignment","summary":" Recent studies have shown that post-aligning multilingual pretrained language\nmodels (mPLMs) using alignment objectives on both original and transliterated\ndata can improve crosslingual alignment. This improvement further leads to\nbetter crosslingual transfer performance. However, it remains unclear how and\nwhy a better crosslingual alignment is achieved, as this technique only\ninvolves transliterations, and does not use any parallel data. This paper\nattempts to explicitly evaluate the crosslingual alignment and identify the key\nelements in transliteration-based approaches that contribute to better\nperformance. For this, we train multiple models under varying setups for two\npairs of related languages: (1) Polish and Ukrainian and (2) Hindi and Urdu. To\nassess alignment, we define four types of similarities based on sentence\nrepresentations. Our experiments show that adding transliterations alone\nimproves the overall similarities, even for random sentence pairs. With the\nhelp of auxiliary alignment objectives, especially the contrastive objective,\nthe model learns to distinguish matched from random pairs, leading to better\nalignments. However, we also show that better alignment does not always yield\nbetter downstream performance, suggesting that further research is needed to\nclarify the connection between alignment and performance.\n","authors":["Yihong Liu","Mingyang Wang","Amir Hossein Kargaran","Ayyoob Imani","Orgest Xhelili","Haotian Ye","Chunlan Ma","François Yvon","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2409.17326v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.17313v1","updated":"2024-09-25T19:49:39Z","published":"2024-09-25T19:49:39Z","title":"Navigating the Nuances: A Fine-grained Evaluation of Vision-Language\n Navigation","summary":" This study presents a novel evaluation framework for the Vision-Language\nNavigation (VLN) task. It aims to diagnose current models for various\ninstruction categories at a finer-grained level. The framework is structured\naround the context-free grammar (CFG) of the task. The CFG serves as the basis\nfor the problem decomposition and the core premise of the instruction\ncategories design. We propose a semi-automatic method for CFG construction with\nthe help of Large-Language Models (LLMs). Then, we induct and generate data\nspanning five principal instruction categories (i.e. direction change, landmark\nrecognition, region recognition, vertical movement, and numerical\ncomprehension). Our analysis of different models reveals notable performance\ndiscrepancies and recurrent issues. The stagnation of numerical comprehension,\nheavy selective biases over directional concepts, and other interesting\nfindings contribute to the development of future language-guided navigation\nsystems.\n","authors":["Zehao Wang","Minye Wu","Yixin Cao","Yubo Ma","Meiqi Chen","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2409.17313v1.pdf","comment":"EMNLP 2024 Findings; project page:\n https://zehao-wang.github.io/navnuances"},{"id":"http://arxiv.org/abs/2409.17312v1","updated":"2024-09-25T19:46:49Z","published":"2024-09-25T19:46:49Z","title":"BabyLlama-2: Ensemble-Distilled Models Consistently Outperform Teachers\n With Limited Data","summary":" We present BabyLlama-2, a 345 million parameter model distillation-pretrained\nfrom two teachers on a 10 million word corpus for the BabyLM competition. On\nBLiMP and SuperGLUE benchmarks, BabyLlama-2 outperforms baselines trained on\nboth 10 and 100 million word datasets with the same data mix, as well as its\nteacher models. Through an extensive hyperparameter sweep, we demonstrate that\nthe advantages of distillation cannot be attributed to suboptimal\nhyperparameter selection of the teachers. Our findings underscore the need for\nfurther investigation into distillation techniques, particularly in\ndata-limited settings.\n","authors":["Jean-Loup Tastet","Inar Timiryasov"],"pdf_url":"https://arxiv.org/pdf/2409.17312v1.pdf","comment":"9 pages, 3 figures, 5 tables, submitted to the BabyLM Challenge\n (CoNLL 2024 Shared Task)"},{"id":"http://arxiv.org/abs/2404.18923v3","updated":"2024-09-25T19:18:16Z","published":"2024-04-29T17:58:36Z","title":"Holmes: A Benchmark to Assess the Linguistic Competence of Language\n Models","summary":" We introduce Holmes, a new benchmark designed to assess language models (LMs)\nlinguistic competence - their unconscious understanding of linguistic\nphenomena. Specifically, we use classifier-based probing to examine LMs'\ninternal representations regarding distinct linguistic phenomena (e.g.,\npart-of-speech tagging). As a result, we meet recent calls to disentangle LMs'\nlinguistic competence from other cognitive abilities, such as following\ninstructions in prompting-based evaluations. Composing Holmes, we review over\n270 probing studies and include more than 200 datasets to assess syntax,\nmorphology, semantics, reasoning, and discourse phenomena. Analyzing over 50\nLMs reveals that, aligned with known trends, their linguistic competence\ncorrelates with model size. However, surprisingly, model architecture and\ninstruction tuning also significantly influence performance, particularly in\nmorphology and syntax. Finally, we propose FlashHolmes, a streamlined version\nthat reduces the computation load while maintaining high-ranking precision.\n","authors":["Andreas Waldis","Yotam Perlitz","Leshem Choshen","Yufang Hou","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2404.18923v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10147v2","updated":"2024-09-25T19:16:16Z","published":"2024-08-19T16:47:46Z","title":"In-Context Learning with Representations: Contextual Generalization of\n Trained Transformers","summary":" In-context learning (ICL) refers to a remarkable capability of pretrained\nlarge language models, which can learn a new task given a few examples during\ninference. However, theoretical understanding of ICL is largely under-explored,\nparticularly whether transformers can be trained to generalize to unseen\nexamples in a prompt, which will require the model to acquire contextual\nknowledge of the prompt for generalization. This paper investigates the\ntraining dynamics of transformers by gradient descent through the lens of\nnon-linear regression tasks. The contextual generalization here can be attained\nvia learning the template function for each task in-context, where all template\nfunctions lie in a linear space with $m$ basis functions. We analyze the\ntraining dynamics of one-layer multi-head transformers to in-contextly predict\nunlabeled inputs given partially labeled prompts, where the labels contain\nGaussian noise and the number of examples in each prompt are not sufficient to\ndetermine the template. Under mild assumptions, we show that the training loss\nfor a one-layer multi-head transformer converges linearly to a global minimum.\nMoreover, the transformer effectively learns to perform ridge regression over\nthe basis functions. To our knowledge, this study is the first provable\ndemonstration that transformers can learn contextual (i.e., template)\ninformation to generalize to both unseen examples and tasks when prompts\ncontain only a small number of query-answer pairs.\n","authors":["Tong Yang","Yu Huang","Yingbin Liang","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2408.10147v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.12327v2","updated":"2024-09-25T19:11:20Z","published":"2024-07-17T05:53:20Z","title":"Spectra: A Comprehensive Study of Ternary, Quantized, and FP16 Language\n Models","summary":" Post-training quantization is the leading method for addressing\nmemory-related bottlenecks in LLM inference, but unfortunately, it suffers from\nsignificant performance degradation below 4-bit precision. An alternative\napproach involves training compressed models directly at a low bitwidth (e.g.,\nbinary or ternary models). However, the performance, training dynamics, and\nscaling trends of such models are not yet well understood. To address this\nissue, we train and openly release the Spectra LLM suite consisting of 54\nlanguage models ranging from 99M to 3.9B parameters, trained on 300B tokens.\nSpectra includes FloatLMs, post-training quantized QuantLMs (3, 4, 6, and 8\nbits), and ternary LLMs (TriLMs) - our improved architecture for ternary\nlanguage modeling, which significantly outperforms previously proposed ternary\nmodels of a given size (in bits), matching half-precision models at scale. For\nexample, TriLM 3.9B is (bit-wise) smaller than the half-precision FloatLM 830M,\nbut matches half-precision FloatLM 3.9B in commonsense reasoning and knowledge\nbenchmarks. However, TriLM 3.9B is also as toxic and stereotyping as FloatLM\n3.9B, a model six times larger in size. Additionally, TriLM 3.9B lags behind\nFloatLM in perplexity on validation splits and web-based corpora but performs\nbetter on less noisy datasets like Lambada and PennTreeBank.\n To enhance understanding of low-bitwidth models, we are releasing 500+\nintermediate checkpoints of the Spectra suite at\n\\href{https://github.com/NolanoOrg/SpectraSuite}{https://github.com/NolanoOrg/SpectraSuite}.\n","authors":["Ayush Kaushal","Tejas Vaidhya","Tejas Pandey","Aaryan Bhagat","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2407.12327v2.pdf","comment":"32 pages, 12 figures, and 10 tables"},{"id":"http://arxiv.org/abs/2409.15567v2","updated":"2024-09-25T18:50:52Z","published":"2024-09-23T21:48:32Z","title":"Asking an AI for salary negotiation advice is a matter of concern:\n Controlled experimental perturbation of ChatGPT for protected and\n non-protected group discrimination on a contextual task with no clear ground\n truth answers","summary":" We conducted controlled experimental bias audits for four versions of\nChatGPT, which we asked to recommend an opening offer in salary negotiations\nfor a new hire. We submitted 98,800 prompts to each version, systematically\nvarying the employee's gender, university, and major, and tested prompts in\nvoice of each side of the negotiation: the employee versus employer. We find\nChatGPT as a multi-model platform is not robust and consistent enough to be\ntrusted for such a task. We observed statistically significant salary offers\nwhen varying gender for all four models, although with smaller gaps than for\nother attributes tested. The largest gaps were different model versions and\nbetween the employee- vs employer-voiced prompts. We also observed substantial\ngaps when varying university and major, but many of the biases were not\nconsistent across model versions. We tested for fictional and fraudulent\nuniversities and found wildly inconsistent results across cases and model\nversions. We make broader contributions to the AI/ML fairness literature. Our\nscenario and our experimental design differ from mainstream AI/ML auditing\nefforts in key ways. Bias audits typically test discrimination for protected\nclasses like gender, which we contrast with testing non-protected classes of\nuniversity and major. Asking for negotiation advice includes how aggressive one\nought to be in a negotiation relative to known empirical salary distributions\nand scales, which is a deeply contextual and personalized task that has no\nobjective ground truth to validate. These results raise concerns for the\nspecific model versions we tested and ChatGPT as a multi-model platform in\ncontinuous development. Our epistemology does not permit us to definitively\ncertify these models as either generally biased or unbiased on the attributes\nwe test, but our study raises matters of concern for stakeholders to further\ninvestigate.\n","authors":["R. Stuart Geiger","Flynn O'Sullivan","Elsie Wang","Jonathan Lo"],"pdf_url":"https://arxiv.org/pdf/2409.15567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17270v1","updated":"2024-09-25T18:35:45Z","published":"2024-09-25T18:35:45Z","title":"Proof of Thought : Neurosymbolic Program Synthesis allows Robust and\n Interpretable Reasoning","summary":" Large Language Models (LLMs) have revolutionized natural language processing,\nyet they struggle with inconsistent reasoning, particularly in novel domains\nand complex logical sequences. This research introduces Proof of Thought, a\nframework that enhances the reliability and transparency of LLM outputs. Our\napproach bridges LLM-generated ideas with formal logic verification, employing\na custom interpreter to convert LLM outputs into First Order Logic constructs\nfor theorem prover scrutiny. Central to our method is an intermediary\nJSON-based Domain-Specific Language, which by design balances precise logical\nstructures with intuitive human concepts. This hybrid representation enables\nboth rigorous validation and accessible human comprehension of LLM reasoning\nprocesses. Key contributions include a robust type system with sort management\nfor enhanced logical integrity, explicit representation of rules for clear\ndistinction between factual and inferential knowledge, and a flexible\narchitecture that allows for easy extension to various domain-specific\napplications. We demonstrate Proof of Thought's effectiveness through\nbenchmarking on StrategyQA and a novel multimodal reasoning task, showing\nimproved performance in open-ended scenarios. By providing verifiable and\ninterpretable results, our technique addresses critical needs for AI system\naccountability and sets a foundation for human-in-the-loop oversight in\nhigh-stakes domains.\n","authors":["Debargha Ganguly","Srinivasan Iyengar","Vipin Chaudhary","Shivkumar Kalyanaraman"],"pdf_url":"https://arxiv.org/pdf/2409.17270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09580v3","updated":"2024-09-25T18:30:28Z","published":"2023-11-16T05:31:21Z","title":"MMoE: Enhancing Multimodal Models with Mixtures of Multimodal\n Interaction Experts","summary":" Advances in multimodal models have greatly improved how interactions relevant\nto various tasks are modeled. Today's multimodal models mainly focus on the\ncorrespondence between images and text, using this for tasks like image-text\nmatching. However, this covers only a subset of real-world interactions. Novel\ninteractions, such as sarcasm expressed through opposing spoken words and\ngestures or humor expressed through utterances and tone of voice, remain\nchallenging. In this paper, we introduce an approach to enhance multimodal\nmodels, which we call Multimodal Mixtures of Experts (MMoE). The key idea in\nMMoE is to train separate expert models for each type of multimodal\ninteraction, such as redundancy present in both modalities, uniqueness in one\nmodality, or synergy that emerges when both modalities are fused. On a sarcasm\ndetection task (MUStARD) and a humor detection task (URFUNNY), we obtain new\nstate-of-the-art results. MMoE is also able to be applied to various types of\nmodels to gain improvement.\n","authors":["Haofei Yu","Zhengyang Qi","Lawrence Jang","Ruslan Salakhutdinov","Louis-Philippe Morency","Paul Pu Liang"],"pdf_url":"https://arxiv.org/pdf/2311.09580v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17146v1","updated":"2024-09-25T17:59:51Z","published":"2024-09-25T17:59:51Z","title":"Molmo and PixMo: Open Weights and Open Data for State-of-the-Art\n Multimodal Models","summary":" Today's most advanced multimodal models remain proprietary. The strongest\nopen-weight models rely heavily on synthetic data from proprietary VLMs to\nachieve good performance, effectively distilling these closed models into open\nones. As a result, the community is still missing foundational knowledge about\nhow to build performant VLMs from scratch. We present Molmo, a new family of\nVLMs that are state-of-the-art in their class of openness. Our key innovation\nis a novel, highly detailed image caption dataset collected entirely from human\nannotators using speech-based descriptions. To enable a wide array of user\ninteractions, we also introduce a diverse dataset mixture for fine-tuning that\nincludes in-the-wild Q&A and innovative 2D pointing data. The success of our\napproach relies on careful choices for the model architecture details, a\nwell-tuned training pipeline, and, most critically, the quality of our newly\ncollected datasets, all of which will be released. The best-in-class 72B model\nwithin the Molmo family not only outperforms others in the class of open weight\nand data models but also compares favorably against proprietary systems like\nGPT-4o, Claude 3.5, and Gemini 1.5 on both academic benchmarks and human\nevaluation.\n We will be releasing all of our model weights, captioning and fine-tuning\ndata, and source code in the near future. Select model weights, inference code,\nand demo are available at https://molmo.allenai.org.\n","authors":["Matt Deitke","Christopher Clark","Sangho Lee","Rohun Tripathi","Yue Yang","Jae Sung Park","Mohammadreza Salehi","Niklas Muennighoff","Kyle Lo","Luca Soldaini","Jiasen Lu","Taira Anderson","Erin Bransom","Kiana Ehsani","Huong Ngo","YenSung Chen","Ajay Patel","Mark Yatskar","Chris Callison-Burch","Andrew Head","Rose Hendrix","Favyen Bastani","Eli VanderBilt","Nathan Lambert","Yvonne Chou","Arnavi Chheda","Jenna Sparks","Sam Skjonsberg","Michael Schmitz","Aaron Sarnat","Byron Bischoff","Pete Walsh","Chris Newell","Piper Wolters","Tanmay Gupta","Kuo-Hao Zeng","Jon Borchardt","Dirk Groeneveld","Jen Dumas","Crystal Nam","Sophie Lebrecht","Caitlin Wittlif","Carissa Schoenick","Oscar Michel","Ranjay Krishna","Luca Weihs","Noah A. Smith","Hannaneh Hajishirzi","Ross Girshick","Ali Farhadi","Aniruddha Kembhavi"],"pdf_url":"https://arxiv.org/pdf/2409.17146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17141v1","updated":"2024-09-25T17:58:35Z","published":"2024-09-25T17:58:35Z","title":"FineZip : Pushing the Limits of Large Language Models for Practical\n Lossless Text Compression","summary":" While the language modeling objective has been shown to be deeply connected\nwith compression, it is surprising that modern LLMs are not employed in\npractical text compression systems. In this paper, we provide an in-depth\nanalysis of neural network and transformer-based compression techniques to\nanswer this question. We compare traditional text compression systems with\nneural network and LLM-based text compression methods. Although LLM-based\nsystems significantly outperform conventional compression methods, they are\nhighly impractical. Specifically, LLMZip, a recent text compression system\nusing Llama3-8B requires 9.5 days to compress just 10 MB of text, although with\nhuge improvements in compression ratios. To overcome this, we present FineZip -\na novel LLM-based text compression system that combines ideas of online\nmemorization and dynamic context to reduce the compression time immensely.\nFineZip can compress the above corpus in approximately 4 hours compared to 9.5\ndays, a 54 times improvement over LLMZip and comparable performance. FineZip\noutperforms traditional algorithmic compression methods with a large margin,\nimproving compression ratios by approximately 50\\%. With this work, we take the\nfirst step towards making lossless text compression with LLMs a reality. While\nFineZip presents a significant step in that direction, LLMs are still not a\nviable solution for large-scale text compression. We hope our work paves the\nway for future research and innovation to solve this problem.\n","authors":["Fazal Mittu","Yihuan Bu","Akshat Gupta","Ashok Devireddy","Alp Eren Ozdarendeli","Anant Singh","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2409.17141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16201v2","updated":"2024-09-25T17:58:21Z","published":"2023-11-27T07:19:26Z","title":"Pre-trained Language Models Do Not Help Auto-regressive Text-to-Image\n Generation","summary":" Recent advances in image tokenizers, such as VQ-VAE, have enabled\ntext-to-image generation using auto-regressive methods, similar to language\nmodeling. However, these methods have yet to leverage pre-trained language\nmodels, despite their adaptability to various downstream tasks. In this work,\nwe explore this gap by adapting a pre-trained language model for\nauto-regressive text-to-image generation, and find that pre-trained language\nmodels offer limited help. We provide a two-fold explanation by analyzing\ntokens from each modality. First, we demonstrate that image tokens possess\nsignificantly different semantics compared to text tokens, rendering\npre-trained language models no more effective in modeling them than randomly\ninitialized ones. Second, the text tokens in the image-text datasets are too\nsimple compared to normal language model pre-training data, which causes the\ncatastrophic degradation of language models' capability.\n","authors":["Yuhui Zhang","Brandon McKinzie","Zhe Gan","Vaishaal Shankar","Alexander Toshev"],"pdf_url":"https://arxiv.org/pdf/2311.16201v2.pdf","comment":"Published at EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2409.17130v1","updated":"2024-09-25T17:48:59Z","published":"2024-09-25T17:48:59Z","title":"Assessing the Level of Toxicity Against Distinct Groups in Bangla Social\n Media Comments: A Comprehensive Investigation","summary":" Social media platforms have a vital role in the modern world, serving as\nconduits for communication, the exchange of ideas, and the establishment of\nnetworks. However, the misuse of these platforms through toxic comments, which\ncan range from offensive remarks to hate speech, is a concerning issue. This\nstudy focuses on identifying toxic comments in the Bengali language targeting\nthree specific groups: transgender people, indigenous people, and migrant\npeople, from multiple social media sources. The study delves into the intricate\nprocess of identifying and categorizing toxic language while considering the\nvarying degrees of toxicity: high, medium, and low. The methodology involves\ncreating a dataset, manual annotation, and employing pre-trained transformer\nmodels like Bangla-BERT, bangla-bert-base, distil-BERT, and\nBert-base-multilingual-cased for classification. Diverse assessment metrics\nsuch as accuracy, recall, precision, and F1-score are employed to evaluate the\nmodel's effectiveness. The experimental findings reveal that Bangla-BERT\nsurpasses alternative models, achieving an F1-score of 0.8903. This research\nexposes the complexity of toxicity in Bangla social media dialogues, revealing\nits differing impacts on diverse demographic groups.\n","authors":["Mukaffi Bin Moin","Pronay Debnath","Usafa Akther Rifa","Rijeet Bin Anis"],"pdf_url":"https://arxiv.org/pdf/2409.17130v1.pdf","comment":"Accepted for publication in \"18th International Conference on\n Information Technology and Applications (ICITA 2024)\""},{"id":"http://arxiv.org/abs/2409.17213v1","updated":"2024-09-25T17:38:39Z","published":"2024-09-25T17:38:39Z","title":"Plurals: A System for Guiding LLMs Via Simulated Social Ensembles","summary":" Recent debates raised concerns that language models may favor certain\nviewpoints. But what if the solution is not to aim for a 'view from nowhere'\nbut rather to leverage different viewpoints? We introduce Plurals, a system and\nPython library for pluralistic AI deliberation. Plurals consists of Agents\n(LLMs, optionally with personas) which deliberate within customizable\nStructures, with Moderators overseeing deliberation. Plurals is a generator of\nsimulated social ensembles. Plurals integrates with government datasets to\ncreate nationally representative personas, includes deliberation templates\ninspired by democratic deliberation theory, and allows users to customize both\ninformation-sharing structures and deliberation behavior within Structures. Six\ncase studies demonstrate fidelity to theoretical constructs and efficacy. Three\nrandomized experiments show simulated focus groups produced output resonant\nwith an online sample of the relevant audiences (chosen over zero-shot\ngeneration in 75% of trials). Plurals is both a paradigm and a concrete system\nfor pluralistic AI. The Plurals library is available at\nhttps://github.com/josh-ashkinaze/plurals and will be continually updated.\n","authors":["Joshua Ashkinaze","Emily Fry","Narendra Edara","Eric Gilbert","Ceren Budak"],"pdf_url":"https://arxiv.org/pdf/2409.17213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17120v1","updated":"2024-09-25T17:31:45Z","published":"2024-09-25T17:31:45Z","title":"Deep Learning and Machine Learning, Advancing Big Data Analytics and\n Management: Handy Appetizer","summary":" This book explores the role of Artificial Intelligence (AI), Machine Learning\n(ML), and Deep Learning (DL) in driving the progress of big data analytics and\nmanagement. The book focuses on simplifying the complex mathematical concepts\nbehind deep learning, offering intuitive visualizations and practical case\nstudies to help readers understand how neural networks and technologies like\nConvolutional Neural Networks (CNNs) work. It introduces several classic models\nand technologies such as Transformers, GPT, ResNet, BERT, and YOLO,\nhighlighting their applications in fields like natural language processing,\nimage recognition, and autonomous driving. The book also emphasizes the\nimportance of pre-trained models and how they can enhance model performance and\naccuracy, with instructions on how to apply these models in various real-world\nscenarios. Additionally, it provides an overview of key big data management\ntechnologies like SQL and NoSQL databases, as well as distributed computing\nframeworks such as Apache Hadoop and Spark, explaining their importance in\nmanaging and processing vast amounts of data. Ultimately, the book underscores\nthe value of mastering deep learning and big data management skills as critical\ntools for the future workforce, making it an essential resource for both\nbeginners and experienced professionals.\n","authors":["Benji Peng","Xuanhe Pan","Yizhu Wen","Ziqian Bi","Keyu Chen","Ming Li","Ming Liu","Qian Niu","Junyu Liu","Jinlang Wang","Sen Zhang","Jiawei Xu","Pohsun Feng"],"pdf_url":"https://arxiv.org/pdf/2409.17120v1.pdf","comment":"This book contains 93 pages and 60 figures"},{"id":"http://arxiv.org/abs/2409.17115v1","updated":"2024-09-25T17:28:13Z","published":"2024-09-25T17:28:13Z","title":"Programming Every Example: Lifting Pre-training Data Quality like\n Experts at Scale","summary":" Large language model pre-training has traditionally relied on human experts\nto craft heuristics for improving the corpora quality, resulting in numerous\nrules developed to date. However, these rules lack the flexibility to address\nthe unique characteristics of individual example effectively. Meanwhile,\napplying tailored rules to every example is impractical for human experts. In\nthis paper, we demonstrate that even small language models, with as few as 0.3B\nparameters, can exhibit substantial data refining capabilities comparable to\nthose of human experts. We introduce Programming Every Example (ProX), a novel\nframework that treats data refinement as a programming task, enabling models to\nrefine corpora by generating and executing fine-grained operations, such as\nstring normalization, for each individual example at scale. Experimental\nresults show that models pre-trained on ProX-curated data outperform either\noriginal data or data filtered by other selection methods by more than 2%\nacross various downstream benchmarks. Its effectiveness spans various model\nsizes and pre-training corpora, including C4, RedPajama-V2, and FineWeb.\nFurthermore, ProX exhibits significant potential in domain-specific continual\npre-training: without domain specific design, models trained on OpenWebMath\nrefined by ProX outperform human-crafted rule-based methods, improving average\naccuracy by 7.6% over Mistral-7B, with 14.6% for Llama-2-7B and 20.3% for\nCodeLlama-7B, all within 10B tokens to be comparable to models like Llemma-7B\ntrained on 200B tokens. Further analysis highlights that ProX significantly\nsaves training FLOPs, offering a promising path for efficient LLM\npre-training.We are open-sourcing ProX with >100B corpus, models, and sharing\nall training and implementation details for reproducible research and future\ninnovation. Code: https://github.com/GAIR-NLP/ProX\n","authors":["Fan Zhou","Zengzhi Wang","Qian Liu","Junlong Li","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2409.17115v1.pdf","comment":"45 pages, 13 figures, 34 tables"},{"id":"http://arxiv.org/abs/2309.17012v3","updated":"2024-09-25T16:57:20Z","published":"2023-09-29T06:53:10Z","title":"Benchmarking Cognitive Biases in Large Language Models as Evaluators","summary":" Large Language Models are cognitively biased judges. Large Language Models\n(LLMs) have recently been shown to be effective as automatic evaluators with\nsimple prompting and in-context learning. In this work, we assemble 15 LLMs of\nfour different size ranges and evaluate their output responses by preference\nranking from the other LLMs as evaluators, such as System Star is better than\nSystem Square. We then evaluate the quality of ranking outputs introducing the\nCognitive Bias Benchmark for LLMs as Evaluators (CoBBLEr), a benchmark to\nmeasure six different cognitive biases in LLM evaluation outputs, such as the\nEgocentric bias where a model prefers to rank its own outputs highly in\nevaluation. We find that LLMs are biased text quality evaluators, exhibiting\nstrong indications on our bias benchmark (average of 40% of comparisons across\nall models) within each of their evaluations that question their robustness as\nevaluators. Furthermore, we examine the correlation between human and machine\npreferences and calculate the average Rank-Biased Overlap (RBO) score to be\n49.6%, indicating that machine preferences are misaligned with humans.\nAccording to our findings, LLMs may still be unable to be utilized for\nautomatic annotation aligned with human preferences. Our project page is at:\nhttps://minnesotanlp.github.io/cobbler.\n","authors":["Ryan Koo","Minhwa Lee","Vipul Raheja","Jong Inn Park","Zae Myung Kim","Dongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2309.17012v3.pdf","comment":"Publishsed at ACL 2024. 29 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2409.17080v1","updated":"2024-09-25T16:45:02Z","published":"2024-09-25T16:45:02Z","title":"Can Vision Language Models Learn from Visual Demonstrations of Ambiguous\n Spatial Reasoning?","summary":" Large vision-language models (VLMs) have become state-of-the-art for many\ncomputer vision tasks, with in-context learning (ICL) as a popular adaptation\nstrategy for new ones. But can VLMs learn novel concepts purely from visual\ndemonstrations, or are they limited to adapting to the output format of ICL\nexamples? We propose a new benchmark we call Spatial Visual Ambiguity Tasks\n(SVAT) that challenges state-of-the-art VLMs to learn new visuospatial tasks\nin-context. We find that VLMs fail to do this zero-shot, and sometimes continue\nto fail after finetuning. However, adding simpler data to the training by\ncurriculum learning leads to improved ICL performance.\n","authors":["Bowen Zhao","Leo Parker Dirac","Paulina Varshavskaya"],"pdf_url":"https://arxiv.org/pdf/2409.17080v1.pdf","comment":"13 pages, 4 figures. Code released at\n https://github.com/groundlight/vlm-visual-demonstrations"},{"id":"http://arxiv.org/abs/2409.17073v1","updated":"2024-09-25T16:32:35Z","published":"2024-09-25T16:32:35Z","title":"Enhancing Post-Hoc Attributions in Long Document Comprehension via\n Coarse Grained Answer Decomposition","summary":" Accurately attributing answer text to its source document is crucial for\ndeveloping a reliable question-answering system. However, attribution for long\ndocuments remains largely unexplored. Post-hoc attribution systems are designed\nto map answer text back to the source document, yet the granularity of this\nmapping has not been addressed. Furthermore, a critical question arises: What\nprecisely should be attributed, with an emphasis on identifying the information\nunits within an answer that necessitate grounding? In this paper, we propose\nand investigate a novel approach to the factual decomposition of generated\nanswers for attribution, employing template-based in-context learning. To\naccomplish this, we utilize the question and integrate negative sampling during\nfew-shot in-context learning for decomposition. This approach enhances the\nsemantic understanding of both abstractive and extractive answers. We examine\nthe impact of answer decomposition by providing a thorough examination of\nvarious attribution approaches, ranging from retrieval-based techniques to\nLLM-based attributors.\n","authors":["Pritika Ramu","Koustava Goswami","Apoorv Saxena","Balaji Vasan Srinivavsan"],"pdf_url":"https://arxiv.org/pdf/2409.17073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14829v2","updated":"2024-09-25T16:27:50Z","published":"2024-06-21T02:18:03Z","title":"Is This a Bad Table? A Closer Look at the Evaluation of Table Generation\n from Text","summary":" Understanding whether a generated table is of good quality is important to be\nable to use it in creating or editing documents using automatic methods. In\nthis work, we underline that existing measures for table quality evaluation\nfail to capture the overall semantics of the tables, and sometimes unfairly\npenalize good tables and reward bad ones. We propose TabEval, a novel table\nevaluation strategy that captures table semantics by first breaking down a\ntable into a list of natural language atomic statements and then compares them\nwith ground truth statements using entailment-based measures. To validate our\napproach, we curate a dataset comprising of text descriptions for 1,250 diverse\nWikipedia tables, covering a range of topics and structures, in contrast to the\nlimited scope of existing datasets. We compare TabEval with existing metrics\nusing unsupervised and supervised text-to-table generation methods,\ndemonstrating its stronger correlation with human judgments of table quality\nacross four datasets.\n","authors":["Pritika Ramu","Aparna Garimella","Sambaran Bandyopadhyay"],"pdf_url":"https://arxiv.org/pdf/2406.14829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17054v1","updated":"2024-09-25T16:13:42Z","published":"2024-09-25T16:13:42Z","title":"Using LLM for Real-Time Transcription and Summarization of\n Doctor-Patient Interactions into ePuskesmas in Indonesia","summary":" One of the key issues contributing to inefficiency in Puskesmas is the\ntime-consuming nature of doctor-patient interactions. Doctors need to conduct\nthorough consultations, which include diagnosing the patient's condition,\nproviding treatment advice, and transcribing detailed notes into medical\nrecords. In regions with diverse linguistic backgrounds, doctors often have to\nask clarifying questions, further prolonging the process. While diagnosing is\nessential, transcription and summarization can often be automated using AI to\nimprove time efficiency and help doctors enhance care quality and enable early\ndiagnosis and intervention. This paper proposes a solution using a localized\nlarge language model (LLM) to transcribe, translate, and summarize\ndoctor-patient conversations. We utilize the Whisper model for transcription\nand GPT-3 to summarize them into the ePuskemas medical records format. This\nsystem is implemented as an add-on to an existing web browser extension,\nallowing doctors to fill out patient forms while talking. By leveraging this\nsolution for real-time transcription, translation, and summarization, doctors\ncan improve the turnaround time for patient care while enhancing the quality of\nrecords, which become more detailed and insightful for future visits. This\ninnovation addresses challenges like overcrowded facilities and the\nadministrative burden on healthcare providers in Indonesia. We believe this\nsolution will help doctors save time, provide better care, and produce more\naccurate medical records, representing a significant step toward modernizing\nhealthcare and ensuring patients receive timely, high-quality care, even in\nresource-constrained settings.\n","authors":["Azmul Asmar Irfan","Nur Ahmad Khatim","Mansur M. Arief"],"pdf_url":"https://arxiv.org/pdf/2409.17054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17046v1","updated":"2024-09-25T15:59:58Z","published":"2024-09-25T15:59:58Z","title":"Detecting Temporal Ambiguity in Questions","summary":" Detecting and answering ambiguous questions has been a challenging task in\nopen-domain question answering. Ambiguous questions have different answers\ndepending on their interpretation and can take diverse forms. Temporally\nambiguous questions are one of the most common types of such questions. In this\npaper, we introduce TEMPAMBIQA, a manually annotated temporally ambiguous QA\ndataset consisting of 8,162 open-domain questions derived from existing\ndatasets. Our annotations focus on capturing temporal ambiguity to study the\ntask of detecting temporally ambiguous questions. We propose a novel approach\nby using diverse search strategies based on disambiguated versions of the\nquestions. We also introduce and test non-search, competitive baselines for\ndetecting temporal ambiguity using zero-shot and few-shot approaches.\n","authors":["Bhawna Piryani","Abdelrahman Abdallah","Jamshid Mozafari","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2409.17046v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.17044v1","updated":"2024-09-25T15:54:29Z","published":"2024-09-25T15:54:29Z","title":"How to Connect Speech Foundation Models and Large Language Models? What\n Matters and What Does Not","summary":" The remarkable performance achieved by Large Language Models (LLM) has driven\nresearch efforts to leverage them for a wide range of tasks and input\nmodalities. In speech-to-text (S2T) tasks, the emerging solution consists of\nprojecting the output of the encoder of a Speech Foundational Model (SFM) into\nthe LLM embedding space through an adapter module. However, no work has yet\ninvestigated how much the downstream-task performance depends on each component\n(SFM, adapter, LLM) nor whether the best design of the adapter depends on the\nchosen SFM and LLM. To fill this gap, we evaluate the combination of 5 adapter\nmodules, 2 LLMs (Mistral and Llama), and 2 SFMs (Whisper and SeamlessM4T) on\ntwo widespread S2T tasks, namely Automatic Speech Recognition and Speech\nTranslation. Our results demonstrate that the SFM plays a pivotal role in\ndownstream performance, while the adapter choice has moderate impact and\ndepends on the SFM and LLM.\n","authors":["Francesco Verdini","Pierfrancesco Melucci","Stefano Perna","Francesco Cariaggi","Marco Gaido","Sara Papi","Szymon Mazurek","Marek Kasztelnik","Luisa Bentivogli","Sébastien Bratières","Paolo Merialdo","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2409.17044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14507v3","updated":"2024-09-25T15:50:51Z","published":"2024-09-22T16:11:02Z","title":"A is for Absorption: Studying Feature Splitting and Absorption in Sparse\n Autoencoders","summary":" Sparse Autoencoders (SAEs) have emerged as a promising approach to decompose\nthe activations of Large Language Models (LLMs) into human-interpretable\nlatents. In this paper, we pose two questions. First, to what extent do SAEs\nextract monosemantic and interpretable latents? Second, to what extent does\nvarying the sparsity or the size of the SAE affect monosemanticity /\ninterpretability? By investigating these questions in the context of a simple\nfirst-letter identification task where we have complete access to ground truth\nlabels for all tokens in the vocabulary, we are able to provide more detail\nthan prior investigations. Critically, we identify a problematic form of\nfeature-splitting we call feature absorption where seemingly monosemantic\nlatents fail to fire in cases where they clearly should. Our investigation\nsuggests that varying SAE size or sparsity is insufficient to solve this issue,\nand that there are deeper conceptual issues in need of resolution.\n","authors":["David Chanin","James Wilken-Smith","Tomáš Dulka","Hardik Bhatnagar","Joseph Bloom"],"pdf_url":"https://arxiv.org/pdf/2409.14507v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17027v1","updated":"2024-09-25T15:30:24Z","published":"2024-09-25T15:30:24Z","title":"Counterfactual Token Generation in Large Language Models","summary":" \"Sure, I am happy to generate a story for you: Captain Lyra stood at the helm\nof her trusty ship, the Maelstrom's Fury, gazing out at the endless sea. [...]\nLyra's eyes welled up with tears as she realized the bitter truth - she had\nsacrificed everything for fleeting riches, and lost the love of her crew, her\nfamily, and herself.\" Although this story, generated by a large language model,\nis captivating, one may wonder -- how would the story have unfolded if the\nmodel had chosen \"Captain Maeve\" as the protagonist instead? We cannot know.\nState-of-the-art large language models are stateless -- they maintain no\ninternal memory or state. Given a prompt, they generate a sequence of tokens as\nan output using an autoregressive process. As a consequence, they cannot reason\nabout counterfactual alternatives to tokens they have generated in the past. In\nthis work, our goal is to enhance them with this functionality. To this end, we\ndevelop a causal model of token generation that builds upon the Gumbel-Max\nstructural causal model. Our model allows any large language model to perform\ncounterfactual token generation at almost no cost in comparison with vanilla\ntoken generation, it is embarrassingly simple to implement, and it does not\nrequire any fine-tuning nor prompt engineering. We implement our model on Llama\n3 8B-instruct and conduct both qualitative and quantitative analyses of\ncounterfactually generated text. We conclude with a demonstrative application\nof counterfactual token generation for bias detection, unveiling interesting\ninsights about the model of the world constructed by large language models.\n","authors":["Ivi Chatzi","Nina Corvelo Benz","Eleni Straitouri","Stratis Tsirtsis","Manuel Gomez-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2409.17027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17011v1","updated":"2024-09-25T15:15:57Z","published":"2024-09-25T15:15:57Z","title":"LLM-CARD: Towards a Description and Landscape of Large Language Models","summary":" With the rapid growth of the Natural Language Processing (NLP) field, a vast\nvariety of Large Language Models (LLMs) continue to emerge for diverse NLP\ntasks. As an increasing number of papers are presented, researchers and\ndevelopers face the challenge of information overload. Thus, it is particularly\nimportant to develop a system that can automatically extract and organise key\ninformation about LLMs from academic papers (\\textbf{LLM model card}). This\nwork is to develop such a pioneer system by using Named Entity Recognition\n(\\textbf{NER}) and Relation Extraction (\\textbf{RE}) methods that automatically\nextract key information about large language models from the papers, helping\nresearchers to efficiently access information about LLMs. These features\ninclude model \\textit{licence}, model \\textit{name}, and model\n\\textit{application}. With these features, we can form a model card for each\npaper. \\textbf{Data-contribution} wise, 106 academic papers were processed by\ndefining three dictionaries - LLMs name, licence, and application. 11,051\nsentences were extracted through dictionary lookup, and the dataset was\nconstructed through manual review of the final selection of 129 sentences that\nhave a link between the name and the licence, and 106 sentences that have a\nlink between the model name and the application.\n","authors":["Shengwei Tian","Lifeng Han","Erick Mendez Guzman","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2409.17011v1.pdf","comment":"ongoing work, 16 pages"},{"id":"http://arxiv.org/abs/2409.17005v1","updated":"2024-09-25T15:08:08Z","published":"2024-09-25T15:08:08Z","title":"Models Can and Should Embrace the Communicative Nature of\n Human-Generated Math","summary":" Math is constructed by people for people: just as natural language corpora\nreflect not just propositions but the communicative goals of language users,\nthe math data that models are trained on reflects not just idealized\nmathematical entities but rich communicative intentions. While there are\nimportant advantages to treating math in a purely symbolic manner, we here\nhypothesize that there are benefits to treating math as situated linguistic\ncommunication and that language models are well suited for this goal, in ways\nthat are not fully appreciated. We illustrate these points with two case\nstudies. First, we ran an experiment in which we found that language models\ninterpret the equals sign in a humanlike way -- generating systematically\ndifferent word problems for the same underlying equation arranged in different\nways. Second, we found that language models prefer proofs to be ordered in\nnaturalistic ways, even though other orders would be logically equivalent. We\nadvocate for AI systems that learn from and represent the communicative\nintentions latent in human-generated math.\n","authors":["Sasha Boguraev","Ben Lipkin","Leonie Weissweiler","Kyle Mahowald"],"pdf_url":"https://arxiv.org/pdf/2409.17005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03589v3","updated":"2024-09-25T14:59:24Z","published":"2024-06-05T19:14:21Z","title":"Ranking Manipulation for Conversational Search Engines","summary":" Major search engine providers are rapidly incorporating Large Language Model\n(LLM)-generated content in response to user queries. These conversational\nsearch engines operate by loading retrieved website text into the LLM context\nfor summarization and interpretation. Recent research demonstrates that LLMs\nare highly vulnerable to jailbreaking and prompt injection attacks, which\ndisrupt the safety and quality goals of LLMs using adversarial strings. This\nwork investigates the impact of prompt injections on the ranking order of\nsources referenced by conversational search engines. To this end, we introduce\na focused dataset of real-world consumer product websites and formalize\nconversational search ranking as an adversarial problem. Experimentally, we\nanalyze conversational search rankings in the absence of adversarial injections\nand show that different LLMs vary significantly in prioritizing product name,\ndocument content, and context position. We then present a tree-of-attacks-based\njailbreaking technique which reliably promotes low-ranked products.\nImportantly, these attacks transfer effectively to state-of-the-art\nconversational search engines such as perplexity$.$ai. Given the strong\nfinancial incentive for website owners to boost their search ranking, we argue\nthat our problem formulation is of critical importance for future robustness\nwork.\n","authors":["Samuel Pfrommer","Yatong Bai","Tanmay Gautam","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2406.03589v3.pdf","comment":"2024 Conference on Empirical Methods in Natural Language Processing\n (Main)"},{"id":"http://arxiv.org/abs/2409.16984v1","updated":"2024-09-25T14:45:52Z","published":"2024-09-25T14:45:52Z","title":"AXCEL: Automated eXplainable Consistency Evaluation using LLMs","summary":" Large Language Models (LLMs) are widely used in both industry and academia\nfor various tasks, yet evaluating the consistency of generated text responses\ncontinues to be a challenge. Traditional metrics like ROUGE and BLEU show a\nweak correlation with human judgment. More sophisticated metrics using Natural\nLanguage Inference (NLI) have shown improved correlations but are complex to\nimplement, require domain-specific training due to poor cross-domain\ngeneralization, and lack explainability. More recently, prompt-based metrics\nusing LLMs as evaluators have emerged; while they are easier to implement, they\nstill lack explainability and depend on task-specific prompts, which limits\ntheir generalizability. This work introduces Automated eXplainable Consistency\nEvaluation using LLMs (AXCEL), a prompt-based consistency metric which offers\nexplanations for the consistency scores by providing detailed reasoning and\npinpointing inconsistent text spans. AXCEL is also a generalizable metric which\ncan be adopted to multiple tasks without changing the prompt. AXCEL outperforms\nboth non-prompt and prompt-based state-of-the-art (SOTA) metrics in detecting\ninconsistencies across summarization by 8.7%, free text generation by 6.2%, and\ndata-to-text conversion tasks by 29.4%. We also evaluate the influence of\nunderlying LLMs on prompt based metric performance and recalibrate the SOTA\nprompt-based metrics with the latest LLMs for fair comparison. Further, we show\nthat AXCEL demonstrates strong performance using open source LLMs.\n","authors":["P Aditya Sreekar","Sahil Verma","Suransh Chopra","Sarik Ghazarian","Abhishek Persad","Narayanan Sadagopan"],"pdf_url":"https://arxiv.org/pdf/2409.16984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12997v5","updated":"2024-09-25T14:37:39Z","published":"2024-02-20T13:25:16Z","title":"Towards Trustworthy Reranking: A Simple yet Effective Abstention\n Mechanism","summary":" Neural Information Retrieval (NIR) has significantly improved upon\nheuristic-based Information Retrieval (IR) systems. Yet, failures remain\nfrequent, the models used often being unable to retrieve documents relevant to\nthe user's query. We address this challenge by proposing a lightweight\nabstention mechanism tailored for real-world constraints, with particular\nemphasis placed on the reranking phase. We introduce a protocol for evaluating\nabstention strategies in black-box scenarios (typically encountered when\nrelying on API services), demonstrating their efficacy, and propose a simple\nyet effective data-driven mechanism. We provide open-source code for experiment\nreplication and abstention implementation, fostering wider adoption and\napplication in diverse contexts.\n","authors":["Hippolyte Gisserot-Boukhlef","Manuel Faysse","Emmanuel Malherbe","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2402.12997v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16974v1","updated":"2024-09-25T14:36:30Z","published":"2024-09-25T14:36:30Z","title":"Decoding Large-Language Models: A Systematic Overview of Socio-Technical\n Impacts, Constraints, and Emerging Questions","summary":" There have been rapid advancements in the capabilities of large language\nmodels (LLMs) in recent years, greatly revolutionizing the field of natural\nlanguage processing (NLP) and artificial intelligence (AI) to understand and\ninteract with human language. Therefore, in this work, we conduct a systematic\ninvestigation of the literature to identify the prominent themes and directions\nof LLM developments, impacts, and limitations. Our findings illustrate the\naims, methodologies, limitations, and future directions of LLM research. It\nincludes responsible development considerations, algorithmic improvements,\nethical challenges, and societal implications of LLM development. Overall, this\npaper provides a rigorous and comprehensive overview of current research in LLM\nand identifies potential directions for future development. The article\nhighlights the application areas that could have a positive impact on society\nalong with the ethical considerations.\n","authors":["Zeyneb N. Kaya","Souvick Ghosh"],"pdf_url":"https://arxiv.org/pdf/2409.16974v1.pdf","comment":"28 pages, 5 figures, preprint submitted to journal"},{"id":"http://arxiv.org/abs/2409.16973v1","updated":"2024-09-25T14:35:06Z","published":"2024-09-25T14:35:06Z","title":"Adaptive Self-Supervised Learning Strategies for Dynamic On-Device LLM\n Personalization","summary":" Large language models (LLMs) have revolutionized how we interact with\ntechnology, but their personalization to individual user preferences remains a\nsignificant challenge, particularly in on-device applications. Traditional\nmethods often depend heavily on labeled datasets and can be resource-intensive.\nTo address these issues, we present Adaptive Self-Supervised Learning\nStrategies (ASLS), which utilizes self-supervised learning techniques to\npersonalize LLMs dynamically. The framework comprises a user profiling layer\nfor collecting interaction data and a neural adaptation layer for real-time\nmodel fine-tuning. This innovative approach enables continuous learning from\nuser feedback, allowing the model to generate responses that align closely with\nuser-specific contexts. The adaptive mechanisms of ASLS minimize computational\ndemands and enhance personalization efficiency. Experimental results across\nvarious user scenarios illustrate the superior performance of ASLS in boosting\nuser engagement and satisfaction, highlighting its potential to redefine LLMs\nas highly responsive and context-aware systems on-device.\n","authors":["Rafael Mendoza","Isabella Cruz","Richard Liu","Aarav Deshmukh","David Williams","Jesscia Peng","Rohan Iyer"],"pdf_url":"https://arxiv.org/pdf/2409.16973v1.pdf","comment":"First ASLS"},{"id":"http://arxiv.org/abs/2409.16954v1","updated":"2024-09-25T14:09:09Z","published":"2024-09-25T14:09:09Z","title":"Weighted Cross-entropy for Low-Resource Languages in Multilingual Speech\n Recognition","summary":" This paper addresses the challenge of integrating low-resource languages into\nmultilingual automatic speech recognition (ASR) systems. We introduce a novel\napplication of weighted cross-entropy, typically used for unbalanced datasets,\nto facilitate the integration of low-resource languages into pre-trained\nmultilingual ASR models within the context of continual multilingual learning.\nWe fine-tune the Whisper multilingual ASR model on five high-resource languages\nand one low-resource language, employing language-weighted dynamic\ncross-entropy and data augmentation. The results show a remarkable 6.69% word\nerror rate (WER) reduction for the low-resource language compared to the\nfine-tuned model without applying our approach, and a 48.86% WER reduction\ncompared to the original Whisper model. In addition, our approach yields an\naverage WER reduction of 3.29% across the six languages, showing no degradation\nfor the high-resource languages.\n","authors":["Andrés Piñeiro-Martín","Carmen García-Mateo","Laura Docío-Fernández","María del Carmen López-Pérez","Georg Rehm"],"pdf_url":"https://arxiv.org/pdf/2409.16954v1.pdf","comment":"5 pages, 1 figure. Presented at Interspeech 2024"},{"id":"http://arxiv.org/abs/2305.12620v2","updated":"2024-09-25T14:06:31Z","published":"2023-05-22T01:02:45Z","title":"Keeping Up with the Language Models: Systematic Benchmark Extension for\n Bias Auditing","summary":" Bias auditing of language models (LMs) has received considerable attention as\nLMs are becoming widespread. As such, several benchmarks for bias auditing have\nbeen proposed. At the same time, the rapid evolution of LMs can make these\nbenchmarks irrelevant in no time. Bias auditing is further complicated by LM\nbrittleness: when a presumably biased outcome is observed, is it due to model\nbias or model brittleness? We propose enlisting the models themselves to help\nconstruct bias auditing datasets that remain challenging, and introduce bias\nmeasures that distinguish between different types of model errors. First, we\nextend an existing bias benchmark for NLI (BBNLI) using a combination of\nLM-generated lexical variations, adversarial filtering, and human validation.\nWe demonstrate that the newly created dataset BBNLI-next is more challenging\nthan BBNLI: on average, BBNLI-next reduces the accuracy of state-of-the-art NLI\nmodels from 95.3%, as observed by BBNLI, to a strikingly low 57.5%. Second, we\nemploy BBNLI-next to showcase the interplay between robustness and bias: we\npoint out shortcomings in current bias scores and propose bias measures that\ntake into account both bias and model brittleness. Third, despite the fact that\nBBNLI-next was designed with non-generative models in mind, we show that the\nnew dataset is also able to uncover bias in state-of-the-art open-source\ngenerative LMs.\n Note: All datasets included in this work are in English and they address\nUS-centered social biases. In the spirit of efficient NLP research, no model\ntraining or fine-tuning was performed to conduct this research.\n Warning: This paper contains offensive text examples.\n","authors":["Ioana Baldini","Chhavi Yadav","Manish Nagireddy","Payel Das","Kush R. Varshney"],"pdf_url":"https://arxiv.org/pdf/2305.12620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02135v2","updated":"2024-09-25T14:00:18Z","published":"2024-06-04T09:24:04Z","title":"Robust Interaction-Based Relevance Modeling for Online e-Commerce Search","summary":" Semantic relevance calculation is crucial for e-commerce search engines, as\nit ensures that the items selected closely align with customer intent.\nInadequate attention to this aspect can detrimentally affect user experience\nand engagement. Traditional text-matching techniques are prevalent but often\nfail to capture the nuances of search intent accurately, so neural networks now\nhave become a preferred solution to processing such complex text matching.\nExisting methods predominantly employ representation-based architectures, which\nstrike a balance between high traffic capacity and low latency. However, they\nexhibit significant shortcomings in generalization and robustness when compared\nto interaction-based architectures. In this work, we introduce a robust\ninteraction-based modeling paradigm to address these shortcomings. It\nencompasses 1) a dynamic length representation scheme for expedited inference,\n2) a professional terms recognition method to identify subjects and core\nattributes from complex sentence structures, and 3) a contrastive adversarial\ntraining protocol to bolster the model's robustness and matching capabilities.\nExtensive offline evaluations demonstrate the superior robustness and\neffectiveness of our approach, and online A/B testing confirms its ability to\nimprove relevance in the same exposure position, resulting in more clicks and\nconversions. To the best of our knowledge, this method is the first\ninteraction-based approach for large e-commerce search relevance calculation.\nNotably, we have deployed it for the entire search traffic on alibaba.com, the\nlargest B2B e-commerce platform in the world.\n","authors":["Ben Chen","Huangyu Dai","Xiang Ma","Wen Jiang","Wei Ning"],"pdf_url":"https://arxiv.org/pdf/2406.02135v2.pdf","comment":"Accepted by ECML-PKDD'24 as Outstanding Paper. 8 pages, 2 figures, 7\n tables"},{"id":"http://arxiv.org/abs/2409.16937v1","updated":"2024-09-25T13:51:19Z","published":"2024-09-25T13:51:19Z","title":"Semi-Supervised Cognitive State Classification from Speech with\n Multi-View Pseudo-Labeling","summary":" The lack of labeled data is a common challenge in speech classification\ntasks, particularly those requiring extensive subjective assessment, such as\ncognitive state classification. In this work, we propose a Semi-Supervised\nLearning (SSL) framework, introducing a novel multi-view pseudo-labeling method\nthat leverages both acoustic and linguistic characteristics to select the most\nconfident data for training the classification model. Acoustically, unlabeled\ndata are compared to labeled data using the Frechet audio distance, calculated\nfrom embeddings generated by multiple audio encoders. Linguistically, large\nlanguage models are prompted to revise automatic speech recognition\ntranscriptions and predict labels based on our proposed task-specific\nknowledge. High-confidence data are identified when pseudo-labels from both\nsources align, while mismatches are treated as low-confidence data. A bimodal\nclassifier is then trained to iteratively label the low-confidence data until a\npredefined criterion is met. We evaluate our SSL framework on emotion\nrecognition and dementia detection tasks. Experimental results demonstrate that\nour method achieves competitive performance compared to fully supervised\nlearning using only 30% of the labeled data and significantly outperforms two\nselected baselines.\n","authors":["Yuanchao Li","Zixing Zhang","Jing Han","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2409.16937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19280v3","updated":"2024-09-25T13:36:27Z","published":"2024-06-27T15:50:41Z","title":"HuatuoGPT-Vision, Towards Injecting Medical Visual Knowledge into\n Multimodal LLMs at Scale","summary":" The rapid development of multimodal large language models (MLLMs), such as\nGPT-4V, has led to significant advancements. However, these models still face\nchallenges in medical multimodal capabilities due to limitations in the\nquantity and quality of medical vision-text data, stemming from data privacy\nconcerns and high annotation costs. While pioneering approaches utilize\nPubMed's large-scale, de-identified medical image-text pairs to address these\nlimitations, they still fall short due to inherent data noise. To tackle this,\nwe refined medical image-text pairs from PubMed and employed MLLMs (GPT-4V) in\nan 'unblinded' capacity to denoise and reformat the data, resulting in the\ncreation of the PubMedVision dataset with 1.3 million medical VQA samples. Our\nvalidation demonstrates that: (1) PubMedVision can significantly enhance the\nmedical multimodal capabilities of current MLLMs, showing significant\nimprovement in benchmarks including the MMMU Health & Medicine track; (2)\nmanual checks by medical experts and empirical results validate the superior\ndata quality of our dataset compared to other data construction methods. Using\nPubMedVision, we train a 34B medical MLLM HuatuoGPT-Vision, which shows\nsuperior performance in medical multimodal scenarios among open-source MLLMs.\n","authors":["Junying Chen","Chi Gui","Ruyi Ouyang","Anningzhe Gao","Shunian Chen","Guiming Hardy Chen","Xidong Wang","Ruifei Zhang","Zhenyang Cai","Ke Ji","Guangjun Yu","Xiang Wan","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2406.19280v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16920v1","updated":"2024-09-25T13:27:17Z","published":"2024-09-25T13:27:17Z","title":"Cross-lingual Speech Emotion Recognition: Humans vs. Self-Supervised\n Models","summary":" Utilizing Self-Supervised Learning (SSL) models for Speech Emotion\nRecognition (SER) has proven effective, yet limited research has explored\ncross-lingual scenarios. This study presents a comparative analysis between\nhuman performance and SSL models, beginning with a layer-wise analysis and an\nexploration of parameter-efficient fine-tuning strategies in monolingual,\ncross-lingual, and transfer learning contexts. We further compare the SER\nability of models and humans at both utterance- and segment-levels.\nAdditionally, we investigate the impact of dialect on cross-lingual SER through\nhuman evaluation. Our findings reveal that models, with appropriate knowledge\ntransfer, can adapt to the target language and achieve performance comparable\nto native speakers. We also demonstrate the significant effect of dialect on\nSER for individuals without prior linguistic and paralinguistic background.\nMoreover, both humans and models exhibit distinct behaviors across different\nemotions. These results offer new insights into the cross-lingual SER\ncapabilities of SSL models, underscoring both their similarities to and\ndifferences from human emotion perception.\n","authors":["Zhichen Han","Tianqi Geng","Hui Feng","Jiahong Yuan","Korin Richmond","Yuanchao Li"],"pdf_url":"https://arxiv.org/pdf/2409.16920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16914v1","updated":"2024-09-25T13:18:57Z","published":"2024-09-25T13:18:57Z","title":"Zero-Shot Detection of LLM-Generated Text using Token Cohesiveness","summary":" The increasing capability and widespread usage of large language models\n(LLMs) highlight the desirability of automatic detection of LLM-generated text.\nZero-shot detectors, due to their training-free nature, have received\nconsiderable attention and notable success. In this paper, we identify a new\nfeature, token cohesiveness, that is useful for zero-shot detection, and we\ndemonstrate that LLM-generated text tends to exhibit higher token cohesiveness\nthan human-written text. Based on this observation, we devise TOCSIN, a generic\ndual-channel detection paradigm that uses token cohesiveness as a plug-and-play\nmodule to improve existing zero-shot detectors. To calculate token\ncohesiveness, TOCSIN only requires a few rounds of random token deletion and\nsemantic difference measurement, making it particularly suitable for a\npractical black-box setting where the source model used for generation is not\naccessible. Extensive experiments with four state-of-the-art base detectors on\nvarious datasets, source models, and evaluation settings demonstrate the\neffectiveness and generality of the proposed approach. Code available at:\n\\url{https://github.com/Shixuan-Ma/TOCSIN}.\n","authors":["Shixuan Ma","Quan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.16914v1.pdf","comment":"To appear at the main conference of EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.16911v1","updated":"2024-09-25T13:15:50Z","published":"2024-09-25T13:15:50Z","title":"Pruning Multilingual Large Language Models for Multilingual Inference","summary":" Multilingual large language models (MLLMs), trained on multilingual balanced\ndata, demonstrate better zero-shot learning performance in non-English\nlanguages compared to large language models trained on English-dominant data.\nHowever, the disparity in performance between English and non-English languages\nremains a challenge yet to be fully addressed. A distinctive characteristic of\nMLLMs is their high-quality translation capabilities, indicating an acquired\nproficiency in aligning between languages. This study explores how to enhance\nthe zero-shot performance of MLLMs in non-English languages by leveraging their\nalignment capability between English and non-English languages. To achieve\nthis, we first analyze the behavior of MLLMs when performing translation and\nreveal that there are large magnitude features that play a critical role in the\ntranslation process. Inspired by these findings, we retain the weights\nassociated with operations involving the large magnitude features and prune\nother weights to force MLLMs to rely on these features for tasks beyond\ntranslation. We empirically demonstrate that this pruning strategy can enhance\nthe MLLMs' performance in non-English language.\n","authors":["Hwichan Kim","Jun Suzuki","Tosho Hirasawa","Mamoru Komachi"],"pdf_url":"https://arxiv.org/pdf/2409.16911v1.pdf","comment":"Accepted at EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.16909v1","updated":"2024-09-25T13:13:21Z","published":"2024-09-25T13:13:21Z","title":"Enhancing Temporal Sensitivity and Reasoning for Time-Sensitive Question\n Answering","summary":" Time-Sensitive Question Answering (TSQA) demands the effective utilization of\nspecific temporal contexts, encompassing multiple time-evolving facts, to\naddress time-sensitive questions. This necessitates not only the parsing of\ntemporal information within questions but also the identification and\nunderstanding of time-evolving facts to generate accurate answers. However,\ncurrent large language models still have limited sensitivity to temporal\ninformation and their inadequate temporal reasoning capabilities.In this paper,\nwe propose a novel framework that enhances temporal awareness and reasoning\nthrough Temporal Information-Aware Embedding and Granular Contrastive\nReinforcement Learning. Experimental results on four TSQA datasets demonstrate\nthat our framework significantly outperforms existing LLMs in TSQA tasks,\nmarking a step forward in bridging the performance gap between machine and\nhuman temporal understanding and reasoning.\n","authors":["Wanqi Yang","Yanda Li","Meng Fang","Ling Chen"],"pdf_url":"https://arxiv.org/pdf/2409.16909v1.pdf","comment":"Accepted by EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.16900v1","updated":"2024-09-25T13:09:23Z","published":"2024-09-25T13:09:23Z","title":"A Roadmap for Embodied and Social Grounding in LLMs","summary":" The fusion of Large Language Models (LLMs) and robotic systems has led to a\ntransformative paradigm in the robotic field, offering unparalleled\ncapabilities not only in the communication domain but also in skills like\nmultimodal input handling, high-level reasoning, and plan generation. The\ngrounding of LLMs knowledge into the empirical world has been considered a\ncrucial pathway to exploit the efficiency of LLMs in robotics. Nevertheless,\nconnecting LLMs' representations to the external world with multimodal\napproaches or with robots' bodies is not enough to let them understand the\nmeaning of the language they are manipulating. Taking inspiration from humans,\nthis work draws attention to three necessary elements for an agent to grasp and\nexperience the world. The roadmap for LLMs grounding is envisaged in an active\nbodily system as the reference point for experiencing the environment, a\ntemporally structured experience for a coherent, self-related interaction with\nthe external world, and social skills to acquire a common-grounded shared\nexperience.\n","authors":["Sara Incao","Carlo Mazzola","Giulia Belgiovine","Alessandra Sciutti"],"pdf_url":"https://arxiv.org/pdf/2409.16900v1.pdf","comment":"Accepted Version of a conference paper presented at Robophilosophy\n Conference 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.17424v1","updated":"2024-09-25T23:24:56Z","published":"2024-09-25T23:24:56Z","title":"Results of the Big ANN: NeurIPS'23 competition","summary":" The 2023 Big ANN Challenge, held at NeurIPS 2023, focused on advancing the\nstate-of-the-art in indexing data structures and search algorithms for\npractical variants of Approximate Nearest Neighbor (ANN) search that reflect\nthe growing complexity and diversity of workloads. Unlike prior challenges that\nemphasized scaling up classical ANN search\n~\\cite{DBLP:conf/nips/SimhadriWADBBCH21}, this competition addressed filtered\nsearch, out-of-distribution data, sparse and streaming variants of ANNS.\nParticipants developed and submitted innovative solutions that were evaluated\non new standard datasets with constrained computational resources. The results\nshowcased significant improvements in search accuracy and efficiency over\nindustry-standard baselines, with notable contributions from both academic and\nindustrial teams. This paper summarizes the competition tracks, datasets,\nevaluation metrics, and the innovative approaches of the top-performing\nsubmissions, providing insights into the current advancements and future\ndirections in the field of approximate nearest neighbor search.\n","authors":["Harsha Vardhan Simhadri","Martin Aumüller","Amir Ingber","Matthijs Douze","George Williams","Magdalen Dobson Manohar","Dmitry Baranchuk","Edo Liberty","Frank Liu","Ben Landrum","Mazin Karjikar","Laxman Dhulipala","Meng Chen","Yue Chen","Rui Ma","Kai Zhang","Yuzheng Cai","Jiayang Shi","Yizhuo Chen","Weiguo Zheng","Zihao Wan","Jie Yin","Ben Huang"],"pdf_url":"https://arxiv.org/pdf/2409.17424v1.pdf","comment":"Code:\n https://github.com/harsha-simhadri/big-ann-benchmarks/releases/tag/v0.3.0"},{"id":"http://arxiv.org/abs/2409.17402v1","updated":"2024-09-25T22:26:29Z","published":"2024-09-25T22:26:29Z","title":"Enhancing Recommendation with Denoising Auxiliary Task","summary":" The historical interaction sequences of users plays a crucial role in\ntraining recommender systems that can accurately predict user preferences.\nHowever, due to the arbitrariness of user behavior, the presence of noise in\nthese sequences poses a challenge to predicting their next actions in\nrecommender systems. To address this issue, our motivation is based on the\nobservation that training noisy sequences and clean sequences (sequences\nwithout noise) with equal weights can impact the performance of the model. We\npropose a novel self-supervised Auxiliary Task Joint Training (ATJT) method\naimed at more accurately reweighting noisy sequences in recommender systems.\nSpecifically, we strategically select subsets from users' original sequences\nand perform random replacements to generate artificially replaced noisy\nsequences. Subsequently, we perform joint training on these artificially\nreplaced noisy sequences and the original sequences. Through effective\nreweighting, we incorporate the training results of the noise recognition model\ninto the recommender model. We evaluate our method on three datasets using a\nconsistent base model. Experimental results demonstrate the effectiveness of\nintroducing self-supervised auxiliary task to enhance the base model's\nperformance.\n","authors":["Pengsheng Liu","Linan Zheng","Jiale Chen","Guangfa Zhang","Yang Xu","Jinyun Fang"],"pdf_url":"https://arxiv.org/pdf/2409.17402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17383v1","updated":"2024-09-25T21:58:08Z","published":"2024-09-25T21:58:08Z","title":"VectorSearch: Enhancing Document Retrieval with Semantic Embeddings and\n Optimized Search","summary":" Traditional retrieval methods have been essential for assessing document\nsimilarity but struggle with capturing semantic nuances. Despite advancements\nin latent semantic analysis (LSA) and deep learning, achieving comprehensive\nsemantic understanding and accurate retrieval remains challenging due to high\ndimensionality and semantic gaps. The above challenges call for new techniques\nto effectively reduce the dimensions and close the semantic gaps. To this end,\nwe propose VectorSearch, which leverages advanced algorithms, embeddings, and\nindexing techniques for refined retrieval. By utilizing innovative multi-vector\nsearch operations and encoding searches with advanced language models, our\napproach significantly improves retrieval accuracy. Experiments on real-world\ndatasets show that VectorSearch outperforms baseline metrics, demonstrating its\nefficacy for large-scale retrieval tasks.\n","authors":["Solmaz Seyed Monir","Irene Lau","Shubing Yang","Dongfang Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.17383v1.pdf","comment":"10 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.05840v3","updated":"2024-09-25T20:50:00Z","published":"2024-08-11T18:22:12Z","title":"Iterative Improvement of an Additively Regularized Topic Model","summary":" Topic modelling is fundamentally a soft clustering problem (of known objects\n-- documents, over unknown clusters -- topics). That is, the task is\nincorrectly posed. In particular, the topic models are unstable and incomplete.\nAll this leads to the fact that the process of finding a good topic model\n(repeated hyperparameter selection, model training, and topic quality\nassessment) can be particularly long and labor-intensive. We aim to simplify\nthe process, to make it more deterministic and provable. To this end, we\npresent a method for iterative training of a topic model. The essence of the\nmethod is that a series of related topic models are trained so that each\nsubsequent model is at least as good as the previous one, i.e., that it retains\nall the good topics found earlier. The connection between the models is\nachieved by additive regularization. The result of this iterative training is\nthe last topic model in the series, which we call the iteratively updated\nadditively regularized topic model (ITAR). Experiments conducted on several\ncollections of natural language texts show that the proposed ITAR model\nperforms better than other popular topic models (LDA, ARTM, BERTopic), its\ntopics are diverse, and its perplexity (ability to \"explain\" the underlying\ndata) is moderate.\n","authors":["Alex Gorbulev","Vasiliy Alekseev","Konstantin Vorontsov"],"pdf_url":"https://arxiv.org/pdf/2408.05840v3.pdf","comment":"Make the last little additions to the draft"},{"id":"http://arxiv.org/abs/2306.04833v2","updated":"2024-09-25T17:01:50Z","published":"2023-06-07T23:24:50Z","title":"Unified Embedding Based Personalized Retrieval in Etsy Search","summary":" Embedding-based neural retrieval is a prevalent approach to address the\nsemantic gap problem which often arises in product search on tail queries. In\ncontrast, popular queries typically lack context and have a broad intent where\nadditional context from users historical interaction can be helpful. In this\npaper, we share our novel approach to address both: the semantic gap problem\nfollowed by an end to end trained model for personalized semantic retrieval. We\npropose learning a unified embedding model incorporating graph, transformer and\nterm-based embeddings end to end and share our design choices for optimal\ntradeoff between performance and efficiency. We share our learnings in feature\nengineering, hard negative sampling strategy, and application of transformer\nmodel, including a novel pre-training strategy and other tricks for improving\nsearch relevance and deploying such a model at industry scale. Our personalized\nretrieval model significantly improves the overall search experience, as\nmeasured by a 5.58% increase in search purchase rate and a 2.63% increase in\nsite-wide conversion rate, aggregated across multiple A/B tests - on live\ntraffic.\n","authors":["Rishikesh Jha","Siddharth Subramaniyam","Ethan Benjamin","Thrivikrama Taula"],"pdf_url":"https://arxiv.org/pdf/2306.04833v2.pdf","comment":"To appear at FMLDS 2024"},{"id":"http://arxiv.org/abs/2402.12997v5","updated":"2024-09-25T14:37:39Z","published":"2024-02-20T13:25:16Z","title":"Towards Trustworthy Reranking: A Simple yet Effective Abstention\n Mechanism","summary":" Neural Information Retrieval (NIR) has significantly improved upon\nheuristic-based Information Retrieval (IR) systems. Yet, failures remain\nfrequent, the models used often being unable to retrieve documents relevant to\nthe user's query. We address this challenge by proposing a lightweight\nabstention mechanism tailored for real-world constraints, with particular\nemphasis placed on the reranking phase. We introduce a protocol for evaluating\nabstention strategies in black-box scenarios (typically encountered when\nrelying on API services), demonstrating their efficacy, and propose a simple\nyet effective data-driven mechanism. We provide open-source code for experiment\nreplication and abstention implementation, fostering wider adoption and\napplication in diverse contexts.\n","authors":["Hippolyte Gisserot-Boukhlef","Manuel Faysse","Emmanuel Malherbe","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2402.12997v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02135v2","updated":"2024-09-25T14:00:18Z","published":"2024-06-04T09:24:04Z","title":"Robust Interaction-Based Relevance Modeling for Online e-Commerce Search","summary":" Semantic relevance calculation is crucial for e-commerce search engines, as\nit ensures that the items selected closely align with customer intent.\nInadequate attention to this aspect can detrimentally affect user experience\nand engagement. Traditional text-matching techniques are prevalent but often\nfail to capture the nuances of search intent accurately, so neural networks now\nhave become a preferred solution to processing such complex text matching.\nExisting methods predominantly employ representation-based architectures, which\nstrike a balance between high traffic capacity and low latency. However, they\nexhibit significant shortcomings in generalization and robustness when compared\nto interaction-based architectures. In this work, we introduce a robust\ninteraction-based modeling paradigm to address these shortcomings. It\nencompasses 1) a dynamic length representation scheme for expedited inference,\n2) a professional terms recognition method to identify subjects and core\nattributes from complex sentence structures, and 3) a contrastive adversarial\ntraining protocol to bolster the model's robustness and matching capabilities.\nExtensive offline evaluations demonstrate the superior robustness and\neffectiveness of our approach, and online A/B testing confirms its ability to\nimprove relevance in the same exposure position, resulting in more clicks and\nconversions. To the best of our knowledge, this method is the first\ninteraction-based approach for large e-commerce search relevance calculation.\nNotably, we have deployed it for the entire search traffic on alibaba.com, the\nlargest B2B e-commerce platform in the world.\n","authors":["Ben Chen","Huangyu Dai","Xiang Ma","Wen Jiang","Wei Ning"],"pdf_url":"https://arxiv.org/pdf/2406.02135v2.pdf","comment":"Accepted by ECML-PKDD'24 as Outstanding Paper. 8 pages, 2 figures, 7\n tables"},{"id":"http://arxiv.org/abs/2304.08851v2","updated":"2024-09-25T12:57:08Z","published":"2023-04-18T09:37:22Z","title":"A Personality-Guided Preference Aggregator for Ephemeral Group\n Recommendation","summary":" Ephemeral group recommendation (EGR) aims to suggest items for a group of\nusers who come together for the first time. Existing work typically consider\nindividual preferences as the sole factor in aggregating group preferences.\nHowever, they neglect to take into account the importance of the individual\ninherent factors, such as personality, and thus fail to accurately simulate the\ngroup decision-making process. Additionally, these methods often struggle due\nto insufficient interactive records. To tackle these issues, a\nPersonality-Guided Preference Aggregator (PEGA) is proposed, which guides the\npreference aggregation of group members based on their personalities, rather\nthan relying solely on their preferences. Specifically, implicit personalities\nare first extracted from user reviews. Hyper-rectangles are then used to\naggregate individual personalities to obtain the \"Group Personality\", which\nallows for the learning of personality distributions within the group.\nSubsequently, a personality attention mechanism is employed to aggregate group\npreferences, and a preference-based fine-tuning module is used to balance the\nweights of personality and preferences. The role of personality in this\napproach is twofold: (1) To estimate the importance of individual users in a\ngroup and provide explainability; (2) To alleviate the data sparsity issue\nencountered in ephemeral groups. Experimental results demonstrate that, on four\nreal-world datasets, the PEGA model significantly outperforms related baseline\nmodels in terms of classification accuracy and interpretability. Moreover,\nempirical evidence supports the idea that personality plays a pivotal role in\nenhancing the performance of EGR tasks.\n","authors":["Guangze Ye","Wen Wu","Liye Shi","Wenxin Hu","Xin Chen","Liang He"],"pdf_url":"https://arxiv.org/pdf/2304.08851v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16793v1","updated":"2024-09-25T10:14:01Z","published":"2024-09-25T10:14:01Z","title":"Spacewalker: Traversing Representation Spaces for Fast Interactive\n Exploration and Annotation of Unstructured Data","summary":" Unstructured data in industries such as healthcare, finance, and\nmanufacturing presents significant challenges for efficient analysis and\ndecision making. Detecting patterns within this data and understanding their\nimpact is critical but complex without the right tools. Traditionally, these\ntasks relied on the expertise of data analysts or labor-intensive manual\nreviews. In response, we introduce Spacewalker, an interactive tool designed to\nexplore and annotate data across multiple modalities. Spacewalker allows users\nto extract data representations and visualize them in low-dimensional spaces,\nenabling the detection of semantic similarities. Through extensive user\nstudies, we assess Spacewalker's effectiveness in data annotation and integrity\nverification. Results show that the tool's ability to traverse latent spaces\nand perform multi-modal queries significantly enhances the user's capacity to\nquickly identify relevant data. Moreover, Spacewalker allows for annotation\nspeed-ups far superior to conventional methods, making it a promising tool for\nefficiently navigating unstructured data and improving decision making\nprocesses. The code of this work is open-source and can be found at:\nhttps://github.com/code-lukas/Spacewalker\n","authors":["Lukas Heine","Fabian Hörst","Jana Fragemann","Gijs Luijten","Miriam Balzer","Jan Egger","Fin Bahnsen","M. Saquib Sarfraz","Jens Kleesiek","Constantin Seibold"],"pdf_url":"https://arxiv.org/pdf/2409.16793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16332v2","updated":"2024-09-25T09:36:49Z","published":"2024-06-24T06:10:13Z","title":"DemoRank: Selecting Effective Demonstrations for Large Language Models\n in Ranking Task","summary":" Recently, there has been increasing interest in applying large language\nmodels (LLMs) as zero-shot passage rankers. However, few studies have explored\nhow to select appropriate in-context demonstrations for the passage ranking\ntask, which is the focus of this paper. Previous studies mainly use LLM's\nfeedback to train a retriever for demonstration selection. These studies apply\nthe LLM to score each demonstration independently, which ignores the\ndependencies between demonstrations (especially important in ranking task),\nleading to inferior performance of top-$k$ retrieved demonstrations. To\nmitigate this issue, we introduce a demonstration reranker to rerank the\nretrieved demonstrations so that top-$k$ ranked ones are more suitable for ICL.\nHowever, generating training data for such reranker is quite challenging. On\nthe one hand, different from demonstration retriever, the training samples of\nreranker need to incorporate demonstration dependencies. On the other hand,\nobtaining the gold ranking from the retrieved demonstrations is an NP-hard\nproblem, which is hard to implement. To overcome these challenges, we propose a\nmethod to approximate the optimal demonstration list iteratively and utilize\nLLM to score demonstration lists of varying lengths. By doing so, the search\nspace is greatly reduced and demonstration dependencies are considered. Based\non these scored demonstration lists, we further design a list-pairwise training\napproach which compares a pair of lists that only differ in the last\ndemonstration, to teach the reranker how to select the next demonstration given\na previous sequence. In this paper, we propose a demonstration selection\nframework DemoRank for ranking task and conduct extensive experiments to prove\nits strong ability.\n","authors":["Wenhan Liu","Yutao Zhu","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2406.16332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07054v2","updated":"2024-09-25T09:19:00Z","published":"2023-11-13T03:42:17Z","title":"A Study of Implicit Ranking Unfairness in Large Language Models","summary":" Recently, Large Language Models (LLMs) have demonstrated a superior ability\nto serve as ranking models. However, concerns have arisen as LLMs will exhibit\ndiscriminatory ranking behaviors based on users' sensitive attributes (\\eg\ngender). Worse still, in this paper, we identify a subtler form of\ndiscrimination in LLMs, termed \\textit{implicit ranking unfairness}, where LLMs\nexhibit discriminatory ranking patterns based solely on non-sensitive user\nprofiles, such as user names. Such implicit unfairness is more widespread but\nless noticeable, threatening the ethical foundation. To comprehensively explore\nsuch unfairness, our analysis will focus on three research aspects: (1) We\npropose an evaluation method to investigate the severity of implicit ranking\nunfairness. (2) We uncover the reasons for causing such unfairness. (3) To\nmitigate such unfairness effectively, we utilize a pair-wise regression method\nto conduct fair-aware data augmentation for LLM fine-tuning. The experiment\ndemonstrates that our method outperforms existing approaches in ranking\nfairness, achieving this with only a small reduction in accuracy. Lastly, we\nemphasize the need for the community to identify and mitigate the implicit\nunfairness, aiming to avert the potential deterioration in the reinforced\nhuman-LLMs ecosystem deterioration.\n","authors":["Chen Xu","Wenjie Wang","Yuxin Li","Liang Pang","Jun Xu","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2311.07054v2.pdf","comment":"Accepted in EMNLP 2024 findings"},{"id":"http://arxiv.org/abs/2409.16760v1","updated":"2024-09-25T09:16:46Z","published":"2024-09-25T09:16:46Z","title":"Enhancing Automatic Keyphrase Labelling with Text-to-Text Transfer\n Transformer (T5) Architecture: A Framework for Keyphrase Generation and\n Filtering","summary":" Automatic keyphrase labelling stands for the ability of models to retrieve\nwords or short phrases that adequately describe documents' content. Previous\nwork has put much effort into exploring extractive techniques to address this\ntask; however, these methods cannot produce keyphrases not found in the text.\nGiven this limitation, keyphrase generation approaches have arisen lately. This\npaper presents a keyphrase generation model based on the Text-to-Text Transfer\nTransformer (T5) architecture. Having a document's title and abstract as input,\nwe learn a T5 model to generate keyphrases which adequately define its content.\nWe name this model docT5keywords. We not only perform the classic inference\napproach, where the output sequence is directly selected as the predicted\nvalues, but we also report results from a majority voting approach. In this\napproach, multiple sequences are generated, and the keyphrases are ranked based\non their frequency of occurrence across these sequences. Along with this model,\nwe present a novel keyphrase filtering technique based on the T5 architecture.\nWe train a T5 model to learn whether a given keyphrase is relevant to a\ndocument. We devise two evaluation methodologies to prove our model's\ncapability to filter inadequate keyphrases. First, we perform a binary\nevaluation where our model has to predict if a keyphrase is relevant for a\ngiven document. Second, we filter the predicted keyphrases by several AKG\nmodels and check if the evaluation scores are improved. Experimental results\ndemonstrate that our keyphrase generation model significantly outperforms all\nthe baselines, with gains exceeding 100\\% in some cases. The proposed filtering\ntechnique also achieves near-perfect accuracy in eliminating false positives\nacross all datasets.\n","authors":["Jorge Gabín","M. Eduardo Ares","Javier Parapar"],"pdf_url":"https://arxiv.org/pdf/2409.16760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14913v2","updated":"2024-09-25T08:52:49Z","published":"2024-09-23T11:08:04Z","title":"Towards a Realistic Long-Term Benchmark for Open-Web Research Agents","summary":" We present initial results of a forthcoming benchmark for evaluating LLM\nagents on white-collar tasks of economic value. We evaluate agents on\nreal-world \"messy\" open-web research tasks of the type that are routine in\nfinance and consulting. In doing so, we lay the groundwork for an LLM agent\nevaluation suite where good performance directly corresponds to a large\neconomic and societal impact. We built and tested several agent architectures\nwith o1-preview, GPT-4o, Claude-3.5 Sonnet, Llama 3.1 (405b), and GPT-4o-mini.\nOn average, LLM agents powered by Claude-3.5 Sonnet and o1-preview\nsubstantially outperformed agents using GPT-4o, with agents based on Llama 3.1\n(405b) and GPT-4o-mini lagging noticeably behind. Across LLMs, a ReAct\narchitecture with the ability to delegate subtasks to subagents performed best.\nIn addition to quantitative evaluations, we qualitatively assessed the\nperformance of the LLM agents by inspecting their traces and reflecting on\ntheir observations. Our evaluation represents the first in-depth assessment of\nagents' abilities to conduct challenging, economically valuable analyst-style\nresearch on the real open web.\n","authors":["Peter Mühlbacher","Nikos I. Bosse","Lawrence Phillips"],"pdf_url":"https://arxiv.org/pdf/2409.14913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16674v1","updated":"2024-09-25T07:06:14Z","published":"2024-09-25T07:06:14Z","title":"A Prompting-Based Representation Learning Method for Recommendation with\n Large Language Models","summary":" In recent years, Recommender Systems (RS) have witnessed a transformative\nshift with the advent of Large Language Models (LLMs) in the field of Natural\nLanguage Processing (NLP). Models such as GPT-3.5/4, Llama, have demonstrated\nunprecedented capabilities in understanding and generating human-like text. The\nextensive information pre-trained by these LLMs allows for the potential to\ncapture a more profound semantic representation from different contextual\ninformation of users and items.\n While the great potential lies behind the thriving of LLMs, the challenge of\nleveraging user-item preferences from contextual information and its alignment\nwith the improvement of Recommender Systems needs to be addressed. Believing\nthat a better understanding of the user or item itself can be the key factor in\nimproving recommendation performance, we conduct research on generating\ninformative profiles using state-of-the-art LLMs.\n To boost the linguistic abilities of LLMs in Recommender Systems, we\nintroduce the Prompting-Based Representation Learning Method for Recommendation\n(P4R). In our P4R framework, we utilize the LLM prompting strategy to create\npersonalized item profiles. These profiles are then transformed into semantic\nrepresentation spaces using a pre-trained BERT model for text embedding.\nFurthermore, we incorporate a Graph Convolution Network (GCN) for collaborative\nfiltering representation. The P4R framework aligns these two embedding spaces\nin order to address the general recommendation tasks. In our evaluation, we\ncompare P4R with state-of-the-art Recommender models and assess the quality of\nprompt-based profile generation.\n","authors":["Junyi Chen","Toyotaro Suzumura"],"pdf_url":"https://arxiv.org/pdf/2409.16674v1.pdf","comment":"Risks: The 1st International Workshop on Risks, Opportunities, and\n Evaluation of Generative Models in Recommendation"},{"id":"http://arxiv.org/abs/2403.00781v3","updated":"2024-09-25T06:31:09Z","published":"2024-02-18T06:07:17Z","title":"ChatDiet: Empowering Personalized Nutrition-Oriented Food Recommender\n Chatbots through an LLM-Augmented Framework","summary":" The profound impact of food on health necessitates advanced\nnutrition-oriented food recommendation services. Conventional methods often\nlack the crucial elements of personalization, explainability, and\ninteractivity. While Large Language Models (LLMs) bring interpretability and\nexplainability, their standalone use falls short of achieving true\npersonalization. In this paper, we introduce ChatDiet, a novel LLM-powered\nframework designed specifically for personalized nutrition-oriented food\nrecommendation chatbots. ChatDiet integrates personal and population models,\ncomplemented by an orchestrator, to seamlessly retrieve and process pertinent\ninformation. The personal model leverages causal discovery and inference\ntechniques to assess personalized nutritional effects for a specific user,\nwhereas the population model provides generalized information on food\nnutritional content. The orchestrator retrieves, synergizes and delivers the\noutput of both models to the LLM, providing tailored food recommendations\ndesigned to support targeted health outcomes. The result is a dynamic delivery\nof personalized and explainable food recommendations, tailored to individual\nuser preferences. Our evaluation of ChatDiet includes a compelling case study,\nwhere we establish a causal personal model to estimate individual nutrition\neffects. Our assessments, including a food recommendation test showcasing a\n92\\% effectiveness rate, coupled with illustrative dialogue examples,\nunderscore ChatDiet's strengths in explainability, personalization, and\ninteractivity.\n","authors":["Zhongqi Yang","Elahe Khatibi","Nitish Nagesh","Mahyar Abbasian","Iman Azimi","Ramesh Jain","Amir M. Rahmani"],"pdf_url":"https://arxiv.org/pdf/2403.00781v3.pdf","comment":"Published on Smart Health"},{"id":"http://arxiv.org/abs/2409.16633v1","updated":"2024-09-25T05:23:26Z","published":"2024-09-25T05:23:26Z","title":"PIFS-Rec: Process-In-Fabric-Switch for Large-Scale Recommendation System\n Inferences","summary":" Deep Learning Recommendation Models (DLRMs) have become increasingly popular\nand prevalent in today's datacenters, consuming most of the AI inference\ncycles. The performance of DLRMs is heavily influenced by available bandwidth\ndue to their large vector sizes in embedding tables and concurrent accesses. To\nachieve substantial improvements over existing solutions, novel approaches\ntowards DLRM optimization are needed, especially, in the context of emerging\ninterconnect technologies like CXL. This study delves into exploring\nCXL-enabled systems, implementing a process-in-fabric-switch (PIFS) solution to\naccelerate DLRMs while optimizing their memory and bandwidth scalability. We\npresent an in-depth characterization of industry-scale DLRM workloads running\non CXL-ready systems, identifying the predominant bottlenecks in existing CXL\nsystems. We, therefore, propose PIFS-Rec, a PIFS-based scheme that implements\nnear-data processing through downstream ports of the fabric switch. PIFS-Rec\nachieves a latency that is 3.89x lower than Pond, an industry-standard\nCXL-based system, and also outperforms BEACON, a state-of-the-art scheme, by\n2.03x.\n","authors":["Pingyi Huo","Anusha Devulapally","Hasan Al Maruf","Minseo Park","Krishnakumar Nair","Meena Arunachalam","Gulsum Gudukbay Akbulut","Mahmut Taylan Kandemir","Vijaykrishnan Narayanan"],"pdf_url":"https://arxiv.org/pdf/2409.16633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16627v1","updated":"2024-09-25T05:12:07Z","published":"2024-09-25T05:12:07Z","title":"Train Once, Deploy Anywhere: Matryoshka Representation Learning for\n Multimodal Recommendation","summary":" Despite recent advancements in language and vision modeling, integrating rich\nmultimodal knowledge into recommender systems continues to pose significant\nchallenges. This is primarily due to the need for efficient recommendation,\nwhich requires adaptive and interactive responses. In this study, we focus on\nsequential recommendation and introduce a lightweight framework called\nfull-scale Matryoshka representation learning for multimodal recommendation\n(fMRLRec). Our fMRLRec captures item features at different granularities,\nlearning informative representations for efficient recommendation across\nmultiple dimensions. To integrate item features from diverse modalities,\nfMRLRec employs a simple mapping to project multimodal item features into an\naligned feature space. Additionally, we design an efficient linear\ntransformation that embeds smaller features into larger ones, substantially\nreducing memory requirements for large-scale training on recommendation data.\nCombined with improved state space modeling techniques, fMRLRec scales to\ndifferent dimensions and only requires one-time training to produce multiple\nmodels tailored to various granularities. We demonstrate the effectiveness and\nefficiency of fMRLRec on multiple benchmark datasets, which consistently\nachieves superior performance over state-of-the-art baseline methods.\n","authors":["Yueqi Wang","Zhenrui Yue","Huimin Zeng","Dong Wang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.16627v1.pdf","comment":"Accepted to EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.16605v1","updated":"2024-09-25T04:12:38Z","published":"2024-09-25T04:12:38Z","title":"Evaluating and Enhancing Large Language Models for Novelty Assessment in\n Scholarly Publications","summary":" Recent studies have evaluated the creativity/novelty of large language models\n(LLMs) primarily from a semantic perspective, using benchmarks from cognitive\nscience. However, accessing the novelty in scholarly publications is a largely\nunexplored area in evaluating LLMs. In this paper, we introduce a scholarly\nnovelty benchmark (SchNovel) to evaluate LLMs' ability to assess novelty in\nscholarly papers. SchNovel consists of 15000 pairs of papers across six fields\nsampled from the arXiv dataset with publication dates spanning 2 to 10 years\napart. In each pair, the more recently published paper is assumed to be more\nnovel. Additionally, we propose RAG-Novelty, which simulates the review process\ntaken by human reviewers by leveraging the retrieval of similar papers to\nassess novelty. Extensive experiments provide insights into the capabilities of\ndifferent LLMs to assess novelty and demonstrate that RAG-Novelty outperforms\nrecent baseline models.\n","authors":["Ethan Lin","Zhiyuan Peng","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2409.16605v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2409.16594v1","updated":"2024-09-25T03:39:14Z","published":"2024-09-25T03:39:14Z","title":"Generative Pre-trained Ranking Model with Over-parameterization at\n Web-Scale (Extended Abstract)","summary":" Learning to rank (LTR) is widely employed in web searches to prioritize\npertinent webpages from retrieved content based on input queries. However,\ntraditional LTR models encounter two principal obstacles that lead to\nsuboptimal performance: (1) the lack of well-annotated query-webpage pairs with\nranking scores covering a diverse range of search query popularities, which\nhampers their ability to address queries across the popularity spectrum, and\n(2) inadequately trained models that fail to induce generalized representations\nfor LTR, resulting in overfitting. To address these challenges, we propose a\n\\emph{\\uline{G}enerative \\uline{S}emi-\\uline{S}upervised \\uline{P}re-trained}\n(GS2P) LTR model. We conduct extensive offline experiments on both a publicly\navailable dataset and a real-world dataset collected from a large-scale search\nengine. Furthermore, we deploy GS2P in a large-scale web search engine with\nrealistic traffic, where we observe significant improvements in the real-world\napplication.\n","authors":["Yuchen Li","Haoyi Xiong","Linghe Kong","Jiang Bian","Shuaiqiang Wang","Guihai Chen","Dawei Yin"],"pdf_url":"https://arxiv.org/pdf/2409.16594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16590v1","updated":"2024-09-25T03:33:47Z","published":"2024-09-25T03:33:47Z","title":"Pre-trained Graphformer-based Ranking at Web-scale Search (Extended\n Abstract)","summary":" Both Transformer and Graph Neural Networks (GNNs) have been employed in the\ndomain of learning to rank (LTR). However, these approaches adhere to two\ndistinct yet complementary problem formulations: ranking score regression based\non query-webpage pairs, and link prediction within query-webpage bipartite\ngraphs, respectively. While it is possible to pre-train GNNs or Transformers on\nsource datasets and subsequently fine-tune them on sparsely annotated LTR\ndatasets, the distributional shifts between the pair-based and bipartite graph\ndomains present significant challenges in integrating these heterogeneous\nmodels into a unified LTR framework at web scale. To address this, we introduce\nthe novel MPGraf model, which leverages a modular and capsule-based\npre-training strategy, aiming to cohesively integrate the regression\ncapabilities of Transformers with the link prediction strengths of GNNs. We\nconduct extensive offline and online experiments to rigorously evaluate the\nperformance of MPGraf.\n","authors":["Yuchen Li","Haoyi Xiong","Linghe Kong","Zeyi Sun","Hongyang Chen","Shuaiqiang Wang","Dawei Yin"],"pdf_url":"https://arxiv.org/pdf/2409.16590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12346v2","updated":"2024-09-25T03:18:14Z","published":"2024-07-17T06:42:14Z","title":"Object-Aware Query Perturbation for Cross-Modal Image-Text Retrieval","summary":" The pre-trained vision and language (V\\&L) models have substantially improved\nthe performance of cross-modal image-text retrieval. In general, however, V\\&L\nmodels have limited retrieval performance for small objects because of the\nrough alignment between words and the small objects in the image. In contrast,\nit is known that human cognition is object-centric, and we pay more attention\nto important objects, even if they are small. To bridge this gap between the\nhuman cognition and the V\\&L model's capability, we propose a cross-modal\nimage-text retrieval framework based on ``object-aware query perturbation.''\nThe proposed method generates a key feature subspace of the detected objects\nand perturbs the corresponding queries using this subspace to improve the\nobject awareness in the image. In our proposed method, object-aware cross-modal\nimage-text retrieval is possible while keeping the rich expressive power and\nretrieval performance of existing V\\&L models without additional fine-tuning.\nComprehensive experiments on four public datasets show that our method\noutperforms conventional algorithms. Our code is publicly available at\n\\url{https://github.com/NEC-N-SOGI/query-perturbation}.\n","authors":["Naoya Sogi","Takashi Shibata","Makoto Terao"],"pdf_url":"https://arxiv.org/pdf/2407.12346v2.pdf","comment":"ECCV 2024. Code: https://github.com/NEC-N-SOGI/query-perturbation"},{"id":"http://arxiv.org/abs/2409.16576v1","updated":"2024-09-25T03:14:01Z","published":"2024-09-25T03:14:01Z","title":"FusionANNS: An Efficient CPU/GPU Cooperative Processing Architecture for\n Billion-scale Approximate Nearest Neighbor Search","summary":" Approximate nearest neighbor search (ANNS) has emerged as a crucial component\nof database and AI infrastructure. Ever-increasing vector datasets pose\nsignificant challenges in terms of performance, cost, and accuracy for ANNS\nservices. None of modern ANNS systems can address these issues simultaneously.\nWe present FusionANNS, a high-throughput, low-latency, cost-efficient, and\nhigh-accuracy ANNS system for billion-scale datasets using SSDs and only one\nentry-level GPU. The key idea of FusionANNS lies in CPU/GPU collaborative\nfiltering and re-ranking mechanisms, which significantly reduce I/O operations\nacross CPUs, GPU, and SSDs to break through the I/O performance bottleneck.\nSpecifically, we propose three novel designs: (1) multi-tiered indexing to\navoid data swapping between CPUs and GPU, (2) heuristic re-ranking to eliminate\nunnecessary I/Os and computations while guaranteeing high accuracy, and (3)\nredundant-aware I/O deduplication to further improve I/O efficiency. We\nimplement FusionANNS and compare it with the state-of-the-art SSD-based ANNS\nsystem--SPANN and GPU-accelerated in-memory ANNS system--RUMMY. Experimental\nresults show that FusionANNS achieves 1) 9.4-13.1X higher query per second\n(QPS) and 5.7-8.8X higher cost efficiency compared with SPANN; 2) and 2-4.9X\nhigher QPS and 2.3-6.8X higher cost efficiency compared with RUMMY, while\nguaranteeing low latency and high accuracy.\n","authors":["Bing Tian","Haikun Liu","Yuhang Tang","Shihai Xiao","Zhuohui Duan","Xiaofei Liao","Xuecang Zhang","Junhua Zhu","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16576v1.pdf","comment":"15 pages, 26 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.17256v1","updated":"2024-09-25T18:12:19Z","published":"2024-09-25T18:12:19Z","title":"AIM 2024 Challenge on Efficient Video Super-Resolution for AV1\n Compressed Content","summary":" Video super-resolution (VSR) is a critical task for enhancing low-bitrate and\nlow-resolution videos, particularly in streaming applications. While numerous\nsolutions have been developed, they often suffer from high computational\ndemands, resulting in low frame rates (FPS) and poor power efficiency,\nespecially on mobile platforms. In this work, we compile different methods to\naddress these challenges, the solutions are end-to-end real-time video\nsuper-resolution frameworks optimized for both high performance and low\nruntime. We also introduce a new test set of high-quality 4K videos to further\nvalidate the approaches. The proposed solutions tackle video up-scaling for two\napplications: 540p to 4K (x4) as a general case, and 360p to 1080p (x3) more\ntailored towards mobile devices. In both tracks, the solutions have a reduced\nnumber of parameters and operations (MACs), allow high FPS, and improve VMAF\nand PSNR over interpolation baselines. This report gauges some of the most\nefficient video super-resolution methods to date.\n","authors":["Marcos V Conde","Zhijun Lei","Wen Li","Christos Bampis","Ioannis Katsavounidis","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2409.17256v1.pdf","comment":"European Conference on Computer Vision (ECCV) 2024 - Advances in\n Image Manipulation (AIM)"},{"id":"http://arxiv.org/abs/2409.17104v1","updated":"2024-09-25T17:16:53Z","published":"2024-09-25T17:16:53Z","title":"Language-oriented Semantic Communication for Image Transmission with\n Fine-Tuned Diffusion Model","summary":" Ubiquitous image transmission in emerging applications brings huge overheads\nto limited wireless resources. Since that text has the characteristic of\nconveying a large amount of information with very little data, the transmission\nof the descriptive text of an image can reduce the amount of transmitted data.\nIn this context, this paper develops a novel semantic communication framework\nbased on a text-2-image generative model (Gen-SC). In particular, a transmitter\nconverts the input image to textual modality data. Then the text is transmitted\nthrough a noisy channel to the receiver. The receiver then uses the received\ntext to generate images. Additionally, to improve the robustness of text\ntransmission over noisy channels, we designed a transformer-based text\ntransmission codec model. Moreover, we obtained a personalized knowledge base\nby fine-tuning the diffusion model to meet the requirements of task-oriented\ntransmission scenarios. Simulation results show that the proposed framework can\nachieve high perceptual quality with reducing the transmitted data volume by up\nto 99% and is robust to wireless channel noise in terms of portrait image\ntransmission.\n","authors":["Xinfeng Wei","Haonan Tong","Nuocheng Yang","Changchuan Yin"],"pdf_url":"https://arxiv.org/pdf/2409.17104v1.pdf","comment":"6 pages, 9 figures, accepted by Wireless Communications and Signal\n Processing (WCSP) 2024"},{"id":"http://arxiv.org/abs/2409.16937v1","updated":"2024-09-25T13:51:19Z","published":"2024-09-25T13:51:19Z","title":"Semi-Supervised Cognitive State Classification from Speech with\n Multi-View Pseudo-Labeling","summary":" The lack of labeled data is a common challenge in speech classification\ntasks, particularly those requiring extensive subjective assessment, such as\ncognitive state classification. In this work, we propose a Semi-Supervised\nLearning (SSL) framework, introducing a novel multi-view pseudo-labeling method\nthat leverages both acoustic and linguistic characteristics to select the most\nconfident data for training the classification model. Acoustically, unlabeled\ndata are compared to labeled data using the Frechet audio distance, calculated\nfrom embeddings generated by multiple audio encoders. Linguistically, large\nlanguage models are prompted to revise automatic speech recognition\ntranscriptions and predict labels based on our proposed task-specific\nknowledge. High-confidence data are identified when pseudo-labels from both\nsources align, while mismatches are treated as low-confidence data. A bimodal\nclassifier is then trained to iteratively label the low-confidence data until a\npredefined criterion is met. We evaluate our SSL framework on emotion\nrecognition and dementia detection tasks. Experimental results demonstrate that\nour method achieves competitive performance compared to fully supervised\nlearning using only 30% of the labeled data and significantly outperforms two\nselected baselines.\n","authors":["Yuanchao Li","Zixing Zhang","Jing Han","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2409.16937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05445v2","updated":"2024-09-25T11:02:53Z","published":"2024-08-10T05:33:05Z","title":"Navigating Weight Prediction with Diet Diary","summary":" Current research in food analysis primarily concentrates on tasks such as\nfood recognition, recipe retrieval and nutrition estimation from a single\nimage. Nevertheless, there is a significant gap in exploring the impact of food\nintake on physiological indicators (e.g., weight) over time. This paper\naddresses this gap by introducing the DietDiary dataset, which encompasses\ndaily dietary diaries and corresponding weight measurements of real users.\nFurthermore, we propose a novel task of weight prediction with a dietary diary\nthat aims to leverage historical food intake and weight to predict future\nweights. To tackle this task, we propose a model-agnostic time series\nforecasting framework. Specifically, we introduce a Unified Meal Representation\nLearning (UMRL) module to extract representations for each meal. Additionally,\nwe design a diet-aware loss function to associate food intake with weight\nvariations. By conducting experiments on the DietDiary dataset with two\nstate-of-the-art time series forecasting models, NLinear and iTransformer, we\ndemonstrate that our proposed framework achieves superior performance compared\nto the original models. We make our dataset, code, and models publicly\navailable at: https://yxg1005.github.io/weight-prediction/.\n","authors":["Yinxuan Gui","Bin Zhu","Jingjing Chen","Chong-Wah Ngo","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.05445v2.pdf","comment":"ACM MM'24 oral"},{"id":"http://arxiv.org/abs/2403.00781v3","updated":"2024-09-25T06:31:09Z","published":"2024-02-18T06:07:17Z","title":"ChatDiet: Empowering Personalized Nutrition-Oriented Food Recommender\n Chatbots through an LLM-Augmented Framework","summary":" The profound impact of food on health necessitates advanced\nnutrition-oriented food recommendation services. Conventional methods often\nlack the crucial elements of personalization, explainability, and\ninteractivity. While Large Language Models (LLMs) bring interpretability and\nexplainability, their standalone use falls short of achieving true\npersonalization. In this paper, we introduce ChatDiet, a novel LLM-powered\nframework designed specifically for personalized nutrition-oriented food\nrecommendation chatbots. ChatDiet integrates personal and population models,\ncomplemented by an orchestrator, to seamlessly retrieve and process pertinent\ninformation. The personal model leverages causal discovery and inference\ntechniques to assess personalized nutritional effects for a specific user,\nwhereas the population model provides generalized information on food\nnutritional content. The orchestrator retrieves, synergizes and delivers the\noutput of both models to the LLM, providing tailored food recommendations\ndesigned to support targeted health outcomes. The result is a dynamic delivery\nof personalized and explainable food recommendations, tailored to individual\nuser preferences. Our evaluation of ChatDiet includes a compelling case study,\nwhere we establish a causal personal model to estimate individual nutrition\neffects. Our assessments, including a food recommendation test showcasing a\n92\\% effectiveness rate, coupled with illustrative dialogue examples,\nunderscore ChatDiet's strengths in explainability, personalization, and\ninteractivity.\n","authors":["Zhongqi Yang","Elahe Khatibi","Nitish Nagesh","Mahyar Abbasian","Iman Azimi","Ramesh Jain","Amir M. Rahmani"],"pdf_url":"https://arxiv.org/pdf/2403.00781v3.pdf","comment":"Published on Smart Health"},{"id":"http://arxiv.org/abs/2404.15637v2","updated":"2024-09-25T01:17:48Z","published":"2024-04-24T04:18:31Z","title":"HybridVC: Efficient Voice Style Conversion with Text and Audio Prompts","summary":" We introduce HybridVC, a voice conversion (VC) framework built upon a\npre-trained conditional variational autoencoder (CVAE) that combines the\nstrengths of a latent model with contrastive learning. HybridVC supports text\nand audio prompts, enabling more flexible voice style conversion. HybridVC\nmodels a latent distribution conditioned on speaker embeddings acquired by a\npretrained speaker encoder and optimises style text embeddings to align with\nthe speaker style information through contrastive learning in parallel.\nTherefore, HybridVC can be efficiently trained under limited computational\nresources. Our experiments demonstrate HybridVC's superior training efficiency\nand its capability for advanced multi-modal voice style conversion. This\nunderscores its potential for widespread applications such as user-defined\npersonalised voice in various social media platforms. A comprehensive ablation\nstudy further validates the effectiveness of our method.\n","authors":["Xinlei Niu","Jing Zhang","Charles Patrick Martin"],"pdf_url":"https://arxiv.org/pdf/2404.15637v2.pdf","comment":"Proceedings of Interspeech"}]},"2024-09-24T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.16478v1","updated":"2024-09-24T21:54:22Z","published":"2024-09-24T21:54:22Z","title":"Algorithmic Drift: A Simulation Framework to Study the Effects of\n Recommender Systems on User Preferences","summary":" Digital platforms such as social media and e-commerce websites adopt\nRecommender Systems to provide value to the user. However, the social\nconsequences deriving from their adoption are still unclear. Many scholars\nargue that recommenders may lead to detrimental effects, such as\nbias-amplification deriving from the feedback loop between algorithmic\nsuggestions and users' choices. Nonetheless, the extent to which recommenders\ninfluence changes in users leaning remains uncertain. In this context, it is\nimportant to provide a controlled environment for evaluating the recommendation\nalgorithm before deployment. To address this, we propose a stochastic\nsimulation framework that mimics user-recommender system interactions in a\nlong-term scenario. In particular, we simulate the user choices by formalizing\na user model, which comprises behavioral aspects, such as the user resistance\ntowards the recommendation algorithm and their inertia in relying on the\nreceived suggestions. Additionally, we introduce two novel metrics for\nquantifying the algorithm's impact on user preferences, specifically in terms\nof drift over time. We conduct an extensive evaluation on multiple synthetic\ndatasets, aiming at testing the robustness of our framework when considering\ndifferent scenarios and hyper-parameters setting. The experimental results\nprove that the proposed methodology is effective in detecting and quantifying\nthe drift over the users preferences by means of the simulation. All the code\nand data used to perform the experiments are publicly available.\n","authors":["Erica Coppolillo","Simone Mungari","Ettore Ritacco","Francesco Fabbri","Marco Minici","Francesco Bonchi","Giuseppe Manco"],"pdf_url":"https://arxiv.org/pdf/2409.16478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16408v1","updated":"2024-09-24T19:17:15Z","published":"2024-09-24T19:17:15Z","title":"Modern Hopfield Networks meet Encoded Neural Representations --\n Addressing Practical Considerations","summary":" Content-addressable memories such as Modern Hopfield Networks (MHN) have been\nstudied as mathematical models of auto-association and storage/retrieval in the\nhuman declarative memory, yet their practical use for large-scale content\nstorage faces challenges. Chief among them is the occurrence of meta-stable\nstates, particularly when handling large amounts of high dimensional content.\nThis paper introduces Hopfield Encoding Networks (HEN), a framework that\nintegrates encoded neural representations into MHNs to improve pattern\nseparability and reduce meta-stable states. We show that HEN can also be used\nfor retrieval in the context of hetero association of images with natural\nlanguage queries, thus removing the limitation of requiring access to partial\ncontent in the same domain. Experimental results demonstrate substantial\nreduction in meta-stable states and increased storage capacity while still\nenabling perfect recall of a significantly larger number of inputs advancing\nthe practical utility of associative memory networks for real-world tasks.\n","authors":["Satyananda Kashyap","Niharika S. D'Souza","Luyao Shi","Ken C. L. Wong","Hongzhi Wang","Tanveer Syeda-Mahmood"],"pdf_url":"https://arxiv.org/pdf/2409.16408v1.pdf","comment":"17 pages, 8 figures, workshop submission to Neurips"},{"id":"http://arxiv.org/abs/2409.13711v2","updated":"2024-09-24T18:38:02Z","published":"2024-09-06T18:44:25Z","title":"WebQuest: A Benchmark for Multimodal QA on Web Page Sequences","summary":" The rise of powerful multimodal LLMs has enhanced the viability of building\nweb agents which can, with increasing levels of autonomy, assist users to\nretrieve information and complete tasks on various human-computer interfaces.\nIt is hence necessary to build challenging benchmarks that span a wide-variety\nof use cases reflecting real-world usage. In this work, we present WebQuest, a\nmulti-page question-answering dataset that requires reasoning across multiple\nrelated web pages. In contrast to existing UI benchmarks that focus on\nmulti-step web navigation and task completion, our dataset evaluates\ninformation extraction, multimodal retrieval and composition of information\nfrom many web pages. WebQuest includes three question categories: single-screen\nQA, multi-screen QA, and QA based on navigation traces. We evaluate leading\nproprietary multimodal models like GPT-4V, Gemini Flash, Claude 3, and open\nsource models like InstructBLIP, PaliGemma on our dataset, revealing a\nsignificant gap between single-screen and multi-screen reasoning. Finally, we\ninvestigate inference time techniques like Chain-of-Thought prompting to\nimprove model capabilities on multi-screen reasoning.\n","authors":["Maria Wang","Srinivas Sunkara","Gilles Baechler","Jason Lin","Yun Zhu","Fedir Zubach","Lei Shu","Jindong Chen"],"pdf_url":"https://arxiv.org/pdf/2409.13711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00859v2","updated":"2024-09-24T16:54:35Z","published":"2024-08-01T18:17:25Z","title":"GLoCIM: Global-view Long Chain Interest Modeling for news recommendation","summary":" Accurately recommending candidate news articles to users has always been the\ncore challenge of news recommendation system. News recommendations often\nrequire modeling of user interest to match candidate news. Recent efforts have\nprimarily focused on extracting local subgraph information in a global click\ngraph constructed by the clicked news sequence of all users. Howerer, the\ncomputational complexity of extracting global click graph information has\nhindered the ability to utilize far-reaching linkage which is hidden between\ntwo distant nodes in global click graph collaboratively among similar users. To\novercome the problem above, we propose a Global-view Long Chain Interests\nModeling for news recommendation (GLoCIM), which combines neighbor interest\nwith long chain interest distilled from a global click graph, leveraging the\ncollaboration among similar users to enhance news recommendation. We therefore\ndesign a long chain selection algorithm and long chain interest encoder to\nobtain global-view long chain interest from the global click graph. We design a\ngated network to integrate long chain interest with neighbor interest to\nachieve the collaborative interest among similar users. Subsequently we\naggregate it with local news category-enhanced representation to generate final\nuser representation. Then candidate news representation can be formed to match\nuser representation to achieve news recommendation. Experimental results on\nreal-world datasets validate the effectiveness of our method to improve the\nperformance of news recommendation.\n","authors":["Zhen Yang","Wenhui Wang","Tao Qi","Peng Zhang","Tianyun Zhang","Ru Zhang","Jianyi Liu","Yongfeng Huang"],"pdf_url":"https://arxiv.org/pdf/2408.00859v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16220v1","updated":"2024-09-24T16:31:33Z","published":"2024-09-24T16:31:33Z","title":"Towards Enhancing Linked Data Retrieval in Conversational UIs using\n Large Language Models","summary":" Despite the recent broad adoption of Large Language Models (LLMs) across\nvarious domains, their potential for enriching information systems in\nextracting and exploring Linked Data (LD) and Resource Description Framework\n(RDF) triplestores has not been extensively explored. This paper examines the\nintegration of LLMs within existing systems, emphasising the enhancement of\nconversational user interfaces (UIs) and their capabilities for data extraction\nby producing more accurate SPARQL queries without the requirement for model\nretraining. Typically, conversational UI models necessitate retraining with the\nintroduction of new datasets or updates, limiting their functionality as\ngeneral-purpose extraction tools. Our approach addresses this limitation by\nincorporating LLMs into the conversational UI workflow, significantly enhancing\ntheir ability to comprehend and process user queries effectively. By leveraging\nthe advanced natural language understanding capabilities of LLMs, our method\nimproves RDF entity extraction within web systems employing conventional\nchatbots. This integration facilitates a more nuanced and context-aware\ninteraction model, critical for handling the complex query patterns often\nencountered in RDF datasets and Linked Open Data (LOD) endpoints. The\nevaluation of this methodology shows a marked enhancement in system\nexpressivity and the accuracy of responses to user queries, indicating a\npromising direction for future research in this area. This investigation not\nonly underscores the versatility of LLMs in enhancing existing information\nsystems but also sets the stage for further explorations into their potential\napplications within more specialised domains of web information systems.\n","authors":["Omar Mussa","Omer Rana","Benoît Goossens","Pablo Orozco-Terwengel","Charith Perera"],"pdf_url":"https://arxiv.org/pdf/2409.16220v1.pdf","comment":"This paper has been accepted at the 25th International Web\n Information Systems Engineering Conference (WISE 2024)"},{"id":"http://arxiv.org/abs/2409.16182v1","updated":"2024-09-24T15:26:38Z","published":"2024-09-24T15:26:38Z","title":"TiM4Rec: An Efficient Sequential Recommendation Model Based on\n Time-Aware Structured State Space Duality Model","summary":" Sequential recommendation represents a pivotal branch of recommendation\nsystems, centered around dynamically analyzing the sequential dependencies\nbetween user preferences and their interactive behaviors. Despite the\nTransformer architecture-based models achieving commendable performance within\nthis domain, their quadratic computational complexity relative to the sequence\ndimension impedes efficient modeling. In response, the innovative Mamba\narchitecture, characterized by linear computational complexity, has emerged.\nMamba4Rec further pioneers the application of Mamba in sequential\nrecommendation. Nonetheless, Mamba 1's hardware-aware algorithm struggles to\nefficiently leverage modern matrix computational units, which lead to the\nproposal of the improved State Space Duality (SSD), also known as Mamba 2.\nWhile the SSD4Rec successfully adapts the SSD architecture for sequential\nrecommendation, showing promising results in high-dimensional contexts, it\nsuffers significant performance drops in low-dimensional scenarios crucial for\npure ID sequential recommendation tasks. Addressing this challenge, we propose\na novel sequential recommendation backbone model, TiM4Rec, which ameliorates\nthe low-dimensional performance loss of the SSD architecture while preserving\nits computational efficiency. Drawing inspiration from TiSASRec, we develop a\ntime-aware enhancement method tailored for the linear computation demands of\nthe SSD architecture, thereby enhancing its adaptability and achieving\nstate-of-the-art (SOTA) performance in both low and high-dimensional modeling.\nThe code for our model is publicly accessible at\nhttps://github.com/AlwaysFHao/TiM4Rec.\n","authors":["Hao Fan","Mengyi Zhu","Yanrong Hu","Hailin Feng","Zhijie He","Hongjiu Liu","Qingyang Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16143v1","updated":"2024-09-24T14:50:21Z","published":"2024-09-24T14:50:21Z","title":"Seeing Faces in Things: A Model and Dataset for Pareidolia","summary":" The human visual system is well-tuned to detect faces of all shapes and\nsizes. While this brings obvious survival advantages, such as a better chance\nof spotting unknown predators in the bush, it also leads to spurious face\ndetections. ``Face pareidolia'' describes the perception of face-like structure\namong otherwise random stimuli: seeing faces in coffee stains or clouds in the\nsky. In this paper, we study face pareidolia from a computer vision\nperspective. We present an image dataset of ``Faces in Things'', consisting of\nfive thousand web images with human-annotated pareidolic faces. Using this\ndataset, we examine the extent to which a state-of-the-art human face detector\nexhibits pareidolia, and find a significant behavioral gap between humans and\nmachines. We find that the evolutionary need for humans to detect animal faces,\nas well as human faces, may explain some of this gap. Finally, we propose a\nsimple statistical model of pareidolia in images. Through studies on human\nsubjects and our pareidolic face detectors we confirm a key prediction of our\nmodel regarding what image conditions are most likely to induce pareidolia.\nDataset and Website: https://aka.ms/faces-in-things\n","authors":["Mark Hamilton","Simon Stent","Vasha DuTell","Anne Harrington","Jennifer Corbett","Ruth Rosenholtz","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2409.16143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16096v1","updated":"2024-09-24T13:50:32Z","published":"2024-09-24T13:50:32Z","title":"Exploring Hint Generation Approaches in Open-Domain Question Answering","summary":" Automatic Question Answering (QA) systems rely on contextual information to\nprovide accurate answers. Commonly, contexts are prepared through either\nretrieval-based or generation-based methods. The former involves retrieving\nrelevant documents from a corpus like Wikipedia, whereas the latter uses\ngenerative models such as Large Language Models (LLMs) to generate the context.\nIn this paper, we introduce a novel context preparation approach called HINTQA,\nwhich employs Automatic Hint Generation (HG) techniques. Unlike traditional\nmethods, HINTQA prompts LLMs to produce hints about potential answers for the\nquestion rather than generating relevant context. We evaluate our approach\nacross three QA datasets including TriviaQA, NaturalQuestions, and Web\nQuestions, examining how the number and order of hints impact performance. Our\nfindings show that the HINTQA surpasses both retrieval-based and\ngeneration-based approaches. We demonstrate that hints enhance the accuracy of\nanswers more than retrieved and generated contexts.\n","authors":["Jamshid Mozafari","Abdelrahman Abdallah","Bhawna Piryani","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2409.16096v1.pdf","comment":"Accepted at EMNLP 2024"},{"id":"http://arxiv.org/abs/2408.09847v3","updated":"2024-09-24T13:11:43Z","published":"2024-08-19T09:50:20Z","title":"Fashion Image-to-Image Translation for Complementary Item Retrieval","summary":" The increasing demand for online fashion retail has boosted research in\nfashion compatibility modeling and item retrieval, focusing on matching user\nqueries (textual descriptions or reference images) with compatible fashion\nitems. A key challenge is top-bottom retrieval, where precise compatibility\nmodeling is essential. Traditional methods, often based on Bayesian\nPersonalized Ranking (BPR), have shown limited performance. Recent efforts have\nexplored using generative models in compatibility modeling and item retrieval,\nwhere generated images serve as additional inputs. However, these approaches\noften overlook the quality of generated images, which could be crucial for\nmodel performance. Additionally, generative models typically require large\ndatasets, posing challenges when such data is scarce.\n To address these issues, we introduce the Generative Compatibility Model\n(GeCo), a two-stage approach that improves fashion image retrieval through\npaired image-to-image translation. First, the Complementary Item Generation\nModel (CIGM), built on Conditional Generative Adversarial Networks (GANs),\ngenerates target item images (e.g., bottoms) from seed items (e.g., tops),\noffering conditioning signals for retrieval. These generated samples are then\nintegrated into GeCo, enhancing compatibility modeling and retrieval accuracy.\nEvaluations on three datasets show that GeCo outperforms state-of-the-art\nbaselines. Key contributions include: (i) the GeCo model utilizing paired\nimage-to-image translation within the Composed Image Retrieval framework, (ii)\ncomprehensive evaluations on benchmark datasets, and (iii) the release of a new\nFashion Taobao dataset designed for top-bottom retrieval, promoting further\nresearch.\n","authors":["Matteo Attimonelli","Claudio Pomo","Dietmar Jannach","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2408.09847v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09031v2","updated":"2024-09-24T12:30:20Z","published":"2024-03-14T01:46:56Z","title":"Projected Gradient Descent for Spectral Compressed Sensing via Symmetric\n Hankel Factorization","summary":" Current spectral compressed sensing methods via Hankel matrix completion\nemploy symmetric factorization to demonstrate the low-rank property of the\nHankel matrix. However, previous non-convex gradient methods only utilize\nasymmetric factorization to achieve spectral compressed sensing. In this paper,\nwe propose a novel nonconvex projected gradient descent method for spectral\ncompressed sensing via symmetric factorization named Symmetric Hankel Projected\nGradient Descent (SHGD), which updates only one matrix and avoids a balancing\nregularization term. SHGD reduces about half of the computation and storage\ncosts compared to the prior gradient method based on asymmetric factorization.\n{Besides, the symmetric factorization employed in our work is completely novel\nto the prior low-rank factorization model, introducing a new factorization\nambiguity under complex orthogonal transformation}. Novel distance metrics are\ndesigned for our factorization method and a linear convergence guarantee to the\ndesired signal is established with $O(r^2\\log(n))$ observations. Numerical\nsimulations demonstrate the superior performance of the proposed SHGD method in\nphase transitions and computation efficiency compared to state-of-the-art\nmethods.\n","authors":["Jinsheng Li","Wei Cui","Xu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.09031v2.pdf","comment":"accepted in IEEE Transactions on Signal Processing"},{"id":"http://arxiv.org/abs/2307.02147v3","updated":"2024-09-24T10:42:43Z","published":"2023-07-05T09:42:51Z","title":"Recommendation Unlearning via Influence Function","summary":" Recommendation unlearning is an emerging task to serve users for erasing\nunusable data (e.g., some historical behaviors) from a well-trained recommender\nmodel. Existing methods process unlearning requests by fully or partially\nretraining the model after removing the unusable data. However, these methods\nare impractical due to the high computation cost of full retraining and the\nhighly possible performance damage of partial training. In this light, a\ndesired recommendation unlearning method should obtain a similar model as full\nretraining in a more efficient manner, i.e., achieving complete, efficient and\nharmless unlearning.\n In this work, we propose a new Influence Function-based Recommendation\nUnlearning (IFRU) framework, which efficiently updates the model without\nretraining by estimating the influence of the unusable data on the model via\nthe influence function. In the light that recent recommender models use\nhistorical data for both the constructions of the optimization loss and the\ncomputational graph (e.g., neighborhood aggregation), IFRU jointly estimates\nthe direct influence of unusable data on optimization loss and the spillover\ninfluence on the computational graph to pursue complete unlearning.\nFurthermore, we propose an importance-based pruning algorithm to reduce the\ncost of the influence function. IFRU is harmless and applicable to mainstream\ndifferentiable models. Extensive experiments demonstrate that IFRU achieves\nmore than 250 times acceleration compared to retraining-based methods with\nrecommendation performance comparable to full retraining. Codes are avaiable at\nhttps://github.com/baiyimeng/IFRU.\n","authors":["Yang Zhang","Zhiyu Hu","Yimeng Bai","Jiancan Wu","Qifan Wang","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2307.02147v3.pdf","comment":"Accepted by ACM TORS"},{"id":"http://arxiv.org/abs/2409.15933v1","updated":"2024-09-24T09:57:25Z","published":"2024-09-24T09:57:25Z","title":"SLIMER-IT: Zero-Shot NER on Italian Language","summary":" Traditional approaches to Named Entity Recognition (NER) frame the task into\na BIO sequence labeling problem. Although these systems often excel in the\ndownstream task at hand, they require extensive annotated data and struggle to\ngeneralize to out-of-distribution input domains and unseen entity types. On the\ncontrary, Large Language Models (LLMs) have demonstrated strong zero-shot\ncapabilities. While several works address Zero-Shot NER in English, little has\nbeen done in other languages. In this paper, we define an evaluation framework\nfor Zero-Shot NER, applying it to the Italian language. Furthermore, we\nintroduce SLIMER-IT, the Italian version of SLIMER, an instruction-tuning\napproach for zero-shot NER leveraging prompts enriched with definition and\nguidelines. Comparisons with other state-of-the-art models, demonstrate the\nsuperiority of SLIMER-IT on never-seen-before entity tags.\n","authors":["Andrew Zamai","Leonardo Rigutini","Marco Maggini","Andrea Zugarini"],"pdf_url":"https://arxiv.org/pdf/2409.15933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15857v1","updated":"2024-09-24T08:29:10Z","published":"2024-09-24T08:29:10Z","title":"Ducho meets Elliot: Large-scale Benchmarks for Multimodal Recommendation","summary":" In specific domains like fashion, music, and movie recommendation, the\nmulti-faceted features characterizing products and services may influence each\ncustomer on online selling platforms differently, paving the way to novel\nmultimodal recommendation models that can learn from such multimodal content.\nAccording to the literature, the common multimodal recommendation pipeline\ninvolves (i) extracting multimodal features, (ii) refining their high-level\nrepresentations to suit the recommendation task, (iii) optionally fusing all\nmultimodal features, and (iv) predicting the user-item score. While great\neffort has been put into designing optimal solutions for (ii-iv), to the best\nof our knowledge, very little attention has been devoted to exploring\nprocedures for (i). In this respect, the existing literature outlines the large\navailability of multimodal datasets and the ever-growing number of large models\naccounting for multimodal-aware tasks, but (at the same time) an unjustified\nadoption of limited standardized solutions. This motivates us to explore more\nextensive techniques for the (i) stage of the pipeline. To this end, this paper\nsettles as the first attempt to offer a large-scale benchmarking for multimodal\nrecommender systems, with a specific focus on multimodal extractors.\nSpecifically, we take advantage of two popular and recent frameworks for\nmultimodal feature extraction and reproducibility in recommendation, Ducho and\nElliot, to offer a unified and ready-to-use experimental environment able to\nrun extensive benchmarking analyses leveraging novel multimodal feature\nextractors. Results, largely validated under different hyper-parameter settings\nfor the chosen extractors, provide important insights on how to train and tune\nthe next generation of multimodal recommendation algorithms.\n","authors":["Matteo Attimonelli","Danilo Danese","Angela Di Fazio","Daniele Malitesta","Claudio Pomo","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2409.15857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15828v1","updated":"2024-09-24T07:47:04Z","published":"2024-09-24T07:47:04Z","title":"Mitigating Digital Discrimination in Dating Apps -- The Dutch Breeze\n case","summary":" In September 2023, the Netherlands Institute for Human Rights, the Dutch\nnon-discrimination authority, decided that Breeze, a Dutch dating app, was\njustified in suspecting that their algorithm discriminated against non-white.\nConsequently, the Institute decided that Breeze must prevent this\ndiscrimination based on ethnicity. This paper explores two questions. (i) Is\nthe discrimination based on ethnicity in Breeze's matching algorithm illegal?\n(ii) How can dating apps mitigate or stop discrimination in their matching\nalgorithms? We illustrate the legal and technical difficulties dating apps face\nin tackling discrimination and illustrate promising solutions. We analyse the\nBreeze decision in-depth, combining insights from computer science and law. We\ndiscuss the implications of this judgment for scholarship and practice in the\nfield of fair and non-discriminatory machine learning.\n","authors":["Tim de Jonge","Frederik Zuiderveen Borgesius"],"pdf_url":"https://arxiv.org/pdf/2409.15828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15724v1","updated":"2024-09-24T04:17:21Z","published":"2024-09-24T04:17:21Z","title":"LLM-Cure: LLM-based Competitor User Review Analysis for Feature\n Enhancement","summary":" The exponential growth of the mobile app market underscores the importance of\nconstant innovation and rapid response to user demands. As user satisfaction is\nparamount to the success of a mobile application (app), developers typically\nrely on user reviews, which represent user feedback that includes ratings and\ncomments to identify areas for improvement. However, the sheer volume of user\nreviews poses challenges in manual analysis, necessitating automated\napproaches. Existing automated approaches either analyze only the target apps\nreviews, neglecting the comparison of similar features to competitors or fail\nto provide suggestions for feature enhancement. To address these gaps, we\npropose a Large Language Model (LLM)-based Competitive User Review Analysis for\nFeature Enhancement) (LLM-Cure), an approach powered by LLMs to automatically\ngenerate suggestion s for mobile app feature improvements. More specifically,\nLLM-Cure identifies and categorizes features within reviews by applying LLMs.\nWhen provided with a complaint in a user review, LLM-Cure curates highly rated\n(4 and 5 stars) reviews in competing apps related to the complaint and proposes\npotential improvements tailored to the target application. We evaluate LLM-Cure\non 1,056,739 reviews of 70 popular Android apps. Our evaluation demonstrates\nthat LLM-Cure significantly outperforms the state-of-the-art approaches in\nassigning features to reviews by up to 13% in F1-score, up to 16% in recall and\nup to 11% in precision. Additionally, LLM-Cure demonstrates its capability to\nprovide suggestions for resolving user complaints. We verify the suggestions\nusing the release notes that reflect the changes of features in the target\nmobile app. LLM-Cure achieves a promising average of 73% of the implementation\nof the provided suggestions.\n","authors":["Maram Assi","Safwat Hassan","Ying Zou"],"pdf_url":"https://arxiv.org/pdf/2409.15724v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2409.15700v1","updated":"2024-09-24T03:30:19Z","published":"2024-09-24T03:30:19Z","title":"Making Text Embedders Few-Shot Learners","summary":" Large language models (LLMs) with decoder-only architectures demonstrate\nremarkable in-context learning (ICL) capabilities. This feature enables them to\neffectively handle both familiar and novel tasks by utilizing examples provided\nwithin their input context. Recognizing the potential of this capability, we\npropose leveraging the ICL feature in LLMs to enhance the process of text\nembedding generation. To this end, we introduce a novel model bge-en-icl, which\nemploys few-shot examples to produce high-quality text embeddings. Our approach\nintegrates task-related examples directly into the query side, resulting in\nsignificant improvements across various tasks. Additionally, we have\ninvestigated how to effectively utilize LLMs as embedding models, including\nvarious attention mechanisms, pooling methods, etc. Our findings suggest that\nretaining the original framework often yields the best results, underscoring\nthat simplicity is best. Experimental results on the MTEB and AIR-Bench\nbenchmarks demonstrate that our approach sets new state-of-the-art (SOTA)\nperformance. Our model, code and dataset are freely available at\nhttps://github.com/FlagOpen/FlagEmbedding .\n","authors":["Chaofan Li","MingHao Qin","Shitao Xiao","Jianlyu Chen","Kun Luo","Yingxia Shao","Defu Lian","Zheng Liu"],"pdf_url":"https://arxiv.org/pdf/2409.15700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15690v1","updated":"2024-09-24T03:06:25Z","published":"2024-09-24T03:06:25Z","title":"A Survey of Stance Detection on Social Media: New Directions and\n Perspectives","summary":" In modern digital environments, users frequently express opinions on\ncontentious topics, providing a wealth of information on prevailing attitudes.\nThe systematic analysis of these opinions offers valuable insights for\ndecision-making in various sectors, including marketing and politics. As a\nresult, stance detection has emerged as a crucial subfield within affective\ncomputing, enabling the automatic detection of user stances in social media\nconversations and providing a nuanced understanding of public sentiment on\ncomplex issues. Recent years have seen a surge of research interest in\ndeveloping effective stance detection methods, with contributions from multiple\ncommunities, including natural language processing, web science, and social\ncomputing. This paper provides a comprehensive survey of stance detection\ntechniques on social media, covering task definitions, datasets, approaches,\nand future works. We review traditional stance detection models, as well as\nstate-of-the-art methods based on large language models, and discuss their\nstrengths and limitations. Our survey highlights the importance of stance\ndetection in understanding public opinion and sentiment, and identifies gaps in\ncurrent research. We conclude by outlining potential future directions for\nstance detection on social media, including the need for more robust and\ngeneralizable models, and the importance of addressing emerging challenges such\nas multi-modal stance detection and stance detection in low-resource languages.\n","authors":["Bowen Zhang","Genan Dai","Fuqiang Niu","Nan Yin","Xiaomao Fan","Hu Huang"],"pdf_url":"https://arxiv.org/pdf/2409.15690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07597v5","updated":"2024-09-24T03:01:25Z","published":"2023-09-14T10:57:50Z","title":"C-Pack: Packed Resources For General Chinese Embeddings","summary":" We introduce C-Pack, a package of resources that significantly advance the\nfield of general Chinese embeddings. C-Pack includes three critical resources.\n1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6\ntasks and 35 datasets. 2) C-MTP is a massive text embedding dataset curated\nfrom labeled and unlabeled Chinese corpora for training embedding models. 3)\nC-TEM is a family of embedding models covering multiple sizes. Our models\noutperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the\ntime of the release. We also integrate and optimize the entire suite of\ntraining methods for C-TEM. Along with our resources on general Chinese\nembedding, we release our data and models for English text embeddings. The\nEnglish models achieve state-of-the-art performance on MTEB benchmark;\nmeanwhile, our released English data is 2 times larger than the Chinese data.\nAll these resources are made publicly available at\nhttps://github.com/FlagOpen/FlagEmbedding.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Niklas Muennighoff","Defu Lian","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2309.07597v5.pdf","comment":"SIGIR 2024"},{"id":"http://arxiv.org/abs/2409.06793v2","updated":"2024-09-24T02:09:10Z","published":"2024-09-10T18:02:51Z","title":"Adversarial Attacks to Multi-Modal Models","summary":" Multi-modal models have gained significant attention due to their powerful\ncapabilities. These models effectively align embeddings across diverse data\nmodalities, showcasing superior performance in downstream tasks compared to\ntheir unimodal counterparts. Recent study showed that the attacker can\nmanipulate an image or audio file by altering it in such a way that its\nembedding matches that of an attacker-chosen targeted input, thereby deceiving\ndownstream models. However, this method often underperforms due to inherent\ndisparities in data from different modalities. In this paper, we introduce\nCrossFire, an innovative approach to attack multi-modal models. CrossFire\nbegins by transforming the targeted input chosen by the attacker into a format\nthat matches the modality of the original image or audio file. We then\nformulate our attack as an optimization problem, aiming to minimize the angular\ndeviation between the embeddings of the transformed input and the modified\nimage or audio file. Solving this problem determines the perturbations to be\nadded to the original media. Our extensive experiments on six real-world\nbenchmark datasets reveal that CrossFire can significantly manipulate\ndownstream tasks, surpassing existing attacks. Additionally, we evaluate six\ndefensive strategies against CrossFire, finding that current defenses are\ninsufficient to counteract our CrossFire.\n","authors":["Zhihao Dou","Xin Hu","Haibo Yang","Zhuqing Liu","Minghong Fang"],"pdf_url":"https://arxiv.org/pdf/2409.06793v2.pdf","comment":"To appear in the ACM Workshop on Large AI Systems and Models with\n Privacy and Safety Analysis 2024 (LAMPS '24)"},{"id":"http://arxiv.org/abs/2409.15626v1","updated":"2024-09-24T00:09:41Z","published":"2024-09-24T00:09:41Z","title":"Qualitative Insights Tool (QualIT): LLM Enhanced Topic Modeling","summary":" Topic modeling is a widely used technique for uncovering thematic structures\nfrom large text corpora. However, most topic modeling approaches e.g. Latent\nDirichlet Allocation (LDA) struggle to capture nuanced semantics and contextual\nunderstanding required to accurately model complex narratives. Recent\nadvancements in this area include methods like BERTopic, which have\ndemonstrated significantly improved topic coherence and thus established a new\nstandard for benchmarking. In this paper, we present a novel approach, the\nQualitative Insights Tool (QualIT) that integrates large language models (LLMs)\nwith existing clustering-based topic modeling approaches. Our method leverages\nthe deep contextual understanding and powerful language generation capabilities\nof LLMs to enrich the topic modeling process using clustering. We evaluate our\napproach on a large corpus of news articles and demonstrate substantial\nimprovements in topic coherence and topic diversity compared to baseline topic\nmodeling techniques. On the 20 ground-truth topics, our method shows 70% topic\ncoherence (vs 65% & 57% benchmarks) and 95.5% topic diversity (vs 85% & 72%\nbenchmarks). Our findings suggest that the integration of LLMs can unlock new\nopportunities for topic modeling of dynamic and complex text data, as is common\nin talent management research contexts.\n","authors":["Satya Kapoor","Alex Gil","Sreyoshi Bhaduri","Anshul Mittal","Rutu Mulkar"],"pdf_url":"https://arxiv.org/pdf/2409.15626v1.pdf","comment":"6 pages, 4 tables, 1 figure"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.16404v1","updated":"2024-09-24T19:06:18Z","published":"2024-09-24T19:06:18Z","title":"FastTalker: Jointly Generating Speech and Conversational Gestures from\n Text","summary":" Generating 3D human gestures and speech from a text script is critical for\ncreating realistic talking avatars. One solution is to leverage separate\npipelines for text-to-speech (TTS) and speech-to-gesture (STG), but this\napproach suffers from poor alignment of speech and gestures and slow inference\ntimes. In this paper, we introduce FastTalker, an efficient and effective\nframework that simultaneously generates high-quality speech audio and 3D human\ngestures at high inference speeds. Our key insight is reusing the intermediate\nfeatures from speech synthesis for gesture generation, as these features\ncontain more precise rhythmic information than features re-extracted from\ngenerated speech. Specifically, 1) we propose an end-to-end framework that\nconcurrently generates speech waveforms and full-body gestures, using\nintermediate speech features such as pitch, onset, energy, and duration\ndirectly for gesture decoding; 2) we redesign the causal network architecture\nto eliminate dependencies on future inputs for real applications; 3) we employ\nReinforcement Learning-based Neural Architecture Search (NAS) to enhance both\nperformance and inference speed by optimizing our network architecture.\nExperimental results on the BEAT2 dataset demonstrate that FastTalker achieves\nstate-of-the-art performance in both speech synthesis and gesture generation,\nprocessing speech and gestures in 0.17 seconds per second on an NVIDIA 3090.\n","authors":["Zixin Guo","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.16404v1.pdf","comment":"European Conference on Computer Vision Workshop"},{"id":"http://arxiv.org/abs/2409.16136v1","updated":"2024-09-24T14:43:14Z","published":"2024-09-24T14:43:14Z","title":"HA-FGOVD: Highlighting Fine-grained Attributes via Explicit Linear\n Composition for Open-Vocabulary Object Detection","summary":" Open-vocabulary object detection (OVD) models are considered to be Large\nMulti-modal Models (LMM), due to their extensive training data and a large\nnumber of parameters. Mainstream OVD models prioritize object coarse-grained\ncategory rather than focus on their fine-grained attributes, e.g., colors or\nmaterials, thus failed to identify objects specified with certain attributes.\nHowever, OVD models are pretrained on large-scale image-text pairs with rich\nattribute words, whose latent feature space can represent the global text\nfeature as a linear composition of fine-grained attribute tokens without\nhighlighting them. Therefore, we propose in this paper a universal and explicit\napproach for frozen mainstream OVD models that boosts their attribute-level\ndetection capabilities by highlighting fine-grained attributes in explicit\nlinear space. Firstly, a LLM is leveraged to highlight attribute words within\nthe input text as a zero-shot prompted task. Secondly, by strategically\nadjusting the token masks, the text encoders of OVD models extract both global\ntext and attribute-specific features, which are then explicitly composited as\ntwo vectors in linear space to form the new attribute-highlighted feature for\ndetection tasks, where corresponding scalars are hand-crafted or learned to\nreweight both two vectors. Notably, these scalars can be seamlessly transferred\namong different OVD models, which proves that such an explicit linear\ncomposition is universal. Empirical evaluation on the FG-OVD dataset\ndemonstrates that our proposed method uniformly improves fine-grained\nattribute-level OVD of various mainstream models and achieves new\nstate-of-the-art performance.\n","authors":["Yuqi Ma","Mengyin Liu","Chao Zhu","Xu-Cheng Yin"],"pdf_url":"https://arxiv.org/pdf/2409.16136v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2409.15813v1","updated":"2024-09-24T07:19:30Z","published":"2024-09-24T07:19:30Z","title":"Layer-wise Model Merging for Unsupervised Domain Adaptation in\n Segmentation Tasks","summary":" Merging parameters of multiple models has resurfaced as an effective strategy\nto enhance task performance and robustness, but prior work is limited by the\nhigh costs of ensemble creation and inference. In this paper, we leverage the\nabundance of freely accessible trained models to introduce a cost-free approach\nto model merging. It focuses on a layer-wise integration of merged models,\naiming to maintain the distinctiveness of the task-specific final layers while\nunifying the initial layers, which are primarily associated with feature\nextraction. This approach ensures parameter consistency across all layers,\nessential for boosting performance. Moreover, it facilitates seamless\nintegration of knowledge, enabling effective merging of models from different\ndatasets and tasks. Specifically, we investigate its applicability in\nUnsupervised Domain Adaptation (UDA), an unexplored area for model merging, for\nSemantic and Panoptic Segmentation. Experimental results demonstrate\nsubstantial UDA improvements without additional costs for merging\nsame-architecture models from distinct datasets ($\\uparrow 2.6\\%$ mIoU) and\ndifferent-architecture models with a shared backbone ($\\uparrow 6.8\\%$ mIoU).\nFurthermore, merging Semantic and Panoptic Segmentation models increases mPQ by\n$\\uparrow 7\\%$. These findings are validated across a wide variety of UDA\nstrategies, architectures, and datasets.\n","authors":["Roberto Alcover-Couso","Juan C. SanMiguel","Marcos Escudero-Viñolo","Jose M Martínez"],"pdf_url":"https://arxiv.org/pdf/2409.15813v1.pdf","comment":null}]},"2024-09-23T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.15576v1","updated":"2024-09-23T22:23:08Z","published":"2024-09-23T22:23:08Z","title":"Optimizing News Text Classification with Bi-LSTM and Attention Mechanism\n for Efficient Data Processing","summary":" The development of Internet technology has led to a rapid increase in news\ninformation. Filtering out valuable content from complex information has become\nan urgentproblem that needs to be solved. In view of the shortcomings of\ntraditional manual classification methods that are time-consuming and\ninefficient, this paper proposes an automaticclassification scheme for news\ntexts based on deep learning. This solution achieves efficient classification\nand management of news texts by introducing advanced machine learning\nalgorithms, especially an optimization model that combines Bi-directional Long\nShort-Term Memory Network (Bi-LSTM) and Attention Mechanism. Experimental\nresults show that this solution can not only significantly improve the accuracy\nand timeliness of classification, but also significantly reduce the need for\nmanual intervention. It has important practical significance for improving the\ninformation processing capabilities of the news industry and accelerating the\nspeed of information flow. Through comparative analysis of multiple common\nmodels, the effectiveness and advancement of the proposed method are proved,\nlaying a solid foundation for future news text classification research.\n","authors":["Bingyao Liu","Jiajing Chen","Rui Wang","Junming Huang","Yuanshuai Luo","Jianjun Wei"],"pdf_url":"https://arxiv.org/pdf/2409.15576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15568v1","updated":"2024-09-23T21:50:35Z","published":"2024-09-23T21:50:35Z","title":"Cross-Domain Latent Factors Sharing via Implicit Matrix Factorization","summary":" Data sparsity has been one of the long-standing problems for recommender\nsystems. One of the solutions to mitigate this issue is to exploit knowledge\navailable in other source domains. However, many cross-domain recommender\nsystems introduce a complex architecture that makes them less scalable in\npractice. On the other hand, matrix factorization methods are still considered\nto be strong baselines for single-domain recommendations. In this paper, we\nintroduce the CDIMF, a model that extends the standard implicit matrix\nfactorization with ALS to cross-domain scenarios. We apply the Alternating\nDirection Method of Multipliers to learn shared latent factors for overlapped\nusers while factorizing the interaction matrix. In a dual-domain setting,\nexperiments on industrial datasets demonstrate a competing performance of CDIMF\nfor both cold-start and warm-start. The proposed model can outperform most\nother recent cross-domain and single-domain models. We also provide the code to\nreproduce experiments on GitHub.\n","authors":["Abdulaziz Samra","Evgeney Frolov","Alexey Vasilev","Alexander Grigorievskiy","Anton Vakhrushev"],"pdf_url":"https://arxiv.org/pdf/2409.15568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15558v1","updated":"2024-09-23T21:29:03Z","published":"2024-09-23T21:29:03Z","title":"Stalactite: Toolbox for Fast Prototyping of Vertical Federated Learning\n Systems","summary":" Machine learning (ML) models trained on datasets owned by different\norganizations and physically located in remote databases offer benefits in many\nreal-world use cases. State regulations or business requirements often prevent\ndata transfer to a central location, making it difficult to utilize standard\nmachine learning algorithms. Federated Learning (FL) is a technique that\nenables models to learn from distributed datasets without revealing the\noriginal data. Vertical Federated learning (VFL) is a type of FL where data\nsamples are divided by features across several data owners. For instance, in a\nrecommendation task, a user can interact with various sets of items, and the\nlogs of these interactions are stored by different organizations. In this demo\npaper, we present \\emph{Stalactite} - an open-source framework for VFL that\nprovides the necessary functionality for building prototypes of VFL systems. It\nhas several advantages over the existing frameworks. In particular, it allows\nresearchers to focus on the algorithmic side rather than engineering and to\neasily deploy learning in a distributed environment. It implements several VFL\nalgorithms and has a built-in homomorphic encryption layer. We demonstrate its\nuse on a real-world recommendation datasets.\n","authors":["Anastasiia Zakharova","Dmitriy Alexandrov","Maria Khodorchenko","Nikolay Butakov","Alexey Vasilev","Maxim Savchenko","Alexander Grigorievskiy"],"pdf_url":"https://arxiv.org/pdf/2409.15558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15260v1","updated":"2024-09-23T17:56:08Z","published":"2024-09-23T17:56:08Z","title":"Generative AI Is Not Ready for Clinical Use in Patient Education for\n Lower Back Pain Patients, Even With Retrieval-Augmented Generation","summary":" Low back pain (LBP) is a leading cause of disability globally. Following the\nonset of LBP and subsequent treatment, adequate patient education is crucial\nfor improving functionality and long-term outcomes. Despite advancements in\npatient education strategies, significant gaps persist in delivering\npersonalized, evidence-based information to patients with LBP. Recent\nadvancements in large language models (LLMs) and generative artificial\nintelligence (GenAI) have demonstrated the potential to enhance patient\neducation. However, their application and efficacy in delivering educational\ncontent to patients with LBP remain underexplored and warrant further\ninvestigation. In this study, we introduce a novel approach utilizing LLMs with\nRetrieval-Augmented Generation (RAG) and few-shot learning to generate tailored\neducational materials for patients with LBP. Physical therapists manually\nevaluated our model responses for redundancy, accuracy, and completeness using\na Likert scale. In addition, the readability of the generated education\nmaterials is assessed using the Flesch Reading Ease score. The findings\ndemonstrate that RAG-based LLMs outperform traditional LLMs, providing more\naccurate, complete, and readable patient education materials with less\nredundancy. Having said that, our analysis reveals that the generated materials\nare not yet ready for use in clinical practice. This study underscores the\npotential of AI-driven models utilizing RAG to improve patient education for\nLBP; however, significant challenges remain in ensuring the clinical relevance\nand granularity of content generated by these models.\n","authors":["Yi-Fei Zhao","Allyn Bove","David Thompson","James Hill","Yi Xu","Yufan Ren","Andrea Hassman","Leming Zhou","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04548v2","updated":"2024-09-23T17:38:54Z","published":"2024-06-06T23:09:54Z","title":"GNNAnatomy: Systematic Generation and Evaluation of Multi-Level\n Explanations for Graph Neural Networks","summary":" Graph Neural Networks (GNNs) excel in machine learning tasks involving\ngraphs, such as node classification, graph classification, and link prediction.\nHowever, explaining their decision-making process is challenging due to the\ncomplex transformations GNNs perform by aggregating relational information from\ngraph topology. Existing methods for explaining GNNs face key limitations: (1)\nlack of flexibility in generating explanations at varying levels, (2)\ndifficulty in identifying unique substructures relevant to class\ndifferentiation, and (3) little support to ensure the trustworthiness of\nexplanations. To address these challenges, we introduce GNNAnatomy, a visual\nanalytics system designed to generate and evaluate multi-level GNN explanations\nfor graph classification tasks. GNNAnatomy uses graphlets, primitive graph\nsubstructures, to identify the most critical substructures in a graph class by\nanalyzing the correlation between GNN predictions and graphlet frequencies.\nThese correlations are presented interactively for user-selected group of\ngraphs through our visual analytics system. To further validate top-ranked\ngraphlets, we measure the change in classification confidence after removing\neach graphlet from the original graph. We demonstrate the effectiveness of\nGNNAnatomy through case studies on synthetic and real-world graph datasets from\nsociology and biology domains. Additionally, we compare GNNAnatomy with\nstate-of-the-art explainable GNN methods to showcase its utility and\nversatility.\n","authors":["Hsiao-Ying Lu","Yiran Li","Ujwal Pratap Krishna Kaluvakolanu Thyagarajan","Kwan-Liu Ma"],"pdf_url":"https://arxiv.org/pdf/2406.04548v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15163v1","updated":"2024-09-23T16:16:08Z","published":"2024-09-23T16:16:08Z","title":"Lessons Learned on Information Retrieval in Electronic Health Records: A\n Comparison of Embedding Models and Pooling Strategies","summary":" Objective: Applying large language models (LLMs) to the clinical domain is\nchallenging due to the context-heavy nature of processing medical records.\nRetrieval-augmented generation (RAG) offers a solution by facilitating\nreasoning over large text sources. However, there are many parameters to\noptimize in just the retrieval system alone. This paper presents an ablation\nstudy exploring how different embedding models and pooling methods affect\ninformation retrieval for the clinical domain.\n Methods: Evaluating on three retrieval tasks on two electronic health record\n(EHR) data sources, we compared seven models, including medical- and\ngeneral-domain models, specialized encoder embedding models, and off-the-shelf\ndecoder LLMs. We also examine the choice of embedding pooling strategy for each\nmodel, independently on the query and the text to retrieve.\n Results: We found that the choice of embedding model significantly impacts\nretrieval performance, with BGE, a comparatively small general-domain model,\nconsistently outperforming all others, including medical-specific models.\nHowever, our findings also revealed substantial variability across datasets and\nquery text phrasings. We also determined the best pooling methods for each of\nthese models to guide future design of retrieval systems.\n Discussion: The choice of embedding model, pooling strategy, and query\nformulation can significantly impact retrieval performance and the performance\nof these models on other public benchmarks does not necessarily transfer to new\ndomains. Further studies such as this one are vital for guiding\nempirically-grounded development of retrieval frameworks, such as in the\ncontext of RAG, for the clinical domain.\n","authors":["Skatje Myers","Timothy A. Miller","Yanjun Gao","Matthew M. Churpek","Anoop Mayampurath","Dmitriy Dligach","Majid Afshar"],"pdf_url":"https://arxiv.org/pdf/2409.15163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15133v1","updated":"2024-09-23T15:38:12Z","published":"2024-09-23T15:38:12Z","title":"Don't Use LLMs to Make Relevance Judgments","summary":" Making the relevance judgments for a TREC-style test collection can be\ncomplex and expensive. A typical TREC track usually involves a team of six\ncontractors working for 2-4 weeks. Those contractors need to be trained and\nmonitored. Software has to be written to support recording relevance judgments\ncorrectly and efficiently. The recent advent of large language models that\nproduce astoundingly human-like flowing text output in response to a natural\nlanguage prompt has inspired IR researchers to wonder how those models might be\nused in the relevance judgment collection process. At the ACM SIGIR 2024\nconference, a workshop ``LLM4Eval'' provided a venue for this work, and\nfeatured a data challenge activity where participants reproduced TREC deep\nlearning track judgments, as was done by Thomas et al (arXiv:2408.08896,\narXiv:2309.10621). I was asked to give a keynote at the workshop, and this\npaper presents that keynote in article form. The bottom-line-up-front message\nis, don't use LLMs to create relevance judgments for TREC-style evaluations.\n","authors":["Ian Soboroff"],"pdf_url":"https://arxiv.org/pdf/2409.15133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15060v1","updated":"2024-09-23T14:35:06Z","published":"2024-09-23T14:35:06Z","title":"EMERS: Energy Meter for Recommender Systems","summary":" Due to recent advancements in machine learning, recommender systems use\nincreasingly more energy for training, evaluation, and deployment. However, the\nrecommender systems community often does not report the energy consumption of\ntheir experiments. In today's research landscape, no tools exist to easily\nmeasure the energy consumption of recommender systems experiments. To bridge\nthis gap, we introduce EMERS, the first software library that simplifies\nmeasuring, monitoring, recording, and sharing the energy consumption of\nrecommender systems experiments. EMERS measures energy consumption with smart\npower plugs and offers a user interface to monitor and compare the energy\nconsumption of recommender systems experiments. Thereby, EMERS improves\nsustainability awareness and simplifies self-reporting energy consumption for\nrecommender systems practitioners and researchers.\n","authors":["Lukas Wegmeth","Tobias Vente","Alan Said","Joeran Beel"],"pdf_url":"https://arxiv.org/pdf/2409.15060v1.pdf","comment":"Accepted at the RecSoGood 2024 Workshop co-located with the 18th ACM\n Conference on Recommender Systems"},{"id":"http://arxiv.org/abs/2409.15004v1","updated":"2024-09-23T13:28:06Z","published":"2024-09-23T13:28:06Z","title":"ViBERTgrid BiLSTM-CRF: Multimodal Key Information Extraction from\n Unstructured Financial Documents","summary":" Multimodal key information extraction (KIE) models have been studied\nextensively on semi-structured documents. However, their investigation on\nunstructured documents is an emerging research topic. The paper presents an\napproach to adapt a multimodal transformer (i.e., ViBERTgrid previously\nexplored on semi-structured documents) for unstructured financial documents, by\nincorporating a BiLSTM-CRF layer. The proposed ViBERTgrid BiLSTM-CRF model\ndemonstrates a significant improvement in performance (up to 2 percentage\npoints) on named entity recognition from unstructured documents in financial\ndomain, while maintaining its KIE performance on semi-structured documents. As\nan additional contribution, we publicly released token-level annotations for\nthe SROIE dataset in order to pave the way for its use in multimodal sequence\nlabeling models.\n","authors":["Furkan Pala","Mehmet Yasin Akpınar","Onur Deniz","Gülşen Eryiğit"],"pdf_url":"https://arxiv.org/pdf/2409.15004v1.pdf","comment":"Accepted in MIDAS (The 8th Workshop on MIning DAta for financial\n applicationS) workshop of ECML PKDD 2023 conference"},{"id":"http://arxiv.org/abs/2409.14945v1","updated":"2024-09-23T12:02:23Z","published":"2024-09-23T12:02:23Z","title":"Adaptive Learning on User Segmentation: Universal to Specific\n Representation via Bipartite Neural Interaction","summary":" Recently, models for user representation learning have been widely applied in\nclick-through-rate (CTR) and conversion-rate (CVR) prediction. Usually, the\nmodel learns a universal user representation as the input for subsequent\nscenario-specific models. However, in numerous industrial applications (e.g.,\nrecommendation and marketing), the business always operates such applications\nas various online activities among different user segmentation. These\nsegmentation are always created by domain experts. Due to the difference in\nuser distribution (i.e., user segmentation) and business objectives in\nsubsequent tasks, learning solely on universal representation may lead to\ndetrimental effects on both model performance and robustness. In this paper, we\npropose a novel learning framework that can first learn general universal user\nrepresentation through information bottleneck. Then, merge and learn a\nsegmentation-specific or a task-specific representation through neural\ninteraction. We design the interactive learning process by leveraging a\nbipartite graph architecture to model the representation learning and merging\nbetween contextual clusters and each user segmentation. Our proposed method is\nevaluated in two open-source benchmarks, two offline business datasets, and\ndeployed on two online marketing applications to predict users' CVR. The\nresults demonstrate that our method can achieve superior performance and\nsurpass the baseline methods.\n","authors":["Xiaoyu Tan","Yongxin Deng","Chao Qu","Siqiao Xue","Xiaoming Shi","James Zhang","Xihe Qiu"],"pdf_url":"https://arxiv.org/pdf/2409.14945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14872v1","updated":"2024-09-23T10:10:24Z","published":"2024-09-23T10:10:24Z","title":"FedSlate:A Federated Deep Reinforcement Learning Recommender System","summary":" Reinforcement learning methods have been used to optimize long-term user\nengagement in recommendation systems. However, existing reinforcement\nlearning-based recommendation systems do not fully exploit the relevance of\nindividual user behavior across different platforms. One potential solution is\nto aggregate data from various platforms in a centralized location and use the\naggregated data for training. However, this approach raises economic and legal\nconcerns, including increased communication costs and potential threats to user\nprivacy. To address these challenges, we propose \\textbf{FedSlate}, a federated\nreinforcement learning recommendation algorithm that effectively utilizes\ninformation that is prohibited from being shared at a legal level. We employ\nthe SlateQ algorithm to assist FedSlate in learning users' long-term behavior\nand evaluating the value of recommended content. We extend the existing\napplication scope of recommendation systems from single-user single-platform to\nsingle-user multi-platform and address cross-platform learning challenges by\nintroducing federated learning. We use RecSim to construct a simulation\nenvironment for evaluating FedSlate and compare its performance with\nstate-of-the-art benchmark recommendation models. Experimental results\ndemonstrate the superior effects of FedSlate over baseline methods in various\nenvironmental settings, and FedSlate facilitates the learning of recommendation\nstrategies in scenarios where baseline methods are completely inapplicable.\nCode is available at \\textit{https://github.com/TianYaDY/FedSlate}.\n","authors":["Yongxin Deng","Xiaoyu Tan","Xihe Qiu","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2409.14872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14810v1","updated":"2024-09-23T08:39:07Z","published":"2024-09-23T08:39:07Z","title":"Pre-trained Language Model and Knowledge Distillation for Lightweight\n Sequential Recommendation","summary":" Sequential recommendation models user interests based on historical behaviors\nto provide personalized recommendation. Previous sequential recommendation\nalgorithms primarily employ neural networks to extract features of user\ninterests, achieving good performance. However, due to the recommendation\nsystem datasets sparsity, these algorithms often employ small-scale network\nframeworks, resulting in weaker generalization capability. Recently, a series\nof sequential recommendation algorithms based on large pre-trained language\nmodels have been proposed. Nonetheless, given the real-time demands of\nrecommendation systems, the challenge remains in applying pre-trained language\nmodels for rapid recommendations in real scenarios. To address this, we propose\na sequential recommendation algorithm based on a pre-trained language model and\nknowledge distillation. The key of proposed algorithm is to transfer\npre-trained knowledge across domains and achieve lightweight inference by\nknowledge distillation. The algorithm operates in two stages: in the first\nstage, we fine-tune the pre-trained language model on the recommendation\ndataset to transfer the pre-trained knowledge to the recommendation task; in\nthe second stage, we distill the trained language model to transfer the learned\nknowledge to a lightweight model. Extensive experiments on multiple public\nrecommendation datasets show that the proposed algorithm enhances\nrecommendation accuracy and provide timely recommendation services.\n","authors":["Li Li","Mingyue Cheng","Zhiding Liu","Hao Zhang","Qi Liu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2409.14810v1.pdf","comment":"in Chinese language"},{"id":"http://arxiv.org/abs/2409.14689v1","updated":"2024-09-23T03:23:20Z","published":"2024-09-23T03:23:20Z","title":"EDGE-Rec: Efficient and Data-Guided Edge Diffusion For Recommender\n Systems Graphs","summary":" Most recommender systems research focuses on binary historical user-item\ninteraction encodings to predict future interactions. User features, item\nfeatures, and interaction strengths remain largely under-utilized in this space\nor only indirectly utilized, despite proving largely effective in large-scale\nproduction recommendation systems. We propose a new attention mechanism,\nloosely based on the principles of collaborative filtering, called Row-Column\nSeparable Attention RCSA to take advantage of real-valued interaction weights\nas well as user and item features directly. Building on this mechanism, we\nadditionally propose a novel Graph Diffusion Transformer GDiT architecture\nwhich is trained to iteratively denoise the weighted interaction matrix of the\nuser-item interaction graph directly. The weighted interaction matrix is built\nfrom the bipartite structure of the user-item interaction graph and\ncorresponding edge weights derived from user-item rating interactions. Inspired\nby the recent progress in text-conditioned image generation, our method\ndirectly produces user-item rating predictions on the same scale as the\noriginal ratings by conditioning the denoising process on user and item\nfeatures with a principled approach.\n","authors":["Utkarsh Priyam","Hemit Shah","Edoardo Botta"],"pdf_url":"https://arxiv.org/pdf/2409.14689v1.pdf","comment":"6 pages, 13 figures"},{"id":"http://arxiv.org/abs/2409.14683v1","updated":"2024-09-23T03:12:43Z","published":"2024-09-23T03:12:43Z","title":"Reducing the Footprint of Multi-Vector Retrieval with Minimal\n Performance Impact via Token Pooling","summary":" Over the last few years, multi-vector retrieval methods, spearheaded by\nColBERT, have become an increasingly popular approach to Neural IR. By storing\nrepresentations at the token level rather than at the document level, these\nmethods have demonstrated very strong retrieval performance, especially in\nout-of-domain settings. However, the storage and memory requirements necessary\nto store the large number of associated vectors remain an important drawback,\nhindering practical adoption. In this paper, we introduce a simple\nclustering-based token pooling approach to aggressively reduce the number of\nvectors that need to be stored. This method can reduce the space & memory\nfootprint of ColBERT indexes by 50% with virtually no retrieval performance\ndegradation. This method also allows for further reductions, reducing the\nvector count by 66%-to-75% , with degradation remaining below 5% on a vast\nmajority of datasets. Importantly, this approach requires no architectural\nchange nor query-time processing, and can be used as a simple drop-in during\nindexation with any ColBERT-like model.\n","authors":["Benjamin Clavié","Antoine Chaffin","Griffin Adams"],"pdf_url":"https://arxiv.org/pdf/2409.14683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14682v1","updated":"2024-09-23T03:12:33Z","published":"2024-09-23T03:12:33Z","title":"Robust Training Objectives Improve Embedding-based Retrieval in\n Industrial Recommendation Systems","summary":" Improving recommendation systems (RS) can greatly enhance the user experience\nacross many domains, such as social media. Many RS utilize embedding-based\nretrieval (EBR) approaches to retrieve candidates for recommendation. In an EBR\nsystem, the embedding quality is key. According to recent literature,\nself-supervised multitask learning (SSMTL) has showed strong performance on\nacademic benchmarks in embedding learning and resulted in an overall\nimprovement in multiple downstream tasks, demonstrating a larger resilience to\nthe adverse conditions between each downstream task and thereby increased\nrobustness and task generalization ability through the training objective.\nHowever, whether or not the success of SSMTL in academia as a robust training\nobjectives translates to large-scale (i.e., over hundreds of million users and\ninteractions in-between) industrial RS still requires verification. Simply\nadopting academic setups in industrial RS might entail two issues. Firstly,\nmany self-supervised objectives require data augmentations (e.g., embedding\nmasking/corruption) over a large portion of users and items, which is\nprohibitively expensive in industrial RS. Furthermore, some self-supervised\nobjectives might not align with the recommendation task, which might lead to\nredundant computational overheads or negative transfer. In light of these two\nchallenges, we evaluate using a robust training objective, specifically SSMTL,\nthrough a large-scale friend recommendation system on a social media platform\nin the tech sector, identifying whether this increase in robustness can work at\nscale in enhancing retrieval in the production setting. Through online A/B\ntesting with SSMTL-based EBR, we observe statistically significant increases in\nkey metrics in the friend recommendations, with up to 5.45% improvements in new\nfriends made and 1.91% improvements in new friends made with cold-start users.\n","authors":["Matthew Kolodner","Mingxuan Ju","Zihao Fan","Tong Zhao","Elham Ghazizadeh","Yan Wu","Neil Shah","Yozen Liu"],"pdf_url":"https://arxiv.org/pdf/2409.14682v1.pdf","comment":"RobustRecSys workshop @ RecSys 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.15551v1","updated":"2024-09-23T21:07:06Z","published":"2024-09-23T21:07:06Z","title":"Revise, Reason, and Recognize: LLM-Based Emotion Recognition via\n Emotion-Specific Prompts and ASR Error Correction","summary":" Annotating and recognizing speech emotion using prompt engineering has\nrecently emerged with the advancement of Large Language Models (LLMs), yet its\nefficacy and reliability remain questionable. In this paper, we conduct a\nsystematic study on this topic, beginning with the proposal of novel prompts\nthat incorporate emotion-specific knowledge from acoustics, linguistics, and\npsychology. Subsequently, we examine the effectiveness of LLM-based prompting\non Automatic Speech Recognition (ASR) transcription, contrasting it with\nground-truth transcription. Furthermore, we propose a Revise-Reason-Recognize\nprompting pipeline for robust LLM-based emotion recognition from spoken\nlanguage with ASR errors. Additionally, experiments on context-aware learning,\nin-context learning, and instruction tuning are performed to examine the\nusefulness of LLM training schemes in this direction. Finally, we investigate\nthe sensitivity of LLMs to minor prompt variations. Experimental results\ndemonstrate the efficacy of the emotion-specific prompts, ASR error correction,\nand LLM training schemes for LLM-based emotion recognition. Our study aims to\nrefine the use of LLMs in emotion recognition and related domains.\n","authors":["Yuanchao Li","Yuan Gong","Chao-Han Huck Yang","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2409.15551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15545v1","updated":"2024-09-23T20:59:15Z","published":"2024-09-23T20:59:15Z","title":"Rethinking Emotion Bias in Music via Frechet Audio Distance","summary":" The subjective nature of music emotion introduces inherent bias in both\nrecognition and generation, especially when relying on a single audio encoder,\nemotion classifier, or evaluation metric. In this work, we conduct a study on\nMusic Emotion Recognition (MER) and Emotional Music Generation (EMG), employing\ndiverse audio encoders alongside the Frechet Audio Distance (FAD), a\nreference-free evaluation metric. Our study begins with a benchmark evaluation\nof MER, highlighting the limitations associated with using a single audio\nencoder and the disparities observed across different measurements. We then\npropose assessing MER performance using FAD from multiple encoders to provide a\nmore objective measure of music emotion. Furthermore, we introduce an enhanced\nEMG approach designed to improve both the variation and prominence of generated\nmusic emotion, thus enhancing realism. Additionally, we investigate the realism\ndisparities between the emotions conveyed in real and synthetic music,\ncomparing our EMG model against two baseline models. Experimental results\nunderscore the emotion bias problem in both MER and EMG and demonstrate the\npotential of using FAD and diverse audio encoders to evaluate music emotion\nobjectively.\n","authors":["Yuanchao Li","Azalea Gui","Dimitra Emmanouilidou","Hannes Gamper"],"pdf_url":"https://arxiv.org/pdf/2409.15545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15157v1","updated":"2024-09-23T16:04:50Z","published":"2024-09-23T16:04:50Z","title":"LoVA: Long-form Video-to-Audio Generation","summary":" Video-to-audio (V2A) generation is important for video editing and\npost-processing, enabling the creation of semantics-aligned audio for silent\nvideo. However, most existing methods focus on generating short-form audio for\nshort video segment (less than 10 seconds), while giving little attention to\nthe scenario of long-form video inputs. For current UNet-based diffusion V2A\nmodels, an inevitable problem when handling long-form audio generation is the\ninconsistencies within the final concatenated audio. In this paper, we first\nhighlight the importance of long-form V2A problem. Besides, we propose LoVA, a\nnovel model for Long-form Video-to-Audio generation. Based on the Diffusion\nTransformer (DiT) architecture, LoVA proves to be more effective at generating\nlong-form audio compared to existing autoregressive models and UNet-based\ndiffusion models. Extensive objective and subjective experiments demonstrate\nthat LoVA achieves comparable performance on 10-second V2A benchmark and\noutperforms all other baselines on a benchmark with long-form video input.\n","authors":["Xin Cheng","Xihua Wang","Yihan Wu","Yuyue Wang","Ruihua Song"],"pdf_url":"https://arxiv.org/pdf/2409.15157v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.14925v1","updated":"2024-09-23T11:20:44Z","published":"2024-09-23T11:20:44Z","title":"DanceCamAnimator: Keyframe-Based Controllable 3D Dance Camera Synthesis","summary":" Synthesizing camera movements from music and dance is highly challenging due\nto the contradicting requirements and complexities of dance cinematography.\nUnlike human movements, which are always continuous, dance camera movements\ninvolve both continuous sequences of variable lengths and sudden drastic\nchanges to simulate the switching of multiple cameras. However, in previous\nworks, every camera frame is equally treated and this causes jittering and\nunavoidable smoothing in post-processing. To solve these problems, we propose\nto integrate animator dance cinematography knowledge by formulating this task\nas a three-stage process: keyframe detection, keyframe synthesis, and tween\nfunction prediction. Following this formulation, we design a novel end-to-end\ndance camera synthesis framework \\textbf{DanceCamAnimator}, which imitates\nhuman animation procedures and shows powerful keyframe-based controllability\nwith variable lengths. Extensive experiments on the DCM dataset demonstrate\nthat our method surpasses previous baselines quantitatively and qualitatively.\nCode will be available at\n\\url{https://github.com/Carmenw1203/DanceCamAnimator-Official}.\n","authors":["Zixuan Wang","Jiayi Li","Xiaoyu Qin","Shikun Sun","Songtao Zhou","Jia Jia","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2409.14925v1.pdf","comment":"Accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2409.14829v1","updated":"2024-09-23T08:59:55Z","published":"2024-09-23T08:59:55Z","title":"RoWSFormer: A Robust Watermarking Framework with Swin Transformer for\n Enhanced Geometric Attack Resilience","summary":" In recent years, digital watermarking techniques based on deep learning have\nbeen widely studied. To achieve both imperceptibility and robustness of image\nwatermarks, most current methods employ convolutional neural networks to build\nrobust watermarking frameworks. However, despite the success of CNN-based\nwatermarking models, they struggle to achieve robustness against geometric\nattacks due to the limitations of convolutional neural networks in capturing\nglobal and long-range relationships. To address this limitation, we propose a\nrobust watermarking framework based on the Swin Transformer, named RoWSFormer.\nSpecifically, we design the Locally-Channel Enhanced Swin Transformer Block as\nthe core of both the encoder and decoder. This block utilizes the\nself-attention mechanism to capture global and long-range information, thereby\nsignificantly improving adaptation to geometric distortions. Additionally, we\nconstruct the Frequency-Enhanced Transformer Block to extract frequency domain\ninformation, which further strengthens the robustness of the watermarking\nframework. Experimental results demonstrate that our RoWSFormer surpasses\nexisting state-of-the-art watermarking methods. For most non-geometric attacks,\nRoWSFormer improves the PSNR by 3 dB while maintaining the same extraction\naccuracy. In the case of geometric attacks (such as rotation, scaling, and\naffine transformations), RoWSFormer achieves over a 6 dB improvement in PSNR,\nwith extraction accuracy exceeding 97\\%.\n","authors":["Weitong Chen","Yuheng Li"],"pdf_url":"https://arxiv.org/pdf/2409.14829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14827v1","updated":"2024-09-23T08:59:22Z","published":"2024-09-23T08:59:22Z","title":"AIM 2024 Challenge on Video Saliency Prediction: Methods and Results","summary":" This paper reviews the Challenge on Video Saliency Prediction at AIM 2024.\nThe goal of the participants was to develop a method for predicting accurate\nsaliency maps for the provided set of video sequences. Saliency maps are widely\nexploited in various applications, including video compression, quality\nassessment, visual perception studies, the advertising industry, etc. For this\ncompetition, a previously unused large-scale audio-visual mouse saliency\n(AViMoS) dataset of 1500 videos with more than 70 observers per video was\ncollected using crowdsourced mouse tracking. The dataset collection methodology\nhas been validated using conventional eye-tracking data and has shown high\nconsistency. Over 30 teams registered in the challenge, and there are 7 teams\nthat submitted the results in the final phase. The final phase solutions were\ntested and ranked by commonly used quality metrics on a private test subset.\nThe results of this evaluation and the descriptions of the solutions are\npresented in this report. All data, including the private test subset, is made\npublicly available on the challenge homepage -\nhttps://challenges.videoprocessing.ai/challenges/video-saliency-prediction.html.\n","authors":["Andrey Moskalenko","Alexey Bryncev","Dmitry Vatolin","Radu Timofte","Gen Zhan","Li Yang","Yunlong Tang","Yiting Liao","Jiongzhi Lin","Baitao Huang","Morteza Moradi","Mohammad Moradi","Francesco Rundo","Concetto Spampinato","Ali Borji","Simone Palazzo","Yuxin Zhu","Yinan Sun","Huiyu Duan","Yuqin Cao","Ziheng Jia","Qiang Hu","Xiongkuo Min","Guangtao Zhai","Hao Fang","Runmin Cong","Xiankai Lu","Xiaofei Zhou","Wei Zhang","Chunyu Zhao","Wentao Mu","Tao Deng","Hamed R. Tavakoli"],"pdf_url":"https://arxiv.org/pdf/2409.14827v1.pdf","comment":"ECCVW 2024"},{"id":"http://arxiv.org/abs/2311.05920v3","updated":"2024-09-23T05:50:41Z","published":"2023-11-10T08:09:42Z","title":"Feeding the Crave: How People with Eating Disorders Get Trapped in the\n Perpetual Cycle of Digital Food Content","summary":" Recent studies have examined how digital food content impacts viewers'\ndietary health. A few have found that individuals with eating disorders are\nparticularly sensitive to digital food content, such as eating and cooking\nvideos, which contribute to disordered eating behaviors. However, there is a\nlack of comprehensive studies that investigate how these individuals interact\nwith various digital food content. To fill this gap, we conducted two rounds of\nstudies (N=23 and 22, respectively) with individuals with eating disorders to\nunderstand their motivations and practices of consuming digital food content.\nOur study reveals that participants anticipate positive effects from food media\nto overcome their condition, but in practice, it often exacerbates their\ndisorder. We also discovered that many participants experienced a cycle of\nquitting and returning to digital food content consumption. Based on these\nfindings, we articulate design implications for digital food content and\nmultimedia platforms to support vulnerable individuals.\n","authors":["Ryuhaerang Choi","Subin Park","Sujin Han","Sung-Ju Lee"],"pdf_url":"https://arxiv.org/pdf/2311.05920v3.pdf","comment":"25 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.16132v2","updated":"2024-09-23T05:08:20Z","published":"2024-08-28T20:48:04Z","title":"SVDD 2024: The Inaugural Singing Voice Deepfake Detection Challenge","summary":" With the advancements in singing voice generation and the growing presence of\nAI singers on media platforms, the inaugural Singing Voice Deepfake Detection\n(SVDD) Challenge aims to advance research in identifying AI-generated singing\nvoices from authentic singers. This challenge features two tracks: a controlled\nsetting track (CtrSVDD) and an in-the-wild scenario track (WildSVDD). The\nCtrSVDD track utilizes publicly available singing vocal data to generate\ndeepfakes using state-of-the-art singing voice synthesis and conversion\nsystems. Meanwhile, the WildSVDD track expands upon the existing SingFake\ndataset, which includes data sourced from popular user-generated content\nwebsites. For the CtrSVDD track, we received submissions from 47 teams, with 37\nsurpassing our baselines and the top team achieving a 1.65% equal error rate.\nFor the WildSVDD track, we benchmarked the baselines. This paper reviews these\nresults, discusses key findings, and outlines future directions for SVDD\nresearch.\n","authors":["You Zhang","Yongyi Zang","Jiatong Shi","Ryuichi Yamamoto","Tomoki Toda","Zhiyao Duan"],"pdf_url":"https://arxiv.org/pdf/2408.16132v2.pdf","comment":"6 pages, Accepted by 2024 IEEE Spoken Language Technology Workshop\n (SLT 2024)"},{"id":"http://arxiv.org/abs/2409.14708v1","updated":"2024-09-23T05:01:43Z","published":"2024-09-23T05:01:43Z","title":"A Multimedia Framework for Continuum Robots: Systematic, Computational,\n and Control Perspectives","summary":" Continuum robots, which often rely on interdisciplinary and multimedia\ncollaborations, have been increasingly recognized for their potential to\nrevolutionize the field of human-robot interaction (HRI) in varied applications\ndue to their adaptive, responsive, and flexible characteristics. Despite their\npromises, the lack of an integrated framework poses significant challenges for\nboth users and developers, resulting in inefficiency and complexity during\npreliminary developments. Thus, this paper introduces a unified framework for\nbionic robotics that addresses these challenges by integrating system\narchitecture, dynamics computation, and control strategy. The proposed method\nallows for efficient modeling and quick preview of the results in both digital\nand physical environments, which can enhance the quality of robot developments.\n","authors":["Po-Yu Hsieh","June-Hao Hou"],"pdf_url":"https://arxiv.org/pdf/2409.14708v1.pdf","comment":"7 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.14703v1","updated":"2024-09-23T04:49:08Z","published":"2024-09-23T04:49:08Z","title":"MemeCLIP: Leveraging CLIP Representations for Multimodal Meme\n Classification","summary":" The complexity of text-embedded images presents a formidable challenge in\nmachine learning given the need for multimodal understanding of the multiple\naspects of expression conveyed in them. While previous research in multimodal\nanalysis has primarily focused on singular aspects such as hate speech and its\nsubclasses, our study expands the focus to encompass multiple aspects of\nlinguistics: hate, target, stance, and humor detection. We introduce a novel\ndataset PrideMM comprising text-embedded images associated with the LGBTQ+\nPride movement, thereby addressing a serious gap in existing resources. We\nconduct extensive experimentation on PrideMM by using unimodal and multimodal\nbaseline methods to establish benchmarks for each task. Additionally, we\npropose a novel framework MemeCLIP for efficient downstream learning while\npreserving the knowledge of the pre-trained CLIP model. The results of our\nexperiments show that MemeCLIP achieves superior performance compared to\npreviously proposed frameworks on two real-world datasets. We further compare\nthe performance of MemeCLIP and zero-shot GPT-4 on the hate classification\ntask. Finally, we discuss the shortcomings of our model by qualitatively\nanalyzing misclassified samples. Our code and dataset are publicly available\nat: https://github.com/SiddhantBikram/MemeCLIP.\n","authors":["Siddhant Bikram Shah","Shuvam Shiwakoti","Maheep Chaudhary","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.14703v1.pdf","comment":"Accepted to EMNLP 2024 (Main)"}]},"2024-09-22T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.14609v1","updated":"2024-09-22T22:09:35Z","published":"2024-09-22T22:09:35Z","title":"Nirjas: An open source framework for extracting metadata from the source\n code","summary":" Metadata and comments are critical elements of any software development\nprocess. In this paper, we explain how metadata and comments in source code can\nplay an essential role in comprehending software. We introduce a Python-based\nopen-source framework, Nirjas, which helps in extracting this metadata in a\nstructured manner. Various syntaxes, types, and widely accepted conventions\nexist for adding comments in source files of different programming languages.\nEdge cases can create noise in extraction, for which we use Regex to accurately\nretrieve metadata. Non-Regex methods can give results but often miss accuracy\nand noise separation. Nirjas also separates different types of comments, source\ncode, and provides details about those comments, such as line number, file\nname, language used, total SLOC, etc. Nirjas is a standalone Python\nframework/library and can be easily installed via source or pip (the Python\npackage installer). Nirjas was initially created as part of a Google Summer of\nCode project and is currently developed and maintained under the FOSSology\norganization.\n","authors":["Ayush Bhardwaj"," Sahil","Kaushlendra Pratap","Gaurav Mishra"],"pdf_url":"https://arxiv.org/pdf/2409.14609v1.pdf","comment":"2022 12th International Conference on Cloud Computing, Data Science &\n Engineering (Confluence)"},{"id":"http://arxiv.org/abs/2409.14516v1","updated":"2024-09-22T16:20:00Z","published":"2024-09-22T16:20:00Z","title":"Beyond Words: Evaluating Large Language Models in Transportation\n Planning","summary":" The resurgence and rapid advancement of Generative Artificial Intelligence\n(GenAI) in 2023 has catalyzed transformative shifts across numerous industry\nsectors, including urban transportation and logistics. This study investigates\nthe evaluation of Large Language Models (LLMs), specifically GPT-4 and\nPhi-3-mini, to enhance transportation planning. The study assesses the\nperformance and spatial comprehension of these models through a\ntransportation-informed evaluation framework that includes general geospatial\nskills, general transportation domain skills, and real-world transportation\nproblem-solving. Utilizing a mixed-methods approach, the research encompasses\nan evaluation of the LLMs' general Geographic Information System (GIS) skills,\ngeneral transportation domain knowledge as well as abilities to support human\ndecision-making in the real-world transportation planning scenarios of\ncongestion pricing. Results indicate that GPT-4 demonstrates superior accuracy\nand reliability across various GIS and transportation-specific tasks compared\nto Phi-3-mini, highlighting its potential as a robust tool for transportation\nplanners. Nonetheless, Phi-3-mini exhibits competence in specific analytical\nscenarios, suggesting its utility in resource-constrained environments. The\nfindings underscore the transformative potential of GenAI technologies in urban\ntransportation planning. Future work could explore the application of newer\nLLMs and the impact of Retrieval-Augmented Generation (RAG) techniques, on a\nbroader set of real-world transportation planning and operations challenges, to\ndeepen the integration of advanced AI models in transportation management\npractices.\n","authors":["Shaowei Ying","Zhenlong Li","Manzhu Yu"],"pdf_url":"https://arxiv.org/pdf/2409.14516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02377v2","updated":"2024-09-22T14:50:52Z","published":"2024-06-04T14:55:14Z","title":"XRec: Large Language Models for Explainable Recommendation","summary":" Recommender systems help users navigate information overload by providing\npersonalized recommendations aligned with their preferences. Collaborative\nFiltering (CF) is a widely adopted approach, but while advanced techniques like\ngraph neural networks (GNNs) and self-supervised learning (SSL) have enhanced\nCF models for better user representations, they often lack the ability to\nprovide explanations for the recommended items. Explainable recommendations aim\nto address this gap by offering transparency and insights into the\nrecommendation decision-making process, enhancing users' understanding. This\nwork leverages the language capabilities of Large Language Models (LLMs) to\npush the boundaries of explainable recommender systems. We introduce a\nmodel-agnostic framework called XRec, which enables LLMs to provide\ncomprehensive explanations for user behaviors in recommender systems. By\nintegrating collaborative signals and designing a lightweight collaborative\nadaptor, the framework empowers LLMs to understand complex patterns in\nuser-item interactions and gain a deeper understanding of user preferences. Our\nextensive experiments demonstrate the effectiveness of XRec, showcasing its\nability to generate comprehensive and meaningful explanations that outperform\nbaseline approaches in explainable recommender systems. We open-source our\nmodel implementation at https://github.com/HKUDS/XRec.\n","authors":["Qiyao Ma","Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2406.02377v2.pdf","comment":"Accepted to EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2407.06716v2","updated":"2024-09-22T13:08:11Z","published":"2024-07-09T09:43:42Z","title":"Analyzing the Effectiveness of Listwise Reranking with Positional\n Invariance on Temporal Generalizability","summary":" This working note outlines our participation in the retrieval task at CLEF\n2024. We highlight the considerable gap between studying retrieval performance\non static knowledge documents and understanding performance in real-world\nenvironments. Therefore, Addressing these discrepancies and measuring the\ntemporal persistence of IR systems is crucial. By investigating the LongEval\nbenchmark, specifically designed for such dynamic environments, our findings\ndemonstrate the effectiveness of a listwise reranking approach, which\nproficiently handles inaccuracies induced by temporal distribution shifts.\nAmong listwise rerankers, our findings show that ListT5, which effectively\nmitigates the positional bias problem by adopting the Fusion-in-Decoder\narchitecture, is especially effective, and more so, as temporal drift\nincreases, on the test-long subset.\n","authors":["Soyoung Yoon","Jongyoon Kim","Seung-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2407.06716v2.pdf","comment":"Accepted at CLEF 2024 LongEval track"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.14340v1","updated":"2024-09-22T06:57:33Z","published":"2024-09-22T06:57:33Z","title":"Self-Supervised Audio-Visual Soundscape Stylization","summary":" Speech sounds convey a great deal of information about the scenes, resulting\nin a variety of effects ranging from reverberation to additional ambient\nsounds. In this paper, we manipulate input speech to sound as though it was\nrecorded within a different scene, given an audio-visual conditional example\nrecorded from that scene. Our model learns through self-supervision, taking\nadvantage of the fact that natural video contains recurring sound events and\ntextures. We extract an audio clip from a video and apply speech enhancement.\nWe then train a latent diffusion model to recover the original speech, using\nanother audio-visual clip taken from elsewhere in the video as a conditional\nhint. Through this process, the model learns to transfer the conditional\nexample's sound properties to the input speech. We show that our model can be\nsuccessfully trained using unlabeled, in-the-wild videos, and that an\nadditional visual signal can improve its sound prediction abilities. Please see\nour project webpage for video results:\nhttps://tinglok.netlify.app/files/avsoundscape/\n","authors":["Tingle Li","Renhao Wang","Po-Yao Huang","Andrew Owens","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2409.14340v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.14319v1","updated":"2024-09-22T05:13:11Z","published":"2024-09-22T05:13:11Z","title":"Scene-Text Grounding for Text-Based Video Question Answering","summary":" Existing efforts in text-based video question answering (TextVideoQA) are\ncriticized for their opaque decisionmaking and heavy reliance on scene-text\nrecognition. In this paper, we propose to study Grounded TextVideoQA by forcing\nmodels to answer questions and spatio-temporally localize the relevant\nscene-text regions, thus decoupling QA from scenetext recognition and promoting\nresearch towards interpretable QA. The task has three-fold significance. First,\nit encourages scene-text evidence versus other short-cuts for answer\npredictions. Second, it directly accepts scene-text regions as visual answers,\nthus circumventing the problem of ineffective answer evaluation by stringent\nstring matching. Third, it isolates the challenges inherited in VideoQA and\nscene-text recognition. This enables the diagnosis of the root causes for\nfailure predictions, e.g., wrong QA or wrong scene-text recognition? To achieve\nGrounded TextVideoQA, we propose the T2S-QA model that highlights a\ndisentangled temporal-to-spatial contrastive learning strategy for\nweakly-supervised scene-text grounding and grounded TextVideoQA. To facilitate\nevaluation, we construct a new dataset ViTXT-GQA which features 52K scene-text\nbounding boxes within 2.2K temporal segments related to 2K questions and 729\nvideos. With ViTXT-GQA, we perform extensive experiments and demonstrate the\nsevere limitations of existing techniques in Grounded TextVideoQA. While T2S-QA\nachieves superior results, the large performance gap with human leaves ample\nspace for improvement. Our further analysis of oracle scene-text inputs posits\nthat the major challenge is scene-text recognition. To advance the research of\nGrounded TextVideoQA, our dataset and code are at\n\\url{https://github.com/zhousheng97/ViTXT-GQA.git}\n","authors":["Sheng Zhou","Junbin Xiao","Xun Yang","Peipei Song","Dan Guo","Angela Yao","Meng Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2409.14319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05100v2","updated":"2024-09-22T03:53:13Z","published":"2023-11-09T02:24:51Z","title":"Self-similarity Prior Distillation for Unsupervised Remote Physiological\n Measurement","summary":" Remote photoplethysmography (rPPG) is a noninvasive technique that aims to\ncapture subtle variations in facial pixels caused by changes in blood volume\nresulting from cardiac activities. Most existing unsupervised methods for rPPG\ntasks focus on the contrastive learning between samples while neglecting the\ninherent self-similar prior in physiological signals. In this paper, we propose\na Self-Similarity Prior Distillation (SSPD) framework for unsupervised rPPG\nestimation, which capitalizes on the intrinsic self-similarity of cardiac\nactivities. Specifically, we first introduce a physical-prior embedded\naugmentation technique to mitigate the effect of various types of noise. Then,\nwe tailor a self-similarity-aware network to extract more reliable self-similar\nphysiological features. Finally, we develop a hierarchical self-distillation\nparadigm to assist the network in disentangling self-similar physiological\npatterns from facial videos. Comprehensive experiments demonstrate that the\nunsupervised SSPD framework achieves comparable or even superior performance\ncompared to the state-of-the-art supervised methods. Meanwhile, SSPD maintains\nthe lowest inference time and computation cost among end-to-end models.\n","authors":["Xinyu Zhang","Weiyu Sun","Hao Lu","Ying Chen","Yun Ge","Xiaolin Huang","Jie Yuan","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2311.05100v2.pdf","comment":null}]},"2024-09-21T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.14217v1","updated":"2024-09-21T18:39:53Z","published":"2024-09-21T18:39:53Z","title":"Revisiting BPR: A Replicability Study of a Common Recommender System\n Baseline","summary":" Bayesian Personalized Ranking (BPR), a collaborative filtering approach based\non matrix factorization, frequently serves as a benchmark for recommender\nsystems research. However, numerous studies often overlook the nuances of BPR\nimplementation, claiming that it performs worse than newly proposed methods\nacross various tasks. In this paper, we thoroughly examine the features of the\nBPR model, indicating their impact on its performance, and investigate\nopen-source BPR implementations. Our analysis reveals inconsistencies between\nthese implementations and the original BPR paper, leading to a significant\ndecrease in performance of up to 50% for specific implementations. Furthermore,\nthrough extensive experiments on real-world datasets under modern evaluation\nsettings, we demonstrate that with proper tuning of its hyperparameters, the\nBPR model can achieve performance levels close to state-of-the-art methods on\nthe top-n recommendation tasks and even outperform them on specific datasets.\nSpecifically, on the Million Song Dataset, the BPR model with hyperparameters\ntuning statistically significantly outperforms Mult-VAE by 10% in NDCG@100 with\nbinary relevance function.\n","authors":["Aleksandr Milogradskii","Oleg Lashinin","Alexander P","Marina Ananyeva","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2409.14217v1.pdf","comment":"This paper is accepted at the Reproducibility track of the ACM RecSys\n '24 conference"},{"id":"http://arxiv.org/abs/2409.14192v1","updated":"2024-09-21T16:46:15Z","published":"2024-09-21T16:46:15Z","title":"Knowledge in Triples for LLMs: Enhancing Table QA Accuracy with Semantic\n Extraction","summary":" Integrating structured knowledge from tabular formats poses significant\nchallenges within natural language processing (NLP), mainly when dealing with\ncomplex, semi-structured tables like those found in the FeTaQA dataset. These\ntables require advanced methods to interpret and generate meaningful responses\naccurately. Traditional approaches, such as SQL and SPARQL, often fail to fully\ncapture the semantics of such data, especially in the presence of irregular\ntable structures like web tables. This paper addresses these challenges by\nproposing a novel approach that extracts triples straightforward from tabular\ndata and integrates it with a retrieval-augmented generation (RAG) model to\nenhance the accuracy, coherence, and contextual richness of responses generated\nby a fine-tuned GPT-3.5-turbo-0125 model. Our approach significantly\noutperforms existing baselines on the FeTaQA dataset, particularly excelling in\nSacre-BLEU and ROUGE metrics. It effectively generates contextually accurate\nand detailed long-form answers from tables, showcasing its strength in complex\ndata interpretation.\n","authors":["Hossein Sholehrasa","Sanaz Saki Norouzi","Pascal Hitzler","Majid Jaberi-Douraki"],"pdf_url":"https://arxiv.org/pdf/2409.14192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14078v1","updated":"2024-09-21T09:13:50Z","published":"2024-09-21T09:13:50Z","title":"Data Generation via Latent Factor Simulation for Fairness-aware\n Re-ranking","summary":" Synthetic data is a useful resource for algorithmic research. It allows for\nthe evaluation of systems under a range of conditions that might be difficult\nto achieve in real world settings. In recommender systems, the use of synthetic\ndata is somewhat limited; some work has concentrated on building user-item\ninteraction data at large scale. We believe that fairness-aware recommendation\nresearch can benefit from simulated data as it allows the study of protected\ngroups and their interactions without depending on sensitive data that needs\nprivacy protection. In this paper, we propose a novel type of data for\nfairness-aware recommendation: synthetic recommender system outputs that can be\nused to study re-ranking algorithms.\n","authors":["Elena Stefancova","Cassidy All","Joshua Paup","Martin Homola","Nicholas Mattei","Robin Burke"],"pdf_url":"https://arxiv.org/pdf/2409.14078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10081v3","updated":"2024-09-21T08:27:16Z","published":"2024-03-15T07:45:37Z","title":"DRAGIN: Dynamic Retrieval Augmented Generation based on the Information\n Needs of Large Language Models","summary":" Dynamic retrieval augmented generation (RAG) paradigm actively decides when\nand what to retrieve during the text generation process of Large Language\nModels (LLMs). There are two key elements of this paradigm: identifying the\noptimal moment to activate the retrieval module (deciding when to retrieve) and\ncrafting the appropriate query once retrieval is triggered (determining what to\nretrieve). However, current dynamic RAG methods fall short in both aspects.\nFirstly, the strategies for deciding when to retrieve often rely on static\nrules. Moreover, the strategies for deciding what to retrieve typically limit\nthemselves to the LLM's most recent sentence or the last few tokens, while the\nLLM's real-time information needs may span across the entire context. To\novercome these limitations, we introduce a new framework, DRAGIN, i.e., Dynamic\nRetrieval Augmented Generation based on the real-time Information Needs of\nLLMs. Our framework is specifically designed to make decisions on when and what\nto retrieve based on the LLM's real-time information needs during the text\ngeneration process. We evaluate DRAGIN along with existing methods\ncomprehensively over 4 knowledge-intensive generation datasets. Experimental\nresults show that DRAGIN achieves superior performance on all tasks,\ndemonstrating the effectiveness of our method. We have open-sourced all the\ncode, data, and models in GitHub: https://github.com/oneal2000/DRAGIN/tree/main\n","authors":["Weihang Su","Yichen Tang","Qingyao Ai","Zhijing Wu","Yiqun Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10081v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14038v1","updated":"2024-09-21T06:49:34Z","published":"2024-09-21T06:49:34Z","title":"OAEI-LLM: A Benchmark Dataset for Understanding Large Language Model\n Hallucinations in Ontology Matching","summary":" Hallucinations of large language models (LLMs) commonly occur in\ndomain-specific downstream tasks, with no exception in ontology matching (OM).\nThe prevalence of using LLMs for OM raises the need for benchmarks to better\nunderstand LLM hallucinations. The OAEI-LLM dataset is an extended version of\nthe Ontology Alignment Evaluation Initiative (OAEI) datasets that evaluate\nLLM-specific hallucinations in OM tasks. We outline the methodology used in\ndataset construction and schema extension, and provide examples of potential\nuse cases.\n","authors":["Zhangcheng Qiang","Kerry Taylor","Weiqing Wang","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.14038v1.pdf","comment":"4 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.14034v1","updated":"2024-09-21T06:32:28Z","published":"2024-09-21T06:32:28Z","title":"Cost-Effective Community-Hierarchy-Based Mutual Voting Approach for\n Influence Maximization in Complex Networks","summary":" Various types of promising techniques have come into being for influence\nmaximization whose aim is to identify influential nodes in complex networks. In\nessence, real-world applications usually have high requirements on the balance\nbetween time complexity and accuracy of influential nodes identification. To\naddress the challenges of imperfect node influence measurement and inefficient\nseed nodes selection mechanism in such class of foregoing techniques, this\narticle proposes a novel approach called Cost-Effective\nCommunity-Hierarchy-Based Mutual Voting for influence maximization in complex\nnetworks. First, we develop a method for measuring the importance of different\nnodes in networks based on an original concept of Dual-Scale\nCommunity-Hierarchy Information that synthesizes both hierarchy structural\ninformation and community structural information of nodes. The community\nstructural information contained in the nodes is measured by a new notion of\nHierarchical-Community Entropy. Second, we develop a method named\nCost-Effective Mutual-Influence-based Voting for seed nodes selection.\nHereinto, a low-computational-cost mutual voting mechanism and an updating\nstrategy called Lazy Score Updating Strategy are newly constructed for\noptimizing the selecting of seed nodes. Third, we develop a balance index to\nevaluate the performance of different methods in striking the tradeoff between\ntime complexity and the accuracy of influential nodes identification. Finally,\nwe demonstrate the approach performance over ten public datasets. The extensive\nexperiments show that the proposed approach outperforms 16 state-of-the-art\ntechniques on the balance between time complexity and accuracy of influential\nnodes identification. Compared with the method with the second highest value of\nthe balance index, our approach can be improved by at most 9.29%.\n","authors":["Yi Liu","Xiaoan Tang","Witold Pedrycz","Qiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.14034v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.14087v1","updated":"2024-09-21T09:53:11Z","published":"2024-09-21T09:53:11Z","title":"BRep Boundary and Junction Detection for CAD Reverse Engineering","summary":" In machining process, 3D reverse engineering of the mechanical system is an\nintegral, highly important, and yet time consuming step to obtain parametric\nCAD models from 3D scans. Therefore, deep learning-based Scan-to-CAD modeling\ncan offer designers enormous editability to quickly modify CAD model, being\nable to parse all its structural compositions and design steps. In this paper,\nwe propose a supervised boundary representation (BRep) detection network\nBRepDetNet from 3D scans of CC3D and ABC dataset. We have carefully annotated\nthe 50K and 45K scans of both the datasets with appropriate topological\nrelations (e.g., next, mate, previous) between the geometrical primitives\n(i.e., boundaries, junctions, loops, faces) of their BRep data structures. The\nproposed solution decomposes the Scan-to-CAD problem in Scan-to-BRep ensuring\nthe right step towards feature-based modeling, and therefore, leveraging other\nexisting BRep-to-CAD modeling methods. Our proposed Scan-to-BRep neural network\nlearns to detect BRep boundaries and junctions by minimizing focal-loss and\nnon-maximal suppression (NMS) during training time. Experimental results show\nthat our BRepDetNet with NMS-Loss achieves impressive results.\n","authors":["Sk Aziz Ali","Mohammad Sadil Khan","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2409.14087v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.13982v1","updated":"2024-09-21T02:17:35Z","published":"2024-09-21T02:17:35Z","title":"CUS3D :CLIP-based Unsupervised 3D Segmentation via Object-level Denoise","summary":" To ease the difficulty of acquiring annotation labels in 3D data, a common\nmethod is using unsupervised and open-vocabulary semantic segmentation, which\nleverage 2D CLIP semantic knowledge. In this paper, unlike previous research\nthat ignores the ``noise'' raised during feature projection from 2D to 3D, we\npropose a novel distillation learning framework named CUS3D. In our approach,\nan object-level denosing projection module is designed to screen out the\n``noise'' and ensure more accurate 3D feature. Based on the obtained features,\na multimodal distillation learning module is designed to align the 3D feature\nwith CLIP semantic feature space with object-centered constrains to achieve\nadvanced unsupervised semantic segmentation. We conduct comprehensive\nexperiments in both unsupervised and open-vocabulary segmentation, and the\nresults consistently showcase the superiority of our model in achieving\nadvanced unsupervised segmentation results and its effectiveness in\nopen-vocabulary segmentation.\n","authors":["Fuyang Yu","Runze Tian","Zhen Wang","Xiaochuan Wang","Xiaohui Liang"],"pdf_url":"https://arxiv.org/pdf/2409.13982v1.pdf","comment":"6 pages,3 figures"}]},"2024-09-20T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.13888v1","updated":"2024-09-20T20:39:23Z","published":"2024-09-20T20:39:23Z","title":"Causal Feature Selection Method for Contextual Multi-Armed Bandits in\n Recommender System","summary":" Features (a.k.a. context) are critical for contextual multi-armed bandits\n(MAB) performance. In practice of large scale online system, it is important to\nselect and implement important features for the model: missing important\nfeatures can led to sub-optimal reward outcome, and including irrelevant\nfeatures can cause overfitting, poor model interpretability, and implementation\ncost. However, feature selection methods for conventional machine learning\nmodels fail short for contextual MAB use cases, as conventional methods select\nfeatures correlated with the outcome variable, but not necessarily causing\nheterogeneuous treatment effect among arms which are truely important for\ncontextual MAB. In this paper, we introduce model-free feature selection\nmethods designed for contexutal MAB problem, based on heterogeneous causal\neffect contributed by the feature to the reward distribution. Empirical\nevaluation is conducted based on synthetic data as well as real data from an\nonline experiment for optimizing content cover image in a recommender system.\nThe results show this feature selection method effectively selects the\nimportant features that lead to higher contextual MAB reward than unimportant\nfeatures. Compared with model embedded method, this model-free method has\nadvantage of fast computation speed, ease of implementation, and prune of model\nmis-specification issues.\n","authors":["Zhenyu Zhao","Yexi Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.13888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13847v1","updated":"2024-09-20T18:42:04Z","published":"2024-09-20T18:42:04Z","title":"Segment Discovery: Enhancing E-commerce Targeting","summary":" Modern e-commerce services frequently target customers with incentives or\ninterventions to engage them in their products such as games, shopping, video\nstreaming, etc. This customer engagement increases acquisition of more\ncustomers and retention of existing ones, leading to more business for the\ncompany while improving customer experience. Often, customers are either\nrandomly targeted or targeted based on the propensity of desirable behavior.\nHowever, such policies can be suboptimal as they do not target the set of\ncustomers who would benefit the most from the intervention and they may also\nnot take account of any constraints. In this paper, we propose a policy\nframework based on uplift modeling and constrained optimization that identifies\ncustomers to target for a use-case specific intervention so as to maximize the\nvalue to the business, while taking account of any given constraints. We\ndemonstrate improvement over state-of-the-art targeting approaches using two\nlarge-scale experimental studies and a production implementation.\n","authors":["Qiqi Li","Roopali Singh","Charin Polpanumas","Tanner Fiez","Namita Kumar","Shreya Chakrabarti"],"pdf_url":"https://arxiv.org/pdf/2409.13847v1.pdf","comment":"Accepted at the CONSEQUENCES'24 workshop, co-located with ACM\n RecSys'24"},{"id":"http://arxiv.org/abs/2305.16326v3","updated":"2024-09-20T18:17:38Z","published":"2023-05-10T13:40:06Z","title":"Large language models in biomedical natural language processing:\n benchmarks, baselines, and recommendations","summary":" Biomedical literature is growing rapidly, making it challenging to curate and\nextract knowledge manually. Biomedical natural language processing (BioNLP)\ntechniques that can automatically extract information from biomedical\nliterature help alleviate this burden. Recently, large Language Models (LLMs),\nsuch as GPT-3 and GPT-4, have gained significant attention for their impressive\nperformance. However, their effectiveness in BioNLP tasks and impact on method\ndevelopment and downstream users remain understudied. This pilot study (1)\nestablishes the baseline performance of GPT-3 and GPT-4 at both zero-shot and\none-shot settings in eight BioNLP datasets across four applications: named\nentity recognition, relation extraction, multi-label document classification,\nand semantic similarity and reasoning, (2) examines the errors produced by the\nLLMs and categorized the errors into three types: missingness, inconsistencies,\nand unwanted artificial content, and (3) provides suggestions for using LLMs in\nBioNLP applications. We make the datasets, baselines, and results publicly\navailable to the community via\nhttps://github.com/qingyu-qc/gpt_bionlp_benchmark.\n","authors":["Qingyu Chen","Jingcheng Du","Yan Hu","Vipina Kuttichi Keloth","Xueqing Peng","Kalpana Raja","Rui Zhang","Zhiyong Lu","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2305.16326v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16089v3","updated":"2024-09-20T17:53:41Z","published":"2023-07-29T22:40:59Z","title":"Train Once, Use Flexibly: A Modular Framework for Multi-Aspect Neural\n News Recommendation","summary":" Recent neural news recommenders (NNRs) extend content-based recommendation\n(1) by aligning additional aspects (e.g., topic, sentiment) between candidate\nnews and user history or (2) by diversifying recommendations w.r.t. these\naspects. This customization is achieved by ``hardcoding`` additional\nconstraints into the NNR's architecture and/or training objectives: any change\nin the desired recommendation behavior thus requires retraining the model with\na modified objective. This impedes widespread adoption of multi-aspect news\nrecommenders. In this work, we introduce MANNeR, a modular framework for\nmulti-aspect neural news recommendation that supports on-the-fly customization\nover individual aspects at inference time. With metric-based learning as its\nbackbone, MANNeR learns aspect-specialized news encoders and then flexibly and\nlinearly combines the resulting aspect-specific similarity scores into\ndifferent ranking functions, alleviating the need for ranking function-specific\nretraining of the model. Extensive experimental results show that MANNeR\nconsistently outperforms state-of-the-art NNRs on both standard content-based\nrecommendation and single- and multi-aspect customization. Lastly, we validate\nthat MANNeR's aspect-customization module is robust to language and domain\ntransfer.\n","authors":["Andreea Iana","Goran Glavaš","Heiko Paulheim"],"pdf_url":"https://arxiv.org/pdf/2307.16089v3.pdf","comment":"Accepted at the 2024 Conference on Empirical Methods in Natural\n Language Processing (EMNLP 2024)"},{"id":"http://arxiv.org/abs/2409.13628v1","updated":"2024-09-20T16:36:46Z","published":"2024-09-20T16:36:46Z","title":"Beauty Beyond Words: Explainable Beauty Product Recommendations Using\n Ingredient-Based Product Attributes","summary":" Accurate attribute extraction is critical for beauty product recommendations\nand building trust with customers. This remains an open problem, as existing\nsolutions are often unreliable and incomplete. We present a system to extract\nbeauty-specific attributes using end-to-end supervised learning based on beauty\nproduct ingredients. A key insight to our system is a novel energy-based\nimplicit model architecture. We show that this implicit model architecture\noffers significant benefits in terms of accuracy, explainability, robustness,\nand flexibility. Furthermore, our implicit model can be easily fine-tuned to\nincorporate additional attributes as they become available, making it more\nuseful in real-world applications. We validate our model on a major e-commerce\nskincare product catalog dataset and demonstrate its effectiveness. Finally, we\nshowcase how ingredient-based attribute extraction contributes to enhancing the\nexplainability of beauty recommendations.\n","authors":["Siliang Liu","Rahul Suresh","Amin Banitalebi-Dehkordi"],"pdf_url":"https://arxiv.org/pdf/2409.13628v1.pdf","comment":"18th ACM Conference on Recommender Systems, Workshop on Strategic and\n Utility-aware REcommendation"},{"id":"http://arxiv.org/abs/2409.13621v1","updated":"2024-09-20T16:32:54Z","published":"2024-09-20T16:32:54Z","title":"Advancing Event Causality Identification via Heuristic Semantic\n Dependency Inquiry Network","summary":" Event Causality Identification (ECI) focuses on extracting causal relations\nbetween events in texts. Existing methods for ECI primarily rely on causal\nfeatures and external knowledge. However, these approaches fall short in two\ndimensions: (1) causal features between events in a text often lack explicit\nclues, and (2) external knowledge may introduce bias, while specific problems\nrequire tailored analyses. To address these issues, we propose SemDI - a simple\nand effective Semantic Dependency Inquiry Network for ECI. SemDI captures\nsemantic dependencies within the context using a unified encoder. Then, it\nutilizes a Cloze Analyzer to generate a fill-in token based on comprehensive\ncontext understanding. Finally, this fill-in token is used to inquire about the\ncausal relation between two events. Extensive experiments demonstrate the\neffectiveness of SemDI, surpassing state-of-the-art methods on three widely\nused benchmarks. Code is available at https://github.com/hrlics/SemDI.\n","authors":["Haoran Li","Qiang Gao","Hongmei Wu","Li Huang"],"pdf_url":"https://arxiv.org/pdf/2409.13621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13545v1","updated":"2024-09-20T14:39:42Z","published":"2024-09-20T14:39:42Z","title":"Data Augmentation for Sequential Recommendation: A Survey","summary":" As an essential branch of recommender systems, sequential recommendation (SR)\nhas received much attention due to its well-consistency with real-world\nsituations. However, the widespread data sparsity issue limits the SR model's\nperformance. Therefore, researchers have proposed many data augmentation (DA)\nmethods to mitigate this phenomenon and have achieved impressive progress. In\nthis survey, we provide a comprehensive review of DA methods for SR. We start\nby introducing the research background and motivation. Then, we categorize\nexisting methodologies regarding their augmentation principles, objects, and\npurposes. Next, we present a comparative discussion of their advantages and\ndisadvantages, followed by the exhibition and analysis of representative\nexperimental results. Finally, we outline directions for future research and\nsummarize this survey. We also maintain a repository with a paper list at\n\\url{https://github.com/KingGugu/DA-CL-4Rec}.\n","authors":["Yizhou Dang","Enneng Yang","Yuting Liu","Guibing Guo","Linying Jiang","Jianzhe Zhao","Xingwei Wang"],"pdf_url":"https://arxiv.org/pdf/2409.13545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13483v1","updated":"2024-09-20T13:15:53Z","published":"2024-09-20T13:15:53Z","title":"A Multimodal Dense Retrieval Approach for Speech-Based Open-Domain\n Question Answering","summary":" Speech-based open-domain question answering (QA over a large corpus of text\npassages with spoken questions) has emerged as an important task due to the\nincreasing number of users interacting with QA systems via speech interfaces.\nPassage retrieval is a key task in speech-based open-domain QA. So far,\nprevious works adopted pipelines consisting of an automatic speech recognition\n(ASR) model that transcribes the spoken question before feeding it to a dense\ntext retriever. Such pipelines have several limitations. The need for an ASR\nmodel limits the applicability to low-resource languages and specialized\ndomains with no annotated speech data. Furthermore, the ASR model propagates\nits errors to the retriever. In this work, we try to alleviate these\nlimitations by proposing an ASR-free, end-to-end trained multimodal dense\nretriever that can work directly on spoken questions. Our experimental results\nshowed that, on shorter questions, our retriever is a promising alternative to\nthe \\textit{ASR and Retriever} pipeline, achieving better retrieval performance\nin cases where ASR would have mistranscribed important words in the question or\nhave produced a transcription with a high word error rate.\n","authors":["Georgios Sidiropoulos","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2409.13483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13425v1","updated":"2024-09-20T11:46:37Z","published":"2024-09-20T11:46:37Z","title":"Procedure Model for Building Knowledge Graphs for Industry Applications","summary":" Enterprise knowledge graphs combine business data and organizational\nknowledge by means of a semantic network of concepts, properties, individuals\nand relationships. The graph-based integration of previously unconnected\ninformation with domain knowledge provides new insights and enables intelligent\nbusiness applications. However, knowledge graph construction is a large\ninvestment which requires a joint effort of domain and technical experts. This\npaper presents a practical step-by-step procedure model for building an RDF\nknowledge graph that interconnects heterogeneous data and expert knowledge for\nan industry use case. The self-contained process adapts the \"Cross Industry\nStandard Process for Data Mining\" and uses competency questions throughout the\nentire development cycle. The procedure model starts with business and data\nunderstanding, describes tasks for ontology modeling and the graph setup, and\nends with process steps for evaluation and deployment.\n","authors":["Sascha Meckler"],"pdf_url":"https://arxiv.org/pdf/2409.13425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13385v1","updated":"2024-09-20T10:36:49Z","published":"2024-09-20T10:36:49Z","title":"Contextual Compression in Retrieval-Augmented Generation for Large\n Language Models: A Survey","summary":" Large Language Models (LLMs) showcase remarkable abilities, yet they struggle\nwith limitations such as hallucinations, outdated knowledge, opacity, and\ninexplicable reasoning. To address these challenges, Retrieval-Augmented\nGeneration (RAG) has proven to be a viable solution, leveraging external\ndatabases to improve the consistency and coherence of generated content,\nespecially valuable for complex, knowledge-rich tasks, and facilitates\ncontinuous improvement by leveraging domain-specific insights. By combining the\nintrinsic knowledge of LLMs with the vast, dynamic repositories of external\ndatabases, RAG achieves a synergistic effect. However, RAG is not without its\nlimitations, including a limited context window, irrelevant information, and\nthe high processing overhead for extensive contextual data. In this\ncomprehensive work, we explore the evolution of Contextual Compression\nparadigms, providing an in-depth examination of the field. Finally, we outline\nthe current challenges and suggest potential research and development\ndirections, paving the way for future advancements in this area.\n","authors":["Sourav Verma"],"pdf_url":"https://arxiv.org/pdf/2409.13385v1.pdf","comment":"Ongoing Work"},{"id":"http://arxiv.org/abs/2409.13376v1","updated":"2024-09-20T10:24:39Z","published":"2024-09-20T10:24:39Z","title":"More Clustering Quality Metrics for ABCDE","summary":" ABCDE is a technique for evaluating clusterings of very large populations of\nitems. Given two clusterings, namely a Baseline clustering and an Experiment\nclustering, ABCDE can characterize their differences with impact and quality\nmetrics, and thus help to determine which clustering to prefer. We previously\ndescribed the basic quality metrics of ABCDE, namely the GoodSplitRate,\nBadSplitRate, GoodMergeRate, BadMergeRate and DeltaPrecision, and how to\nestimate them on the basis of human judgements. This paper extends that\ntreatment with more quality metrics. It describes a technique that aims to\ncharacterize the DeltaRecall of the clustering change. It introduces a new\nmetric, called IQ, to characterize the degree to which the clustering diff\ntranslates into an improvement in the quality. Ideally, a large diff would\nimprove the quality by a large amount. Finally, this paper mentions ways to\ncharacterize the absolute Precision and Recall of a single clustering with\nABCDE.\n","authors":["Stephan van Staden"],"pdf_url":"https://arxiv.org/pdf/2409.13376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11301v2","updated":"2024-09-20T08:48:30Z","published":"2024-09-17T15:57:33Z","title":"TISIS : Trajectory Indexing for SImilarity Search","summary":" Social media platforms enable users to share diverse types of information,\nincluding geolocation data that captures their movement patterns. Such\ngeolocation data can be leveraged to reconstruct the trajectory of a user's\nvisited Points of Interest (POIs). A key requirement in numerous applications\nis the ability to measure the similarity between such trajectories, as this\nfacilitates the retrieval of trajectories that are similar to a given reference\ntrajectory. This is the main focus of our work. Existing methods predominantly\nrely on applying a similarity function to each candidate trajectory to identify\nthose that are sufficiently similar. However, this approach becomes\ncomputationally expensive when dealing with large-scale datasets. To mitigate\nthis challenge, we propose TISIS, an efficient method that uses trajectory\nindexing to quickly find similar trajectories that share common POIs in the\nsame order. Furthermore, to account for scenarios where POIs in trajectories\nmay not exactly match but are contextually similar, we introduce TISIS*, a\nvariant of TISIS that incorporates POI embeddings. This extension allows for\nmore comprehensive retrieval of similar trajectories by considering semantic\nsimilarities between POIs, beyond mere exact matches. Extensive experimental\nevaluations demonstrate that the proposed approach significantly outperforms a\nbaseline method based on the well-known Longest Common SubSequence (LCSS)\nalgorithm, yielding substantial performance improvements across various\nreal-world datasets.\n","authors":["Sara Jarrad","Hubert Naacke","Stephane Gancarski"],"pdf_url":"https://arxiv.org/pdf/2409.11301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09603v5","updated":"2024-09-20T08:06:34Z","published":"2023-04-19T12:17:46Z","title":"Visualising Personal Data Flows: Insights from a Case Study of\n Booking.com","summary":" Commercial organisations are holding and processing an ever-increasing amount\nof personal data. Policies and laws are continually changing to require these\ncompanies to be more transparent regarding the collection, storage, processing\nand sharing of this data. This paper reports our work of taking Booking.com as\na case study to visualise personal data flows extracted from their privacy\npolicy. By showcasing how the company shares its consumers' personal data, we\nraise questions and extend discussions on the challenges and limitations of\nusing privacy policies to inform online users about the true scale and the\nlandscape of personal data flows. This case study can inform us about future\nresearch on more data flow-oriented privacy policy analysis and on the\nconstruction of a more comprehensive ontology on personal data flows in\ncomplicated business ecosystems.\n","authors":["Haiyue Yuan","Matthew Boakes","Xiao Ma","Dongmei Cao","Shujun Li"],"pdf_url":"https://arxiv.org/pdf/2304.09603v5.pdf","comment":"This is the full edition of a paper published in Intelligent\n Information Systems: CAiSE Forum 2023, Zaragoza, Spain, June 12-16, 2023,\n Proceedings, Lecture Notes in Business Information Processing (LNBIP), Volume\n 477, pp. 52-60, 2023, Springer Nature,\n https://link.springer.com/book/10.1007/978-3-031-34674-3_7"},{"id":"http://arxiv.org/abs/2409.12912v2","updated":"2024-09-20T07:36:54Z","published":"2024-09-19T17:07:31Z","title":"The Relevance of Item-Co-Exposure For Exposure Bias Mitigation","summary":" Through exposing items to users, implicit feedback recommender systems\ninfluence the logged interactions, and, ultimately, their own recommendations.\nThis effect is called exposure bias and it can lead to issues such as filter\nbubbles and echo chambers. Previous research employed the multinomial logit\nmodel (MNL) with exposure information to reduce exposure bias on synthetic\ndata.\n This extended abstract summarizes our previous study in which we investigated\nwhether (i) these findings hold for human-generated choices, (ii) other\ndiscrete choice models mitigate bias better, and (iii) an item's estimated\nrelevance can depend on the relevances of the other items that were presented\nwith it. We collected a data set of biased and unbiased choices in a controlled\nonline user study and measured the effects of overexposure and competition.\n We found that (i) the discrete choice models effectively mitigated exposure\nbias on human-generated choice data, (ii) there were no significant differences\nin robustness among the different discrete choice models, and (iii) only\nmultivariate discrete choice models were robust to competition between items.\nWe conclude that discrete choice models mitigate exposure bias effectively\nbecause they consider item-co-exposure. Moreover, exposing items alongside more\nor less popular items can bias future recommendations significantly and item\nexposure must be tracked for overcoming exposure bias. We consider our work\nvital for understanding what exposure bias it, how it forms, and how it can be\nmitigated.\n","authors":["Thorsten Krause","Alina Deriyeva","Jan Heinrich Beinke","Gerrit York Bartels","Oliver Thomas"],"pdf_url":"https://arxiv.org/pdf/2409.12912v2.pdf","comment":"Accepted at the CONSEQUENCES '24 workshop, co-located with ACM RecSys\n '24"},{"id":"http://arxiv.org/abs/2409.08479v2","updated":"2024-09-20T04:52:16Z","published":"2024-09-13T02:08:47Z","title":"Exploring Information Retrieval Landscapes: An Investigation of a Novel\n Evaluation Techniques and Comparative Document Splitting Methods","summary":" The performance of Retrieval-Augmented Generation (RAG) systems in\ninformation retrieval is significantly influenced by the characteristics of the\ndocuments being processed. In this study, the structured nature of textbooks,\nthe conciseness of articles, and the narrative complexity of novels are shown\nto require distinct retrieval strategies. A comparative evaluation of multiple\ndocument-splitting methods reveals that the Recursive Character Splitter\noutperforms the Token-based Splitter in preserving contextual integrity. A\nnovel evaluation technique is introduced, utilizing an open-source model to\ngenerate a comprehensive dataset of question-and-answer pairs, simulating\nrealistic retrieval scenarios to enhance testing efficiency and metric\nreliability. The evaluation employs weighted scoring metrics, including\nSequenceMatcher, BLEU, METEOR, and BERT Score, to assess the system's accuracy\nand relevance. This approach establishes a refined standard for evaluating the\nprecision of RAG systems, with future research focusing on optimizing chunk and\noverlap sizes to improve retrieval accuracy and efficiency.\n","authors":["Esmaeil Narimissa","David Raithel"],"pdf_url":"https://arxiv.org/pdf/2409.08479v2.pdf","comment":"This article is 16 pages long and includes detailed comparisons of\n RAG systems and document splitting techniques"},{"id":"http://arxiv.org/abs/2409.13210v1","updated":"2024-09-20T04:37:36Z","published":"2024-09-20T04:37:36Z","title":"A Unified Causal Framework for Auditing Recommender Systems for Ethical\n Concerns","summary":" As recommender systems become widely deployed in different domains, they\nincreasingly influence their users' beliefs and preferences. Auditing\nrecommender systems is crucial as it not only ensures the continuous\nimprovement of recommendation algorithms but also safeguards against potential\nissues like biases and ethical concerns. In this paper, we view recommender\nsystem auditing from a causal lens and provide a general recipe for defining\nauditing metrics. Under this general causal auditing framework, we categorize\nexisting auditing metrics and identify gaps in them -- notably, the lack of\nmetrics for auditing user agency while accounting for the multi-step dynamics\nof the recommendation process. We leverage our framework and propose two\nclasses of such metrics:future- and past-reacheability and stability, that\nmeasure the ability of a user to influence their own and other users'\nrecommendations, respectively. We provide both a gradient-based and a black-box\napproach for computing these metrics, allowing the auditor to compute them\nunder different levels of access to the recommender system. In our experiments,\nwe demonstrate the efficacy of methods for computing the proposed metrics and\ninspect the design of recommender systems through these proposed metrics.\n","authors":["Vibhhu Sharma","Shantanu Gupta","Nil-Jana Akpinar","Zachary C. Lipton","Liu Leqi"],"pdf_url":"https://arxiv.org/pdf/2409.13210v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2409.13175v1","updated":"2024-09-20T03:02:42Z","published":"2024-09-20T03:02:42Z","title":"RPAF: A Reinforcement Prediction-Allocation Framework for Cache\n Allocation in Large-Scale Recommender Systems","summary":" Modern recommender systems are built upon computation-intensive\ninfrastructure, and it is challenging to perform real-time computation for each\nrequest, especially in peak periods, due to the limited computational\nresources. Recommending by user-wise result caches is widely used when the\nsystem cannot afford a real-time recommendation. However, it is challenging to\nallocate real-time and cached recommendations to maximize the users' overall\nengagement. This paper shows two key challenges to cache allocation, i.e., the\nvalue-strategy dependency and the streaming allocation. Then, we propose a\nreinforcement prediction-allocation framework (RPAF) to address these issues.\nRPAF is a reinforcement-learning-based two-stage framework containing\nprediction and allocation stages. The prediction stage estimates the values of\nthe cache choices considering the value-strategy dependency, and the allocation\nstage determines the cache choices for each individual request while satisfying\nthe global budget constraint. We show that the challenge of training RPAF\nincludes globality and the strictness of budget constraints, and a relaxed\nlocal allocator (RLA) is proposed to address this issue. Moreover, a PoolRank\nalgorithm is used in the allocation stage to deal with the streaming allocation\nproblem. Experiments show that RPAF significantly improves users' engagement\nunder computational budget constraints.\n","authors":["Shuo Su","Xiaoshuang Chen","Yao Wang","Yulin Wu","Ziqiang Zhang","Kaiqiao Zhan","Ben Wang","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2409.13175v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.13689v1","updated":"2024-09-20T17:59:01Z","published":"2024-09-20T17:59:01Z","title":"Temporally Aligned Audio for Video with Autoregression","summary":" We introduce V-AURA, the first autoregressive model to achieve high temporal\nalignment and relevance in video-to-audio generation. V-AURA uses a\nhigh-framerate visual feature extractor and a cross-modal audio-visual feature\nfusion strategy to capture fine-grained visual motion events and ensure precise\ntemporal alignment. Additionally, we propose VisualSound, a benchmark dataset\nwith high audio-visual relevance. VisualSound is based on VGGSound, a video\ndataset consisting of in-the-wild samples extracted from YouTube. During the\ncuration, we remove samples where auditory events are not aligned with the\nvisual ones. V-AURA outperforms current state-of-the-art models in temporal\nalignment and semantic relevance while maintaining comparable audio quality.\nCode, samples, VisualSound and models are available at\nhttps://v-aura.notion.site\n","authors":["Ilpo Viertola","Vladimir Iashin","Esa Rahtu"],"pdf_url":"https://arxiv.org/pdf/2409.13689v1.pdf","comment":"Submitted to ICASSP 2025. Project page https://v-aura.notion.site"},{"id":"http://arxiv.org/abs/2309.15889v2","updated":"2024-09-20T11:48:51Z","published":"2023-09-27T16:30:59Z","title":"High Perceptual Quality Wireless Image Delivery with Denoising Diffusion\n Models","summary":" We consider the image transmission problem over a noisy wireless channel via\ndeep learning-based joint source-channel coding (DeepJSCC) along with a\ndenoising diffusion probabilistic model (DDPM) at the receiver. Specifically,\nwe are interested in the perception-distortion trade-off in the practical\nfinite block length regime, in which separate source and channel coding can be\nhighly suboptimal. We introduce a novel scheme, where the conventional DeepJSCC\nencoder targets transmitting a lower resolution version of the image, which\nlater can be refined thanks to the generative model available at the receiver.\nIn particular, we utilize the range-null space decomposition of the target\nimage; DeepJSCC transmits the range-space of the image, while DDPM\nprogressively refines its null space contents. Through extensive experiments,\nwe demonstrate significant improvements in distortion and perceptual quality of\nreconstructed images compared to standard DeepJSCC and the state-of-the-art\ngenerative learning-based method.\n","authors":["Selim F. Yilmaz","Xueyan Niu","Bo Bai","Wei Han","Lei Deng","Deniz Gunduz"],"pdf_url":"https://arxiv.org/pdf/2309.15889v2.pdf","comment":"6 pages, 5 figures. Published at INFOCOM 2024 Workshops"},{"id":"http://arxiv.org/abs/2309.04084v2","updated":"2024-09-20T09:22:47Z","published":"2023-09-08T02:50:54Z","title":"Towards Efficient SDRTV-to-HDRTV by Learning from Image Formation","summary":" Modern displays can render video content with high dynamic range (HDR) and\nwide color gamut (WCG). However, most resources are still in standard dynamic\nrange (SDR). Therefore, transforming existing SDR content into the HDRTV\nstandard holds significant value. This paper defines and analyzes the\nSDRTV-to-HDRTV task by modeling the formation of SDRTV/HDRTV content. Our\nfindings reveal that a naive endto-end supervised training approach suffers\nfrom severe gamut transition errors. To address this, we propose a new\nthree-step solution called HDRTVNet++, which includes adaptive global color\nmapping, local enhancement, and highlight refinement. The adaptive global color\nmapping step utilizes global statistics for image-adaptive color adjustments. A\nlocal enhancement network further enhances details, and the two sub-networks\nare combined as a generator to achieve highlight consistency through GANbased\njoint training. Designed for ultra-high-definition TV content, our method is\nboth effective and lightweight for processing 4K resolution images. We also\nconstructed a dataset using HDR videos in the HDR10 standard, named HDRTV1K,\ncontaining 1235 training and 117 testing images, all in 4K resolution.\nAdditionally, we employ five metrics to evaluate SDRTV-to-HDRTV performance.\nOur results demonstrate state-of-the-art performance both quantitatively and\nvisually. The codes and models are available at\nhttps://github.com/xiaom233/HDRTVNet-plus.\n","authors":["Xiangyu Chen","Zheyuan Li","Zhengwen Zhang","Jimmy S. Ren","Yihao Liu","Jingwen He","Yu Qiao","Jiantao Zhou","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.04084v2.pdf","comment":"Extended version of HDRTVNet"},{"id":"http://arxiv.org/abs/2409.13194v1","updated":"2024-09-20T03:55:34Z","published":"2024-09-20T03:55:34Z","title":"ChemDFM-X: Towards Large Multimodal Model for Chemistry","summary":" Rapid developments of AI tools are expected to offer unprecedented assistance\nto the research of natural science including chemistry. However, neither\nexisting unimodal task-specific specialist models nor emerging general large\nmultimodal models (LMM) can cover the wide range of chemical data modality and\ntask categories. To address the real demands of chemists, a cross-modal\nChemical General Intelligence (CGI) system, which serves as a truly practical\nand useful research assistant utilizing the great potential of LMMs, is in\ngreat need. In this work, we introduce the first Cross-modal Dialogue\nFoundation Model for Chemistry (ChemDFM-X). Diverse multimodal data are\ngenerated from an initial modality by approximate calculations and\ntask-specific model predictions. This strategy creates sufficient chemical\ntraining corpora, while significantly reducing excessive expense, resulting in\nan instruction-tuning dataset containing 7.6M data. After instruction\nfinetuning, ChemDFM-X is evaluated on extensive experiments of different\nchemical tasks with various data modalities. The results demonstrate the\ncapacity of ChemDFM-X for multimodal and inter-modal knowledge comprehension.\nChemDFM-X marks a significant milestone toward aligning all modalities in\nchemistry, a step closer to CGI.\n","authors":["Zihan Zhao","Bo Chen","Jingpiao Li","Lu Chen","Liyang Wen","Pengyu Wang","Zichen Zhu","Danyang Zhang","Ziping Wan","Yansi Li","Zhongyang Dai","Xin Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2409.13194v1.pdf","comment":"19 pages, 7 figures, 11 tables"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 95 + +
+
+
+ + ☆ Open-World Evaluation for Retrieving Diverse Perspectives + + +
+ We study retrieving a set of documents that covers various perspectives on a +complex and contentious question (e.g., will ChatGPT do more harm than good?). +We curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS), +where each example consists of a question and diverse perspectives associated +with the question, sourced from survey questions and debate websites. On this +data, retrievers paired with a corpus are evaluated to surface a document set +that contains diverse perspectives. Our framing diverges from most retrieval +tasks in that document relevancy cannot be decided by simple string matches to +references. Instead, we build a language model based automatic evaluator that +decides whether each retrieved document contains a perspective. This allows us +to evaluate the performance of three different types of corpus (Wikipedia, web +snapshot, and corpus constructed on the fly with retrieved pages from the +search engine) paired with retrievers. Retrieving diverse documents remains +challenging, with the outputs from existing retrievers covering all +perspectives on only 33.74% of the examples. We further study the impact of +query expansion and diversity-focused reranking approaches and analyze +retriever sycophancy. Together, we lay the foundation for future studies in +retrieval diversity handling complex queries. + +
+
+
+
+
+ + ☆ Infer Human's Intentions Before Following Natural Language Instructions + + +
+ For AI agents to be helpful to humans, they should be able to follow natural +language instructions to complete everyday cooperative tasks in human +environments. However, real human instructions inherently possess ambiguity, +because the human speakers assume sufficient prior knowledge about their hidden +goals and intentions. Standard language grounding and planning methods fail to +address such ambiguities because they do not model human internal goals as +additional partially observable factors in the environment. We propose a new +framework, Follow Instructions with Social and Embodied Reasoning (FISER), +aiming for better natural language instruction following in collaborative +embodied tasks. Our framework makes explicit inferences about human goals and +intentions as intermediate reasoning steps. We implement a set of +Transformer-based models and evaluate them over a challenging benchmark, +HandMeThat. We empirically demonstrate that using social reasoning to +explicitly infer human intentions before making action plans surpasses purely +end-to-end approaches. We also compare our implementation with strong +baselines, including Chain of Thought prompting on the largest available +pre-trained language models, and find that FISER provides better performance on +the embodied social reasoning tasks under investigation, reaching the +state-of-the-art on HandMeThat. + +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Unveiling the Role of Pretraining in Direct Speech Translation EMNLP 2024 + + +
+ Direct speech-to-text translation systems encounter an important drawback in +data scarcity. A common solution consists on pretraining the encoder on +automatic speech recognition, hence losing efficiency in the training process. +In this study, we compare the training dynamics of a system using a pretrained +encoder, the conventional approach, and one trained from scratch. We observe +that, throughout the training, the randomly initialized model struggles to +incorporate information from the speech inputs for its predictions. Hence, we +hypothesize that this issue stems from the difficulty of effectively training +an encoder for direct speech translation. While a model trained from scratch +needs to learn acoustic and semantic modeling simultaneously, a pretrained one +can just focus on the latter. Based on these findings, we propose a subtle +change in the decoder cross-attention to integrate source information from +earlier steps in training. We show that with this change, the model trained +from scratch can achieve comparable performance to the pretrained one, while +reducing the training time. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ EMOVA: Empowering Language Models to See, Hear and Speak with Vivid + Emotions + + +
+ GPT-4o, an omni-modal model that enables vocal conversations with diverse +emotions and tones, marks a milestone for omni-modal foundation models. +However, empowering Large Language Models to perceive and generate images, +texts, and speeches end-to-end with publicly available data remains challenging +in the open-source community. Existing vision-language models rely on external +tools for the speech processing, while speech-language models still suffer from +limited or even without vision-understanding abilities. To address this gap, we +propose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large +Language Models with end-to-end speech capabilities while maintaining the +leading vision-language performance. With a semantic-acoustic disentangled +speech tokenizer, we notice surprisingly that omni-modal alignment can further +enhance vision-language and speech abilities compared with the corresponding +bi-modal aligned counterparts. Moreover, a lightweight style module is proposed +for flexible speech style controls (e.g., emotions and pitches). For the first +time, EMOVA achieves state-of-the-art performance on both the vision-language +and speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue +with vivid emotions. + +
+
+ comment: Project Page: https://emova-ollm.github.io/ +
+
+
+
+
+ + ☆ Automated Detection and Analysis of Power Words in Persuasive Text Using + Natural Language Processing + + +
+ Power words are terms that evoke strong emotional responses and significantly +influence readers' behavior, playing a crucial role in fields like marketing, +politics, and motivational writing. This study proposes a methodology for the +automated detection and analysis of power words in persuasive text using a +custom lexicon and the TextBlob library in Python. By identifying the presence +and frequency of power words within a given text, we aim to classify and +analyze their impact on sentiment and reader engagement. This research examines +diverse datasets across various domains to provide insights into the +effectiveness of power words, offering practical applications for content +creators, advertisers, and policymakers. + +
+
+
+
+
+ + ☆ Compositional Hardness of Code in Large Language Models -- A + Probabilistic Perspective + + +
+ A common practice in large language model (LLM) usage for complex analytical +tasks such as code generation, is to sample a solution for the entire task +within the model's context window. Previous works have shown that subtask +decomposition within the model's context (chain of thought), is beneficial for +solving such tasks. In this work, we point a limitation of LLMs' ability to +perform several sub-tasks within the same context window - an in-context +hardness of composition, pointing to an advantage for distributing a decomposed +problem in a multi-agent system of LLMs. The hardness of composition is +quantified by a generation complexity metric, i.e., the number of LLM +generations required to sample at least one correct solution. We find a gap +between the generation complexity of solving a compositional problem within the +same context relative to distributing it among multiple agents, that increases +exponentially with the solution's length. We prove our results theoretically +and demonstrate them empirically. + +
+
+
+
+
+ + ☆ An Adversarial Perspective on Machine Unlearning for AI Safety + + +
+ Large language models are finetuned to refuse questions about hazardous +knowledge, but these protections can often be bypassed. Unlearning methods aim +at completely removing hazardous capabilities from models and make them +inaccessible to adversaries. This work challenges the fundamental differences +between unlearning and traditional safety post-training from an adversarial +perspective. We demonstrate that existing jailbreak methods, previously +reported as ineffective against unlearning, can be successful when applied +carefully. Furthermore, we develop a variety of adaptive methods that recover +most supposedly unlearned capabilities. For instance, we show that finetuning +on 10 unrelated examples or removing specific directions in the activation +space can recover most hazardous capabilities for models edited with RMU, a +state-of-the-art unlearning method. Our findings challenge the robustness of +current unlearning approaches and question their advantages over safety +training. + +
+
+
+
+
+ + ☆ DARE: Diverse Visual Question Answering with Robustness Evaluation + + +
+ Vision Language Models (VLMs) extend remarkable capabilities of text-only +large language models and vision-only models, and are able to learn from and +process multi-modal vision-text input. While modern VLMs perform well on a +number of standard image classification and image-text matching tasks, they +still struggle with a number of crucial vision-language (VL) reasoning +abilities such as counting and spatial reasoning. Moreover, while they might be +very brittle to small variations in instructions and/or evaluation protocols, +existing benchmarks fail to evaluate their robustness (or rather the lack of +it). In order to couple challenging VL scenarios with comprehensive robustness +evaluation, we introduce DARE, Diverse Visual Question Answering with +Robustness Evaluation, a carefully created and curated multiple-choice VQA +benchmark. DARE evaluates VLM performance on five diverse categories and +includes four robustness-oriented evaluations based on the variations of: +prompts, the subsets of answer options, the output format and the number of +correct answers. Among a spectrum of other findings, we report that +state-of-the-art VLMs still struggle with questions in most categories and are +unable to consistently deliver their peak performance across the tested +robustness evaluations. The worst case performance across the subsets of +options is up to 34% below the performance in the standard case. The robustness +of the open-source VLMs such as LLaVA 1.6 and Idefics2 cannot match the +closed-source models such as GPT-4 and Gemini, but even the latter remain very +brittle to different variations. + +
+
+
+
+
+ + ☆ Multilingual Evaluation of Long Context Retrieval and Reasoning + + +
+ Recent large language models (LLMs) demonstrate impressive capabilities in +handling long contexts, some exhibiting near-perfect recall on synthetic +retrieval tasks. However, these evaluations have mainly focused on English text +and involved a single target sentence within lengthy contexts. Our work +investigates how LLM performance generalizes to multilingual settings with +multiple hidden target sentences. We comprehensively evaluate several +long-context LLMs on retrieval and reasoning tasks across five languages: +English, Vietnamese, Indonesian, Swahili, and Somali. These languages share the +Latin script but belong to distinct language families and resource levels. Our +analysis reveals a significant performance gap between languages. The +best-performing models such as Gemini-1.5 and GPT-4o, achieve around 96% +accuracy in English to around 36% in Somali with a single target sentence. +However, this accuracy drops to 40% in English and 0% in Somali when dealing +with three target sentences. Our findings highlight the challenges long-context +LLMs face when processing longer contexts, an increase in the number of target +sentences, or languages of lower resource levels. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Extracting Affect Aggregates from Longitudinal Social Media Data with + Temporal Adapters for Large Language Models + + +
+ This paper proposes temporally aligned Large Language Models (LLMs) as a tool +for longitudinal analysis of social media data. We fine-tune Temporal Adapters +for Llama 3 8B on full timelines from a panel of British Twitter users, and +extract longitudinal aggregates of emotions and attitudes with established +questionnaires. We validate our estimates against representative British survey +data and find strong positive, significant correlations for several collective +emotions. The obtained estimates are robust across multiple training seeds and +prompt formulations, and in line with collective emotions extracted using a +traditional classification model trained on labeled data. To the best of our +knowledge, this is the first work to extend the analysis of affect in LLMs to a +longitudinal setting through Temporal Adapters. Our work enables new approaches +towards the longitudinal analysis of social media data. + +
+
+ comment: Code available at https://github.com/dess-mannheim/temporal-adapters +
+
+
+
+
+ + ☆ BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and + Adaptive Disambiguate based Efficient Tree Search + + +
+ Large Language Models (LLMs) have exhibited exceptional performance across a +broad range of tasks and domains. However, they still encounter difficulties in +solving mathematical problems due to the rigorous and logical nature of +mathematics. Previous studies have employed techniques such as supervised +fine-tuning (SFT), prompt engineering, and search-based methods to improve the +mathematical problem-solving abilities of LLMs. Despite these efforts, their +performance remains suboptimal and demands substantial computational resources. +To address this issue, we propose a novel approach, BEATS, to enhance +mathematical problem-solving abilities. Our method leverages newly designed +prompts that guide the model to iteratively rewrite, advance by one step, and +generate answers based on previous steps. Additionally, we introduce a new +back-verification technique that uses LLMs to validate the correctness of the +generated answers. Furthermore, we employ a pruning tree search to optimize +search time while achieving strong performance. Notably, our method improves +Qwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the +MATH benchmark. + +
+
+
+
+
+ + ☆ The Hard Positive Truth about Vision-Language Compositionality ECCV 2024 + + +
+ Several benchmarks have concluded that our best vision-language models (e.g., +CLIP) are lacking in compositionality. Given an image, these benchmarks probe a +model's ability to identify its associated caption amongst a set of +compositional distractors. In response, a surge of recent proposals show +improvements by finetuning CLIP with distractors as hard negatives. Our +investigations reveal that these improvements have, in fact, been significantly +overstated -- because existing benchmarks do not probe whether finetuned +vision-language models remain invariant to hard positives. By curating an +evaluation dataset with 112,382 hard negatives and hard positives, we uncover +that including hard positives decreases CLIP's performance by 12.9%, while +humans perform effortlessly at 99%. CLIP finetuned with hard negatives results +in an even larger decrease, up to 38.7%. With this finding, we then produce a +1,775,259 image-text training set with both hard negative and hard positive +captions. By training with both, we see improvements on existing benchmarks +while simultaneously improving performance on hard positives, indicating a more +robust improvement in compositionality. Our work suggests the need for future +research to rigorously test and improve CLIP's understanding of semantic +relationships between related "positive" concepts. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Weak-To-Strong Backdoor Attacks for LLMs with Contrastive Knowledge + Distillation + + +
+ Despite being widely applied due to their exceptional capabilities, Large +Language Models (LLMs) have been proven to be vulnerable to backdoor attacks. +These attacks introduce targeted vulnerabilities into LLMs by poisoning +training samples and full-parameter fine-tuning. However, this kind of backdoor +attack is limited since they require significant computational resources, +especially as the size of LLMs increases. Besides, parameter-efficient +fine-tuning (PEFT) offers an alternative but the restricted parameter updating +may impede the alignment of triggers with target labels. In this study, we +first verify that backdoor attacks with PEFT may encounter challenges in +achieving feasible performance. To address these issues and improve the +effectiveness of backdoor attacks with PEFT, we propose a novel backdoor attack +algorithm from weak to strong based on contrastive knowledge distillation +(W2SAttack). Specifically, we poison small-scale language models through +full-parameter fine-tuning to serve as the teacher model. The teacher model +then covertly transfers the backdoor to the large-scale student model through +contrastive knowledge distillation, which employs PEFT. Theoretical analysis +reveals that W2SAttack has the potential to augment the effectiveness of +backdoor attacks. We demonstrate the superior performance of W2SAttack on +classification tasks across four language models, four backdoor attack +algorithms, and two different architectures of teacher models. Experimental +results indicate success rates close to 100% for backdoor attacks targeting +PEFT. + +
+
+
+
+
+ + ☆ On Translating Technical Terminology: A Translation Workflow for + Machine-Translated Acronyms + + +
+ The typical workflow for a professional translator to translate a document +from its source language (SL) to a target language (TL) is not always focused +on what many language models in natural language processing (NLP) do - predict +the next word in a series of words. While high-resource languages like English +and French are reported to achieve near human parity using common metrics for +measurement such as BLEU and COMET, we find that an important step is being +missed: the translation of technical terms, specifically acronyms. Some +state-of-the art machine translation systems like Google Translate which are +publicly available can be erroneous when dealing with acronyms - as much as 50% +in our findings. This article addresses acronym disambiguation for MT systems +by proposing an additional step to the SL-TL (FR-EN) translation workflow where +we first offer a new acronym corpus for public consumption and then experiment +with a search-based thresholding algorithm that achieves nearly 10% increase +when compared to Google Translate and OpusMT. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Predicting Anchored Text from Translation Memories for Machine + Translation Using Deep Learning Methods + + +
+ Translation memories (TMs) are the backbone for professional translation +tools called computer-aided translation (CAT) tools. In order to perform a +translation using a CAT tool, a translator uses the TM to gather translations +similar to the desired segment to translate (s'). Many CAT tools offer a +fuzzy-match algorithm to locate segments (s) in the TM that are close in +distance to s'. After locating two similar segments, the CAT tool will present +parallel segments (s, t) that contain one segment in the source language along +with its translation in the target language. Additionally, CAT tools contain +fuzzy-match repair (FMR) techniques that will automatically use the parallel +segments from the TM to create new TM entries containing a modified version of +the original with the idea in mind that it will be the translation of s'. Most +FMR techniques use machine translation as a way of "repairing" those words that +have to be modified. In this article, we show that for a large part of those +words which are anchored, we can use other techniques that are based on machine +learning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we +show that for anchored words that follow the continuous bag-of-words (CBOW) +paradigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for +some cases, better results than neural machine translation for translating +anchored words from French to English. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ The Lou Dataset -- Exploring the Impact of Gender-Fair Language in + German Text Classification + + +
+ Gender-fair language, an evolving German linguistic variation, fosters +inclusion by addressing all genders or using neutral forms. Nevertheless, there +is a significant lack of resources to assess the impact of this linguistic +shift on classification using language models (LMs), which are probably not +trained on such variations. To address this gap, we present Lou, the first +dataset featuring high-quality reformulations for German text classification +covering seven tasks, like stance detection and toxicity classification. +Evaluating 16 mono- and multi-lingual LMs on Lou shows that gender-fair +language substantially impacts predictions by flipping labels, reducing +certainty, and altering attention patterns. However, existing evaluations +remain valid, as LM rankings of original and reformulated instances do not +significantly differ. While we offer initial insights on the effect on German +text classification, the findings likely apply to other languages, as +consistent patterns were observed in multi-lingual and English LMs. + +
+
+
+
+
+ + ☆ Pioneering Reliable Assessment in Text-to-Image Knowledge Editing: + Leveraging a Fine-Grained Dataset and an Innovative Criterion EMNLP24 + + +
+ During pre-training, the Text-to-Image (T2I) diffusion models encode factual +knowledge into their parameters. These parameterized facts enable realistic +image generation, but they may become obsolete over time, thereby +misrepresenting the current state of the world. Knowledge editing techniques +aim to update model knowledge in a targeted way. However, facing the dual +challenges posed by inadequate editing datasets and unreliable evaluation +criterion, the development of T2I knowledge editing encounter difficulties in +effectively generalizing injected knowledge. In this work, we design a T2I +knowledge editing framework by comprehensively spanning on three phases: First, +we curate a dataset \textbf{CAKE}, comprising paraphrase and multi-object test, +to enable more fine-grained assessment on knowledge generalization. Second, we +propose a novel criterion, \textbf{adaptive CLIP threshold}, to effectively +filter out false successful images under the current criterion and achieve +reliable editing evaluation. Finally, we introduce \textbf{MPE}, a simple but +effective approach for T2I knowledge editing. Instead of tuning parameters, MPE +precisely recognizes and edits the outdated part of the conditioning +text-prompt to accommodate the up-to-date knowledge. A straightforward +implementation of MPE (Based on in-context learning) exhibits better overall +performance than previous model editors. We hope these efforts can further +promote faithful evaluation of T2I knowledge editing methods. + +
+
+ comment: EMNLP24 Findings +
+
+
+
+
+ + ☆ Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan + Arabic Dialect + + +
+ We introduce Atlas-Chat, the first-ever collection of large language models +specifically developed for dialectal Arabic. Focusing on Moroccan Arabic, also +known as Darija, we construct our instruction dataset by consolidating existing +Darija language resources, creating novel datasets both manually and +synthetically, and translating English instructions with stringent quality +control. Atlas-Chat-9B and 2B models, fine-tuned on the dataset, exhibit +superior ability in following Darija instructions and performing standard NLP +tasks. Notably, our models outperform both state-of-the-art and +Arabic-specialized LLMs like LLaMa, Jais, and AceGPT, e.g., achieving a 13% +performance boost over a larger 13B model on DarijaMMLU, in our newly +introduced evaluation suite for Darija covering both discriminative and +generative tasks. Furthermore, we perform an experimental analysis of various +fine-tuning strategies and base model choices to determine optimal +configurations. All our resources are publicly accessible, and we believe our +work offers comprehensive design methodologies of instruction-tuning for +low-resource language variants, which are often neglected in favor of data-rich +languages by contemporary LLMs. + +
+
+
+
+
+ + ☆ Revisiting Acoustic Similarity in Emotional Speech and Music via + Self-Supervised Representations + + +
+ Emotion recognition from speech and music shares similarities due to their +acoustic overlap, which has led to interest in transferring knowledge between +these domains. However, the shared acoustic cues between speech and music, +particularly those encoded by Self-Supervised Learning (SSL) models, remain +largely unexplored, given the fact that SSL models for speech and music have +rarely been applied in cross-domain research. In this work, we revisit the +acoustic similarity between emotion speech and music, starting with an analysis +of the layerwise behavior of SSL models for Speech Emotion Recognition (SER) +and Music Emotion Recognition (MER). Furthermore, we perform cross-domain +adaptation by comparing several approaches in a two-stage fine-tuning process, +examining effective ways to utilize music for SER and speech for MER. Lastly, +we explore the acoustic similarities between emotional speech and music using +Frechet audio distance for individual emotions, uncovering the issue of emotion +bias in both speech and music SSL models. Our findings reveal that while speech +and music SSL models do capture shared acoustic features, their behaviors can +vary depending on different emotions due to their training strategies and +domain-specificities. Additionally, parameter-efficient fine-tuning can enhance +SER and MER performance by leveraging knowledge from each other. This study +provides new insights into the acoustic similarity between emotional speech and +music, and highlights the potential for cross-domain generalization to improve +SER and MER systems. + +
+
+
+
+
+ + ☆ EMMA-500: Enhancing Massively Multilingual Adaptation of Large Language + Models + + +
+ In this work, we introduce EMMA-500, a large-scale multilingual language +model continue-trained on texts across 546 languages designed for enhanced +multilingual performance, focusing on improving language coverage for +low-resource languages. To facilitate continual pre-training, we compile the +MaLA corpus, a comprehensive multilingual dataset enriched with curated +datasets across diverse domains. Leveraging this corpus, we conduct extensive +continual pre-training of the Llama 2 7B model, resulting in EMMA-500, which +demonstrates robust performance across a wide collection of benchmarks, +including a comprehensive set of multilingual tasks and PolyWrite, an +open-ended generation benchmark developed in this study. Our results highlight +the effectiveness of continual pre-training in expanding large language models' +language capacity, particularly for underrepresented languages, demonstrating +significant gains in cross-lingual transfer, task generalization, and language +adaptability. + +
+
+
+
+
+ + ☆ Implementing a Nordic-Baltic Federated Health Data Network: a case + report + + +
+ Background: Centralized collection and processing of healthcare data across +national borders pose significant challenges, including privacy concerns, data +heterogeneity and legal barriers. To address some of these challenges, we +formed an interdisciplinary consortium to develop a feder-ated health data +network, comprised of six institutions across five countries, to facilitate +Nordic-Baltic cooperation on secondary use of health data. The objective of +this report is to offer early insights into our experiences developing this +network. Methods: We used a mixed-method ap-proach, combining both experimental +design and implementation science to evaluate the factors affecting the +implementation of our network. Results: Technically, our experiments indicate +that the network functions without significant performance degradation compared +to centralized simu-lation. Conclusion: While use of interdisciplinary +approaches holds a potential to solve challeng-es associated with establishing +such collaborative networks, our findings turn the spotlight on the uncertain +regulatory landscape playing catch up and the significant operational costs. + +
+
+ comment: 24 pages (including appendices), 1 figure +
+
+
+
+
+ + ☆ PEDRO: Parameter-Efficient Fine-tuning with Prompt DEpenDent + Representation MOdification + + +
+ Due to their substantial sizes, large language models (LLMs) are typically +deployed within a single-backbone multi-tenant framework. In this setup, a +single instance of an LLM backbone must cater to multiple users or tasks +through the application of various parameter-efficient fine-tuning (PEFT) +models. Despite the availability of numerous effective PEFT techniques such as +LoRA, there remains a need for a PEFT approach that achieves both high +efficiency during inference and competitive performance on downstream tasks. In +this research, we introduce a new and straightforward PEFT methodology named +\underline{P}rompt D\underline{E}pen\underline{D}ent \underline{R}epresentation +M\underline{O}dification (PEDRO). The proposed method involves integrating a +lightweight vector generator into each Transformer layer, which generates +vectors contingent upon the input prompts. These vectors then modify the hidden +representations created by the LLM through a dot product operation, thereby +influencing the semantic output and generated content of the model. Extensive +experimentation across a variety of tasks indicates that: (a) PEDRO surpasses +recent PEFT benchmarks when using a similar number of tunable parameters. (b) +Under the single-backbone multi-tenant deployment model, PEDRO exhibits +superior efficiency compared to LoRA, indicating significant industrial +potential. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2405.18203 +
+
+
+
+
+ + ☆ BeanCounter: A low-toxicity, large-scale, and open dataset of + business-oriented text + + +
+ Many of the recent breakthroughs in language modeling have resulted from +scaling effectively the same model architecture to larger datasets. In this +vein, recent work has highlighted performance gains from increasing training +dataset size and quality, suggesting a need for novel sources of large-scale +datasets. In this work, we introduce BeanCounter, a public dataset consisting +of more than 159B tokens extracted from businesses' disclosures. We show that +this data is indeed novel: less than 0.1% of BeanCounter appears in Common +Crawl-based datasets and it is an order of magnitude larger than datasets +relying on similar sources. Given the data's provenance, we hypothesize that +BeanCounter is comparatively more factual and less toxic than web-based +datasets. Exploring this hypothesis, we find that many demographic identities +occur with similar prevalence in BeanCounter but with significantly less toxic +context relative to other datasets. To demonstrate the utility of BeanCounter, +we evaluate and compare two LLMs continually pre-trained on BeanCounter with +their base models. We find an 18-33% reduction in toxic generation and improved +performance within the finance domain for the continually pretrained models. +Collectively, our work suggests that BeanCounter is a novel source of +low-toxicity and high-quality domain-specific data with sufficient scale to +train multi-billion parameter LLMs. + +
+
+
+
+
+ + ☆ Inference-Time Language Model Alignment via Integrated Value Guidance EMNLP 2024 + + +
+ Large language models are typically fine-tuned to align with human +preferences, but tuning large models is computationally intensive and complex. +In this work, we introduce $\textit{Integrated Value Guidance}$ (IVG), a method +that uses implicit and explicit value functions to guide language model +decoding at token and chunk-level respectively, efficiently aligning large +language models purely at inference time. This approach circumvents the +complexities of direct fine-tuning and outperforms traditional methods. +Empirically, we demonstrate the versatility of IVG across various tasks. In +controlled sentiment generation and summarization tasks, our method +significantly improves the alignment of large models using inference-time +guidance from $\texttt{gpt2}$-based value functions. Moreover, in a more +challenging instruction-following benchmark AlpacaEval 2.0, we show that both +specifically tuned and off-the-shelf value functions greatly improve the +length-controlled win rates of large models against $\texttt{gpt-4-turbo}$ +(e.g., $19.51\% \rightarrow 26.51\%$ for $\texttt{Mistral-7B-Instruct-v0.2}$ +and $25.58\% \rightarrow 33.75\%$ for $\texttt{Mixtral-8x7B-Instruct-v0.1}$ +with Tulu guidance). + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Self-supervised Preference Optimization: Enhance Your Language Model + with Preference Degree Awareness EMNLP 2024 + + +
+ Recently, there has been significant interest in replacing the reward model +in Reinforcement Learning with Human Feedback (RLHF) methods for Large Language +Models (LLMs), such as Direct Preference Optimization (DPO) and its variants. +These approaches commonly use a binary cross-entropy mechanism on pairwise +samples, i.e., minimizing and maximizing the loss based on preferred or +dis-preferred responses, respectively. However, while this training strategy +omits the reward model, it also overlooks the varying preference degrees within +different responses. We hypothesize that this is a key factor hindering LLMs +from sufficiently understanding human preferences. To address this problem, we +propose a novel Self-supervised Preference Optimization (SPO) framework, which +constructs a self-supervised preference degree loss combined with the alignment +loss, thereby helping LLMs improve their ability to understand the degree of +preference. Extensive experiments are conducted on two widely used datasets of +different tasks. The results demonstrate that SPO can be seamlessly integrated +with existing preference optimization methods and significantly boost their +performance to achieve state-of-the-art performance. We also conduct detailed +analyses to offer comprehensive insights into SPO, which verifies its +effectiveness. The code is available at https://github.com/lijian16/SPO. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Faithfulness and the Notion of Adversarial Sensitivity in NLP + Explanations EMNLP 2024 + + +
+ Faithfulness is arguably the most critical metric to assess the reliability +of explainable AI. In NLP, current methods for faithfulness evaluation are +fraught with discrepancies and biases, often failing to capture the true +reasoning of models. We introduce Adversarial Sensitivity as a novel approach +to faithfulness evaluation, focusing on the explainer's response when the model +is under adversarial attack. Our method accounts for the faithfulness of +explainers by capturing sensitivity to adversarial input changes. This work +addresses significant limitations in existing evaluation techniques, and +furthermore, quantifies faithfulness from a crucial yet underexplored paradigm. + +
+
+ comment: Accepted as a Full Paper at EMNLP 2024 Workshop BlackBoxNLP +
+
+
+
+
+ + ☆ Integrating Hierarchical Semantic into Iterative Generation Model for + Entailment Tree Explanation + + +
+ Manifestly and logically displaying the line of reasoning from evidence to +answer is significant to explainable question answering (QA). The entailment +tree exhibits the lines structurally, which is different from the +self-explanation principle in large-scale language models. Existing methods +rarely consider the semantic association of sentences between and within +hierarchies within the tree structure, which is prone to apparent mistakes in +combinations. In this work, we propose an architecture of integrating the +Hierarchical Semantics of sentences under the framework of Controller-Generator +(HiSCG) to explain answers. The HiSCG designs a hierarchical mapping between +hypotheses and facts, discriminates the facts involved in tree constructions, +and optimizes single-step entailments. To the best of our knowledge, We are the +first to notice hierarchical semantics of sentences between the same layer and +adjacent layers to yield improvements. The proposed method achieves comparable +performance on all three settings of the EntailmentBank dataset. The +generalization results on two out-of-domain datasets also demonstrate the +effectiveness of our method. + +
+
+
+
+
+ + ☆ SECURE: Semantics-aware Embodied Conversation under Unawareness for + Lifelong Robot Learning + + +
+ This paper addresses a challenging interactive task learning scenario we call +rearrangement under unawareness: to manipulate a rigid-body environment in a +context where the robot is unaware of a concept that's key to solving the +instructed task. We propose SECURE, an interactive task learning framework +designed to solve such problems by fixing a deficient domain model using +embodied conversation. Through dialogue, the robot discovers and then learns to +exploit unforeseen possibilities. Using SECURE, the robot not only learns from +the user's corrective feedback when it makes a mistake, but it also learns to +make strategic dialogue decisions for revealing useful evidence about novel +concepts for solving the instructed task. Together, these abilities allow the +robot to generalise to subsequent tasks using newly acquired knowledge. We +demonstrate that a robot that is semantics-aware -- that is, it exploits the +logical consequences of both sentence and discourse semantics in the learning +and inference process -- learns to solve rearrangement under unawareness more +effectively than a robot that lacks such capabilities. + +
+
+ comment: 10 pages,4 figures, 2 tables +
+
+
+
+
+ + ☆ Are Transformers in Pre-trained LM A Good ASR Encoder? An Empirical + Study + + +
+ In this study, we delve into the efficacy of transformers within pre-trained +language models (PLMs) when repurposed as encoders for Automatic Speech +Recognition (ASR). Our underlying hypothesis posits that, despite being +initially trained on text-based corpora, these transformers possess a +remarkable capacity to extract effective features from the input sequence. This +inherent capability, we argue, is transferrable to speech data, thereby +augmenting the acoustic modeling ability of ASR. Through rigorous empirical +analysis, our findings reveal a notable improvement in Character Error Rate +(CER) and Word Error Rate (WER) across diverse ASR tasks when transformers from +pre-trained LMs are incorporated. Particularly, they serve as an advantageous +starting point for initializing ASR encoders. Furthermore, we uncover that +these transformers, when integrated into a well-established ASR encoder, can +significantly boost performance, especially in scenarios where profound +semantic comprehension is pivotal. This underscores the potential of leveraging +the semantic prowess embedded within pre-trained transformers to advance ASR +systems' capabilities. + +
+
+ comment: 8pages +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models will be available soon +
+
+
+
+
+ + ☆ Zero- and Few-shot Named Entity Recognition and Text Expansion in + Medication Prescriptions using ChatGPT + + +
+ Introduction: Medication prescriptions are often in free text and include a +mix of two languages, local brand names, and a wide range of idiosyncratic +formats and abbreviations. Large language models (LLMs) have shown promising +ability to generate text in response to input prompts. We use ChatGPT 3.5 to +automatically structure and expand medication statements in discharge summaries +and thus make them easier to interpret for people and machines. Methods: +Named-entity Recognition (NER) and Text Expansion (EX) are used in a zero- and +few-shot setting with different prompt strategies. 100 medication statements +were manually annotated and curated. NER performance was measured by using +strict and partial matching. For the task EX, two experts interpreted the +results by assessing semantic equivalence between original and expanded +statements. The model performance was measured by precision, recall, and F1 +score. Results: For NER, the best-performing prompt reached an average F1 score +of 0.94 in the test set. For EX, the few-shot prompt showed superior +performance among other prompts, with an average F1 score of 0.87. Conclusion: +Our study demonstrates good performance for NER and EX tasks in free-text +medication statements using ChatGPT. Compared to a zero-shot baseline, a +few-shot approach prevented the system from hallucinating, which would be +unacceptable when processing safety-relevant medication data. + +
+
+
+
+
+ + ☆ Cross-lingual Human-Preference Alignment for Neural Machine Translation + with Direct Quality Optimization + + +
+ Reinforcement Learning from Human Feedback (RLHF) and derivative techniques +like Direct Preference Optimization (DPO) are task-alignment algorithms used to +repurpose general, foundational models for specific tasks. We show that +applying task-alignment to neural machine translation (NMT) addresses an +existing task--data mismatch in NMT, leading to improvements across all +languages of a multilingual model, even when task-alignment is only applied to +a subset of those languages. We do so by introducing Direct Quality +Optimization (DQO), a variant of DPO leveraging a pre-trained translation +quality estimation model as a proxy for human preferences, and verify the +improvements with both automatic metrics and human evaluation. + +
+
+ comment: 17 pages, 1 figure +
+
+
+
+
+ + ☆ Digital Twin Ecosystem for Oncology Clinical Operations + + +
+ Artificial Intelligence (AI) and Large Language Models (LLMs) hold +significant promise in revolutionizing healthcare, especially in clinical +applications. Simultaneously, Digital Twin technology, which models and +simulates complex systems, has gained traction in enhancing patient care. +However, despite the advances in experimental clinical settings, the potential +of AI and digital twins to streamline clinical operations remains largely +untapped. This paper introduces a novel digital twin framework specifically +designed to enhance oncology clinical operations. We propose the integration of +multiple specialized digital twins, such as the Medical Necessity Twin, Care +Navigator Twin, and Clinical History Twin, to enhance workflow efficiency and +personalize care for each patient based on their unique data. Furthermore, by +synthesizing multiple data sources and aligning them with the National +Comprehensive Cancer Network (NCCN) guidelines, we create a dynamic Cancer Care +Path, a continuously evolving knowledge base that enables these digital twins +to provide precise, tailored clinical recommendations. + +
+
+ comment: Pre Print +
+
+
+
+
+ + ☆ Efficient In-Domain Question Answering for Resource-Constrained + Environments + + +
+ Retrieval Augmented Generation (RAG) is a common method for integrating +external knowledge into pretrained Large Language Models (LLMs) to enhance +accuracy and relevancy in question answering (QA) tasks. However, prompt +engineering and resource efficiency remain significant bottlenecks in +developing optimal and robust RAG solutions for real-world QA applications. +Recent studies have shown success in using fine tuning to address these +problems; in particular, Retrieval Augmented Fine Tuning (RAFT) applied to +smaller 7B models has demonstrated superior performance compared to RAG setups +with much larger models such as GPT-3.5. The combination of RAFT with +parameter-efficient fine tuning (PEFT) techniques, such as Low-Rank Adaptation +(LoRA), promises an even more efficient solution, yet remains an unexplored +area. In this work, we combine RAFT with LoRA to reduce fine tuning and storage +requirements and gain faster inference times while maintaining comparable RAG +performance. This results in a more compute-efficient RAFT, or CRAFT, which is +particularly useful for knowledge-intensive QA tasks in resource-constrained +environments where internet access may be restricted and hardware resources +limited. + +
+
+ comment: 6 pages, 2 tables +
+
+
+
+
+ + ☆ T3: A Novel Zero-shot Transfer Learning Framework Iteratively Training + on an Assistant Task for a Target Task + + +
+ Long text summarization, gradually being essential for efficiently processing +large volumes of information, stays challenging for Large Language Models +(LLMs) such as GPT and LLaMA families because of the insufficient open-sourced +training datasets and the high requirement of contextual details dealing. To +address the issue, we design a novel zero-shot transfer learning framework, +abbreviated as T3, to iteratively training a baseline LLM on an assistant task +for the target task, where the former should own richer data resources and +share structural or semantic similarity with the latter. In practice, T3 is +approached to deal with the long text summarization task by utilizing question +answering as the assistant task, and further validated its effectiveness on the +BBC summary, NarraSum, FairytaleQA, and NLQuAD datasets, with up to nearly 14% +improvement in ROUGE, 35% improvement in BLEU, and 16% improvement in Factscore +compared to three baseline LLMs, demonstrating its potential for more +assistant-target task combinations. + +
+
+
+
+
+ + ☆ ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context + Information in Multi-Turn Multimodal Medical Dialogue + + +
+ The rocketing prosperity of large language models (LLMs) in recent years has +boosted the prevalence of vision-language models (VLMs) in the medical sector. +In our online medical consultation scenario, a doctor responds to the texts and +images provided by a patient in multiple rounds to diagnose her/his health +condition, forming a multi-turn multimodal medical dialogue format. Unlike +high-quality images captured by professional equipment in traditional medical +visual question answering (Med-VQA), the images in our case are taken by +patients' mobile phones. These images have poor quality control, with issues +such as excessive background elements and the lesion area being significantly +off-center, leading to degradation of vision-language alignment in the model +training phase. In this paper, we propose ZALM3, a Zero-shot strategy to +improve vision-language ALignment in Multi-turn Multimodal Medical dialogue. +Since we observe that the preceding text conversations before an image can +infer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to +summarize the keywords from the preceding context and a visual grounding model +to extract the RoIs. The updated images eliminate unnecessary background noise +and provide more effective vision-language alignment. To better evaluate our +proposed method, we design a new subjective assessment metric for multi-turn +unimodal/multimodal medical dialogue to provide a fine-grained performance +comparison. Our experiments across three different clinical departments +remarkably demonstrate the efficacy of ZALM3 with statistical significance. + +
+
+
+
+
+ + ☆ Deep CLAS: Deep Contextual Listen, Attend and Spell SC 2022 + + +
+ Contextual-LAS (CLAS) has been shown effective in improving Automatic Speech +Recognition (ASR) of rare words. It relies on phrase-level contextual modeling +and attention-based relevance scoring without explicit contextual constraint +which lead to insufficient use of contextual information. In this work, we +propose deep CLAS to use contextual information better. We introduce bias loss +forcing model to focus on contextual information. The query of bias attention +is also enriched to improve the accuracy of the bias attention score. To get +fine-grained contextual information, we replace phrase-level encoding with +character-level encoding and encode contextual information with conformer +rather than LSTM. Moreover, we directly use the bias attention score to correct +the output probability distribution of the model. Experiments using the public +AISHELL-1 and AISHELL-NER. On AISHELL-1, compared to CLAS baselines, deep CLAS +obtains a 65.78% relative recall and a 53.49% relative F1-score increase in the +named entity recognition scene. + +
+
+ comment: Accepted by NCMMSC 2022 +
+
+
+
+
+ + ☆ DualCoTs: Dual Chain-of-Thoughts Prompting for Sentiment Lexicon + Expansion of Idioms + + +
+ Idioms represent a ubiquitous vehicle for conveying sentiments in the realm +of everyday discourse, rendering the nuanced analysis of idiom sentiment +crucial for a comprehensive understanding of emotional expression within +real-world texts. Nevertheless, the existing corpora dedicated to idiom +sentiment analysis considerably limit research in text sentiment analysis. In +this paper, we propose an innovative approach to automatically expand the +sentiment lexicon for idioms, leveraging the capabilities of large language +models through the application of Chain-of-Thought prompting. To demonstrate +the effectiveness of this approach, we integrate multiple existing resources +and construct an emotional idiom lexicon expansion dataset (called EmoIdiomE), +which encompasses a comprehensive repository of Chinese and English idioms. +Then we designed the Dual Chain-of-Thoughts (DualCoTs) method, which combines +insights from linguistics and psycholinguistics, to demonstrate the +effectiveness of using large models to automatically expand the sentiment +lexicon for idioms. Experiments show that DualCoTs is effective in idioms +sentiment lexicon expansion in both Chinese and English. For reproducibility, +we will release the data and code upon acceptance. + +
+
+
+
+
+ + ☆ Leveraging Annotator Disagreement for Text Classification + + +
+ It is common practice in text classification to only use one majority label +for model training even if a dataset has been annotated by multiple annotators. +Doing so can remove valuable nuances and diverse perspectives inherent in the +annotators' assessments. This paper proposes and compares three different +strategies to leverage annotator disagreement for text classification: a +probability-based multi-label method, an ensemble system, and instruction +tuning. All three approaches are evaluated on the tasks of hate speech and +abusive conversation detection, which inherently entail a high degree of +subjectivity. Moreover, to evaluate the effectiveness of embracing annotation +disagreements for model training, we conduct an online survey that compares the +performance of the multi-label model against a baseline model, which is trained +with the majority label. + The results show that in hate speech detection, the multi-label method +outperforms the other two approaches, while in abusive conversation detection, +instruction tuning achieves the best performance. The results of the survey +also show that the outputs from the multi-label models are considered a better +representation of the texts than the single-label model. + +
+
+
+
+
+ + ☆ Modulated Intervention Preference Optimization (MIPO): Keey the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ☆ Logic-of-Thought: Injecting Logic into Contexts for Full Reasoning in + Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities across +various tasks but their performance in complex logical reasoning tasks remains +unsatisfactory. Although some prompting methods, such as Chain-of-Thought, can +improve the reasoning ability of LLMs to some extent, they suffer from an +unfaithful issue where derived conclusions may not align with the generated +reasoning chain. To address this issue, some studies employ the approach of +propositional logic to further enhance logical reasoning abilities of LLMs. +However, the potential omissions in the extraction of logical expressions in +these methods can cause information loss in the logical reasoning process, +thereby generating incorrect results. To this end, we propose Logic-of-Thought +(LoT) prompting which employs propositional logic to generate expanded logical +information from input context, and utilizes the generated logical information +as an additional augmentation to the input prompts, thereby enhancing the +capability of logical reasoning. The LoT is orthogonal to existing prompting +methods and can be seamlessly integrated with them. Extensive experiments +demonstrate that LoT boosts the performance of various prompting methods with a +striking margin across five logical reasoning tasks. In particular, the LoT +enhances Chain-of-Thought's performance on the ReClor dataset by +4.35%; +moreover, it improves Chain-of-Thought with Self-Consistency's performance on +LogiQA by +5%; additionally, it boosts performance of Tree-of-Thoughts on +ProofWriter dataset by +8%. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ On the Implicit Relation Between Low-Rank Adaptation and Differential + Privacy + + +
+ A significant approach in natural language processing involves large-scale +pre-training on general domain data followed by adaptation to specific tasks or +domains. As models grow in size, full fine-tuning all parameters becomes +increasingly impractical. To address this, some methods for low-rank task +adaptation of language models have been proposed, e.g. LoRA and FLoRA. These +methods keep the pre-trained model weights fixed and incorporate trainable +low-rank decomposition matrices into some layers of the transformer +architecture, called adapters. This approach significantly reduces the number +of trainable parameters required for downstream tasks compared to full +fine-tuning all parameters. In this work, we look at low-rank adaptation from +the lens of data privacy. We show theoretically that the low-rank adaptation +used in LoRA and FLoRA is equivalent to injecting some random noise into the +batch gradients w.r.t the adapter parameters coming from their full +fine-tuning, and we quantify the variance of the injected noise. By +establishing a Berry-Esseen type bound on the total variation distance between +the noise distribution and a Gaussian distribution with the same variance, we +show that the dynamics of LoRA and FLoRA are very close to differentially +private full fine-tuning the adapters, which suggests that low-rank adaptation +implicitly provides privacy w.r.t the fine-tuning data. Finally, using +Johnson-Lindenstrauss lemma, we show that when augmented with gradient +clipping, low-rank adaptation is almost equivalent to differentially private +full fine-tuning adapters with a fixed noise scale. + +
+
+
+
+
+ + ☆ MUSE: Integrating Multi-Knowledge for Knowledge Graph Completion + + +
+ Knowledge Graph Completion (KGC) aims to predict the missing [relation] part +of (head entity)--[relation]->(tail entity) triplet. Most existing KGC methods +focus on single features (e.g., relation types) or sub-graph aggregation. +However, they do not fully explore the Knowledge Graph (KG) features and +neglect the guidance of external semantic knowledge. To address these +shortcomings, we propose a knowledge-aware reasoning model (MUSE), which +designs a novel multi-knowledge representation learning mechanism for missing +relation prediction. Our model develops a tailored embedding space through +three parallel components: 1) Prior Knowledge Learning for enhancing the +triplets' semantic representation by fine-tuning BERT; 2) Context Message +Passing for enhancing the context messages of KG; 3) Relational Path +Aggregation for enhancing the path representation from the head entity to the +tail entity. The experimental results show that MUSE significantly outperforms +other baselines on four public datasets, achieving over 5.50% H@1 improvement +and 4.20% MRR improvement on the NELL995 dataset. The code and datasets will be +released via https://github.com/SUSTech-TP/ADMA2024-MUSE.git. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2408.05283 +
+
+
+
+
+ + ☆ Data Proportion Detection for Optimized Data Management for Large + Language Models + + +
+ Large language models (LLMs) have demonstrated exceptional performance across +a wide range of tasks and domains, with data preparation playing a critical +role in achieving these results. Pre-training data typically combines +information from multiple domains. To maximize performance when integrating +data from various domains, determining the optimal data proportion is +essential. However, state-of-the-art (SOTA) LLMs rarely disclose details about +their pre-training data, making it difficult for researchers to identify ideal +data proportions. In this paper, we introduce a new topic, \textit{data +proportion detection}, which enables the automatic estimation of pre-training +data proportions by analyzing the generated outputs of LLMs. We provide +rigorous theoretical proofs, practical algorithms, and preliminary experimental +results for data proportion detection. Based on these findings, we offer +valuable insights into the challenges and future directions for effective data +proportion detection and data management. + +
+
+
+
+
+ + ☆ When A Man Says He Is Pregnant: ERP Evidence for A Rational Account of + Speaker-contextualized Language Comprehension + + +
+ Spoken language is often, if not always, understood in a context that +includes the identities of speakers. For instance, we can easily make sense of +an utterance such as "I'm going to have a manicure this weekend" or "The first +time I got pregnant I had a hard time" when the utterance is spoken by a woman, +but it would be harder to understand when it is spoken by a man. Previous +event-related potential (ERP) studies have shown mixed results regarding the +neurophysiological responses to such speaker-mismatched utterances, with some +reporting an N400 effect and others a P600 effect. In an experiment involving +64 participants, we showed that these different ERP effects reflect distinct +cognitive processes employed to resolve the speaker-message mismatch. When +possible, the message is integrated with the speaker context to arrive at an +interpretation, as in the case of violations of social stereotypes (e.g., men +getting a manicure), resulting in an N400 effect. However, when such +integration is impossible due to violations of biological knowledge (e.g., men +getting pregnant), listeners engage in an error correction process to revise +either the perceived utterance or the speaker context, resulting in a P600 +effect. Additionally, we found that the social N400 effect decreased as a +function of the listener's personality trait of openness, while the biological +P600 effect remained robust. Our findings help to reconcile the empirical +inconsistencies in the literature and provide a rational account of +speaker-contextualized language comprehension. + +
+
+
+
+
+ + ☆ Comparing Unidirectional, Bidirectional, and Word2vec Models for + Discovering Vulnerabilities in Compiled Lifted Code + + +
+ Ransomware and other forms of malware cause significant financial and +operational damage to organizations by exploiting long-standing and often +difficult-to-detect software vulnerabilities. To detect vulnerabilities such as +buffer overflows in compiled code, this research investigates the application +of unidirectional transformer-based embeddings, specifically GPT-2. Using a +dataset of LLVM functions, we trained a GPT-2 model to generate embeddings, +which were subsequently used to build LSTM neural networks to differentiate +between vulnerable and non-vulnerable code. Our study reveals that embeddings +from the GPT-2 model significantly outperform those from bidirectional models +of BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%. +LSTM neural networks were developed with both frozen and unfrozen embedding +model layers. The model with the highest performance was achieved when the +embedding layers were unfrozen. Further, the research finds that, in exploring +the impact of different optimizers within this domain, the SGD optimizer +demonstrates superior performance over Adam. Overall, these findings reveal +important insights into the potential of unidirectional transformer-based +approaches in enhancing cybersecurity defenses. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ HaloScope: Harnessing Unlabeled LLM Generations for Hallucination + Detection NeurIPS 2024 + + +
+ The surge in applications of large language models (LLMs) has prompted +concerns about the generation of misleading or fabricated information, known as +hallucinations. Therefore, detecting hallucinations has become critical to +maintaining trust in LLM-generated content. A primary challenge in learning a +truthfulness classifier is the lack of a large amount of labeled truthful and +hallucinated data. To address the challenge, we introduce HaloScope, a novel +learning framework that leverages the unlabeled LLM generations in the wild for +hallucination detection. Such unlabeled data arises freely upon deploying LLMs +in the open world, and consists of both truthful and hallucinated information. +To harness the unlabeled data, we present an automated membership estimation +score for distinguishing between truthful and untruthful generations within +unlabeled mixture data, thereby enabling the training of a binary truthfulness +classifier on top. Importantly, our framework does not require extra data +collection and human annotations, offering strong flexibility and practicality +for real-world applications. Extensive experiments show that HaloScope can +achieve superior hallucination detection performance, outperforming the +competitive rivals by a significant margin. Code is available at +https://github.com/deeplearningwisc/haloscope. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ MaskLLM: Learnable Semi-Structured Sparsity for Large Language Models NeurIPS 2024 + + +
+ Large Language Models (LLMs) are distinguished by their massive parameter +counts, which typically result in significant redundancy. This work introduces +MaskLLM, a learnable pruning method that establishes Semi-structured (or +``N:M'') Sparsity in LLMs, aimed at reducing computational overhead during +inference. Instead of developing a new importance criterion, MaskLLM explicitly +models N:M patterns as a learnable distribution through Gumbel Softmax +sampling. This approach facilitates end-to-end training on large-scale datasets +and offers two notable advantages: 1) High-quality Masks - our method +effectively scales to large datasets and learns accurate masks; 2) +Transferability - the probabilistic modeling of mask distribution enables the +transfer learning of sparsity across domains or tasks. We assessed MaskLLM +using 2:4 sparsity on various LLMs, including LLaMA-2, Nemotron-4, and GPT-3, +with sizes ranging from 843M to 15B parameters, and our empirical results show +substantial improvements over state-of-the-art methods. For instance, leading +approaches achieve a perplexity (PPL) of 10 or greater on Wikitext compared to +the dense model's 5.12 PPL, but MaskLLM achieves a significantly lower 6.72 PPL +solely by learning the masks with frozen weights. Furthermore, MaskLLM's +learnable nature allows customized masks for lossless application of 2:4 +sparsity to downstream tasks or domains. Code is available at +\url{https://github.com/NVlabs/MaskLLM}. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Reducing and Exploiting Data Augmentation Noise through Meta Reweighting + Contrastive Learning for Text Classification + + +
+ Data augmentation has shown its effectiveness in resolving the data-hungry +problem and improving model's generalization ability. However, the quality of +augmented data can be varied, especially compared with the raw/original data. +To boost deep learning models' performance given augmented data/samples in text +classification tasks, we propose a novel framework, which leverages both meta +learning and contrastive learning techniques as parts of our design for +reweighting the augmented samples and refining their feature representations +based on their quality. As part of the framework, we propose novel +weight-dependent enqueue and dequeue algorithms to utilize augmented samples' +weight/quality information effectively. Through experiments, we show that our +framework can reasonably cooperate with existing deep learning models (e.g., +RoBERTa-base and Text-CNN) and augmentation techniques (e.g., Wordnet and +Easydata) for specific supervised learning tasks. Experiment results show that +our framework achieves an average of 1.6%, up to 4.3% absolute improvement on +Text-CNN encoders and an average of 1.4%, up to 4.4% absolute improvement on +RoBERTa-base encoders on seven GLUE benchmark datasets compared with the best +baseline. We present an indepth analysis of our framework design, revealing the +non-trivial contributions of our network components. Our code is publicly +available for better reproducibility. + +
+
+ comment: IEEE BigData 2021 +
+
+
+
+
+ + ☆ Autoregressive Multi-trait Essay Scoring via Reinforcement Learning with + Scoring-aware Multiple Rewards EMNLP 2024 + + +
+ Recent advances in automated essay scoring (AES) have shifted towards +evaluating multiple traits to provide enriched feedback. Like typical AES +systems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure +agreement with human raters, aligning closely with the rating schema; however, +its non-differentiable nature prevents its direct use in neural network +training. In this paper, we propose Scoring-aware Multi-reward Reinforcement +Learning (SaMRL), which integrates actual evaluation schemes into the training +process by designing QWK-based rewards with a mean-squared error penalty for +multi-trait AES. Existing reinforcement learning (RL) applications in AES are +limited to classification models despite associated performance degradation, as +RL requires probability distributions; instead, we adopt an autoregressive +score generation framework to leverage token generation probabilities for +robust multi-trait score predictions. Empirical analyses demonstrate that SaMRL +facilitates model training, notably enhancing scoring of previously inferior +prompts. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ What is the social benefit of hate speech detection research? A + Systematic Review + + +
+ While NLP research into hate speech detection has grown exponentially in the +last three decades, there has been minimal uptake or engagement from policy +makers and non-profit organisations. We argue the absence of ethical frameworks +have contributed to this rift between current practice and best practice. By +adopting appropriate ethical frameworks, NLP researchers may enable the social +impact potential of hate speech research. This position paper is informed by +reviewing forty-eight hate speech detection systems associated with +thirty-seven publications from different venues. + +
+
+ comment: Accepted to the 3rd Workshop on NLP for Positive Impact +
+
+
+
+
+ + ☆ RED QUEEN: Safeguarding Large Language Models against Concealed + Multi-Turn Jailbreaking + + +
+ The rapid progress of Large Language Models (LLMs) has opened up new +opportunities across various domains and applications; yet it also presents +challenges related to potential misuse. To mitigate such risks, red teaming has +been employed as a proactive security measure to probe language models for +harmful outputs via jailbreak attacks. However, current jailbreak attack +approaches are single-turn with explicit malicious queries that do not fully +capture the complexity of real-world interactions. In reality, users can engage +in multi-turn interactions with LLM-based chat assistants, allowing them to +conceal their true intentions in a more covert manner. To bridge this gap, we, +first, propose a new jailbreak approach, RED QUEEN ATTACK. This method +constructs a multi-turn scenario, concealing the malicious intent under the +guise of preventing harm. We craft 40 scenarios that vary in turns and select +14 harmful categories to generate 56k multi-turn attack data points. We conduct +comprehensive experiments on the RED QUEEN ATTACK with four representative LLM +families of different sizes. Our experiments reveal that all LLMs are +vulnerable to RED QUEEN ATTACK, reaching 87.62% attack success rate on GPT-4o +and 75.4% on Llama3-70B. Further analysis reveals that larger models are more +susceptible to the RED QUEEN ATTACK, with multi-turn structures and concealment +strategies contributing to its success. To prioritize safety, we introduce a +straightforward mitigation strategy called RED QUEEN GUARD, which aligns LLMs +to effectively counter adversarial attacks. This approach reduces the attack +success rate to below 1% while maintaining the model's performance across +standard benchmarks. Full implementation and dataset are publicly accessible at +https://github.com/kriti-hippo/red_queen. + +
+
+
+
+
+ + ☆ Navigating the Shortcut Maze: A Comprehensive Analysis of Shortcut + Learning in Text Classification by Language Models + + +
+ Language models (LMs), despite their advances, often depend on spurious +correlations, undermining their accuracy and generalizability. This study +addresses the overlooked impact of subtler, more complex shortcuts that +compromise model reliability beyond oversimplified shortcuts. We introduce a +comprehensive benchmark that categorizes shortcuts into occurrence, style, and +concept, aiming to explore the nuanced ways in which these shortcuts influence +the performance of LMs. Through extensive experiments across traditional LMs, +large language models, and state-of-the-art robust models, our research +systematically investigates models' resilience and susceptibilities to +sophisticated shortcuts. Our benchmark and code can be found at: +https://github.com/yuqing-zhou/shortcut-learning-in-text-classification. + +
+
+
+
+
+ + ☆ Description-based Controllable Text-to-Speech with Cross-Lingual Voice + Control ICASSP 2025 + + +
+ We propose a novel description-based controllable text-to-speech (TTS) method +with cross-lingual control capability. To address the lack of audio-description +paired data in the target language, we combine a TTS model trained on the +target language with a description control model trained on another language, +which maps input text descriptions to the conditional features of the TTS +model. These two models share disentangled timbre and style representations +based on self-supervised learning (SSL), allowing for disentangled voice +control, such as controlling speaking styles while retaining the original +timbre. Furthermore, because the SSL-based timbre and style representations are +language-agnostic, combining the TTS and description control models while +sharing the same embedding space effectively enables cross-lingual control of +voice characteristics. Experiments on English and Japanese TTS demonstrate that +our method achieves high naturalness and controllability for both languages, +even though no Japanese audio-description pairs are used. + +
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Enhancing Financial Sentiment Analysis with Expert-Designed Hint + + +
+ This paper investigates the role of expert-designed hint in enhancing +sentiment analysis on financial social media posts. We explore the capability +of large language models (LLMs) to empathize with writer perspectives and +analyze sentiments. Our findings reveal that expert-designed hint, i.e., +pointing out the importance of numbers, significantly improve performances +across various LLMs, particularly in cases requiring perspective-taking skills. +Further analysis on tweets containing different types of numerical data +demonstrates that the inclusion of expert-designed hint leads to notable +improvements in sentiment analysis performance, especially for tweets with +monetary-related numbers. Our findings contribute to the ongoing discussion on +the applicability of Theory of Mind in NLP and open new avenues for improving +sentiment analysis in financial domains through the strategic use of expert +knowledge. + +
+
+
+
+
+ + ♻ ☆ Is It Good Data for Multilingual Instruction Tuning or Just Bad + Multilingual Evaluation for Large Language Models? EMNLP 2024 + + +
+ Multilingual large language models are designed, claimed, and expected to +cater to speakers of varied languages. We hypothesise that the current +practices of fine-tuning and evaluating these models may not perfectly align +with this objective owing to a heavy reliance on translation, which cannot +cover language-specific knowledge but can introduce translation defects. It +remains unknown whether the nature of the instruction data has an impact on the +model output; conversely, it is questionable whether translated test sets can +capture such nuances. Due to the often coupled practices of using translated +data in both stages, such imperfections could have been overlooked. This work +investigates these issues using controlled native or translated data during the +instruction tuning and evaluation stages. We show that native or generation +benchmarks reveal a notable difference between native and translated +instruction data especially when model performance is high, whereas other types +of test sets cannot. The comparison between round-trip and single-pass +translations reflects the importance of knowledge from language-native +resources. Finally, we demonstrate that regularization is beneficial to +bridging this gap on structured but not generative tasks. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ KAG: Boosting LLMs in Professional Domains via Knowledge Augmented + Generation + + +
+ The recently developed retrieval-augmented generation (RAG) technology has +enabled the efficient construction of domain-specific applications. However, it +also has limitations, including the gap between vector similarity and the +relevance of knowledge reasoning, as well as insensitivity to knowledge logic, +such as numerical values, temporal relations, expert rules, and others, which +hinder the effectiveness of professional knowledge services. In this work, we +introduce a professional domain knowledge service framework called Knowledge +Augmented Generation (KAG). KAG is designed to address the aforementioned +challenges with the motivation of making full use of the advantages of +knowledge graph(KG) and vector retrieval, and to improve generation and +reasoning performance by bidirectionally enhancing large language models (LLMs) +and KGs through five key aspects: (1) LLM-friendly knowledge representation, +(2) mutual-indexing between knowledge graphs and original chunks, (3) +logical-form-guided hybrid reasoning engine, (4) knowledge alignment with +semantic reasoning, and (5) model capability enhancement for KAG. We compared +KAG with existing RAG methods in multihop question answering and found that it +significantly outperforms state-of-theart methods, achieving a relative +improvement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We +have successfully applied KAG to two professional knowledge Q&A tasks of Ant +Group, including E-Government Q&A and E-Health Q&A, achieving significant +improvement in professionalism compared to RAG methods. + +
+
+ comment: 33 pages +
+
+
+
+
+ + ♻ ☆ Recent Trends in Unsupervised Summarization + + +
+ Unsupervised summarization is a powerful technique that enables training +summarizing models without requiring labeled datasets. This survey covers +different recent techniques and models used for unsupervised summarization. We +cover extractive, abstractive, and hybrid models and strategies used to achieve +unsupervised summarization. While the main focus of this survey is on recent +research, we also cover some of the important previous research. We +additionally introduce a taxonomy, classifying different research based on +their approach to unsupervised training. Finally, we discuss the current +approaches and mention some datasets and evaluation methods. + +
+
+
+
+
+ + ♻ ☆ Language agents achieve superhuman synthesis of scientific knowledge + + +
+ Language models are known to hallucinate incorrect information, and it is +unclear if they are sufficiently accurate and reliable for use in scientific +research. We developed a rigorous human-AI comparison methodology to evaluate +language model agents on real-world literature search tasks covering +information retrieval, summarization, and contradiction detection tasks. We +show that PaperQA2, a frontier language model agent optimized for improved +factuality, matches or exceeds subject matter expert performance on three +realistic literature research tasks without any restrictions on humans (i.e., +full access to internet, search tools, and time). PaperQA2 writes cited, +Wikipedia-style summaries of scientific topics that are significantly more +accurate than existing, human-written Wikipedia articles. We also introduce a +hard benchmark for scientific literature research called LitQA2 that guided +design of PaperQA2, leading to it exceeding human performance. Finally, we +apply PaperQA2 to identify contradictions within the scientific literature, an +important scientific task that is challenging for humans. PaperQA2 identifies +2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of +which 70% are validated by human experts. These results demonstrate that +language model agents are now capable of exceeding domain experts across +meaningful tasks on scientific literature. + +
+
+
+
+
+ + ♻ ☆ Granularity is crucial when applying differential privacy to text: An + investigation for neural machine translation EMNLP + + +
+ Applying differential privacy (DP) by means of the DP-SGD algorithm to +protect individual data points during training is becoming increasingly popular +in NLP. However, the choice of granularity at which DP is applied is often +neglected. For example, neural machine translation (NMT) typically operates on +the sentence-level granularity. From the perspective of DP, this setup assumes +that each sentence belongs to a single person and any two sentences in the +training dataset are independent. This assumption is however violated in many +real-world NMT datasets, e.g., those including dialogues. For proper +application of DP we thus must shift from sentences to entire documents. In +this paper, we investigate NMT at both the sentence and document levels, +analyzing the privacy/utility trade-off for both scenarios, and evaluating the +risks of not using the appropriate privacy granularity in terms of leaking +personally identifiable information (PII). Our findings indicate that the +document-level NMT system is more resistant to membership inference attacks, +emphasizing the significance of using the appropriate granularity when working +with DP. + +
+
+ comment: Accepted at EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ Transformers, Contextualism, and Polysemy + + +
+ The transformer architecture, introduced by Vaswani et al. (2017), is at the +heart of the remarkable recent progress in the development of language models, +including widely-used chatbots such as Chat-GPT and Claude. In this paper, I +argue that we can extract from the way the transformer architecture works a +theory of the relationship between context and meaning. I call this the +transformer theory, and I argue that it is novel with regard to two related +philosophical debates: the contextualism debate regarding the extent of +context-sensitivity across natural language, and the polysemy debate regarding +how polysemy should be captured within an account of word meaning. + +
+
+
+
+
+ + ♻ ☆ Investigating OCR-Sensitive Neurons to Improve Entity Recognition in + Historical Documents + + +
+ This paper investigates the presence of OCR-sensitive neurons within the +Transformer architecture and their influence on named entity recognition (NER) +performance on historical documents. By analysing neuron activation patterns in +response to clean and noisy text inputs, we identify and then neutralise +OCR-sensitive neurons to improve model performance. Based on two open access +large language models (Llama2 and Mistral), experiments demonstrate the +existence of OCR-sensitive regions and show improvements in NER performance on +historical newspapers and classical commentaries, highlighting the potential of +targeted neuron modulation to improve models' performance on noisy text. + +
+
+
+
+
+ + ♻ ☆ AC4: Algebraic Computation Checker for Circuit Constraints in ZKPs + + +
+ Zero-knowledge proof (ZKP) systems have surged attention and held a +fundamental role in contemporary cryptography. Zero-knowledge succinct +non-interactive argument of knowledge (zk-SNARK) protocols dominate the ZKP +usage, implemented through arithmetic circuit programming paradigm. However, +underconstrained or overconstrained circuits may lead to bugs. The former +refers to circuits that lack the necessary constraints, resulting in unexpected +solutions and causing the verifier to accept a bogus witness, and the latter +refers to circuits that are constrained excessively, resulting in lacking +necessary solutions and causing the verifier to accept no witness. This paper +introduces a novel approach for pinpointing two distinct types of bugs in ZKP +circuits. The method involves encoding the arithmetic circuit constraints to +polynomial equation systems and solving them over finite fields by the computer +algebra system. The classification of verification results is refined, greatly +enhancing the expressive power of the system. A tool, AC4, is proposed to +represent the implementation of the method. Experiments show that AC4 +demonstrates a increase in the checked ratio, showing a 29% improvement over +Picus, a checker for Circom circuits, and a 10% improvement over +halo2-analyzer, a checker for halo2 circuits. Within a solvable range, the +checking time has also exhibited noticeable improvement, demonstrating a +magnitude increase compared to previous efforts. + +
+
+ comment: 24 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ GTSinger: A Global Multi-Technique Singing Corpus with Realistic Music + Scores for All Singing Tasks NeurIPS 2024 + + +
+ The scarcity of high-quality and multi-task singing datasets significantly +hinders the development of diverse controllable and personalized singing tasks, +as existing singing datasets suffer from low quality, limited diversity of +languages and singers, absence of multi-technique information and realistic +music scores, and poor task suitability. To tackle these problems, we present +GTSinger, a large global, multi-technique, free-to-use, high-quality singing +corpus with realistic music scores, designed for all singing tasks, along with +its benchmarks. Particularly, (1) we collect 80.59 hours of high-quality +singing voices, forming the largest recorded singing dataset; (2) 20 +professional singers across nine widely spoken languages offer diverse timbres +and styles; (3) we provide controlled comparison and phoneme-level annotations +of six commonly used singing techniques, helping technique modeling and +control; (4) GTSinger offers realistic music scores, assisting real-world +musical composition; (5) singing voices are accompanied by manual +phoneme-to-audio alignments, global style labels, and 16.16 hours of paired +speech for various singing tasks. Moreover, to facilitate the use of GTSinger, +we conduct four benchmark experiments: technique-controllable singing voice +synthesis, technique recognition, style transfer, and speech-to-singing +conversion. The corpus and demos can be found at http://gtsinger.github.io. We +provide the dataset and the code for processing data and conducting benchmarks +at https://huggingface.co/datasets/GTSinger/GTSinger and +https://github.com/GTSinger/GTSinger. + +
+
+ comment: Accepted by NeurIPS 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ EfficientRAG: Efficient Retriever for Multi-Hop Question Answering + + +
+ Retrieval-augmented generation (RAG) methods encounter difficulties when +addressing complex questions like multi-hop queries. While iterative retrieval +methods improve performance by gathering additional information, current +approaches often rely on multiple calls of large language models (LLMs). In +this paper, we introduce EfficientRAG, an efficient retriever for multi-hop +question answering. EfficientRAG iteratively generates new queries without the +need for LLM calls at each iteration and filters out irrelevant information. +Experimental results demonstrate that EfficientRAG surpasses existing RAG +methods on three open-domain multi-hop question-answering datasets. + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ ICON: Improving Inter-Report Consistency in Radiology Report Generation + via Lesion-aware Mixup Augmentation + + +
+ Previous research on radiology report generation has made significant +progress in terms of increasing the clinical accuracy of generated reports. In +this paper, we emphasize another crucial quality that it should possess, i.e., +inter-report consistency, which refers to the capability of generating +consistent reports for semantically equivalent radiographs. This quality is +even of greater significance than the overall report accuracy in terms of +ensuring the system's credibility, as a system prone to providing conflicting +results would severely erode users' trust. Regrettably, existing approaches +struggle to maintain inter-report consistency, exhibiting biases towards common +patterns and susceptibility to lesion variants. To address this issue, we +propose ICON, which improves the inter-report consistency of radiology report +generation. Aiming to enhance the system's ability to capture similarities in +semantically equivalent lesions, our approach first involves extracting lesions +from input images and examining their characteristics. Then, we introduce a +lesion-aware mixup technique to ensure that the representations of the +semantically equivalent lesions align with the same attributes, achieved +through a linear combination during the training phase. Extensive experiments +on three publicly available chest X-ray datasets verify the effectiveness of +our approach, both in terms of improving the consistency and accuracy of the +generated reports. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient + Language Model Inference EMNLP 2024 + + +
+ The development of state-of-the-art generative large language models (LLMs) +disproportionately relies on English-centric tokenizers, vocabulary and +pre-training data. Despite the fact that some LLMs have multilingual +capabilities, recent studies have shown that their inference efficiency +deteriorates when generating text in languages other than English. This results +in increased inference time and costs. Cross-lingual vocabulary adaptation +(CVA) methods have been proposed for adapting models to a target language +aiming to improve downstream performance. However, the effectiveness of these +methods on increasing inference efficiency of generative LLMs has yet to be +explored. In this paper, we perform an empirical study of five CVA methods on +four generative LLMs (including monolingual and multilingual models) across +four typologically-diverse languages and four natural language understanding +tasks. We find that CVA substantially contributes to LLM inference speedups of +up to 271.5\%. We also show that adapting LLMs that have been pre-trained on +more balanced multilingual data results in downstream performance comparable to +the original models. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Abstraction-of-Thought Makes Language Models Better Reasoners EMNLP 2024 + + +
+ Abstract reasoning, the ability to reason from the abstract essence of a +problem, serves as a key to generalization in human reasoning. However, +eliciting language models to perform reasoning with abstraction remains +unexplored. This paper seeks to bridge this gap by introducing a novel +structured reasoning format called Abstraction-of-Thought (AoT). The uniqueness +of AoT lies in its explicit requirement for varying levels of abstraction +within the reasoning process. This approach could elicit language models to +first contemplate on the abstract level before incorporating concrete details, +which is overlooked by the prevailing step-by-step Chain-of-Thought (CoT) +method. To align models with the AoT format, we present AoT Collection, a +generic finetuning dataset consisting of 348k high-quality samples with AoT +reasoning processes, collected via an automated and scalable pipeline. We +finetune a wide range of language models with AoT Collection and conduct +extensive evaluations on 23 unseen tasks from the challenging benchmark +Big-Bench Hard. Experimental results indicate that models aligned to AoT +reasoning format substantially outperform those aligned to CoT in many +reasoning tasks. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ DAPE: Data-Adaptive Positional Encoding for Length Extrapolation NeurIPS 2024 + + +
+ Positional encoding plays a crucial role in transformers, significantly +impacting model performance and length generalization. Prior research has +introduced absolute positional encoding (APE) and relative positional encoding +(RPE) to distinguish token positions in given sequences. However, both APE and +RPE remain fixed after model training regardless of input data, limiting their +adaptability and flexibility. Hence, we expect that the desired positional +encoding should be data-adaptive and can be dynamically adjusted with the given +attention. In this paper, we propose a Data-Adaptive Positional Encoding (DAPE) +method, which dynamically and semantically adjusts based on input context and +learned fixed priors. Experimental validation on real-world datasets (Arxiv, +Books3, and CHE) demonstrates that DAPE enhances model performances in terms of +trained length and length generalization, where the improvements are +statistically significant. The model visualization suggests that our model can +keep both local and anti-local information. Finally, we successfully train the +model on sequence length 128 and achieve better performance at evaluation +sequence length 8192, compared with other static positional encoding methods, +revealing the benefit of the adaptive positional encoding method. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ On the Design and Analysis of LLM-Based Algorithms + + +
+ We initiate a formal investigation into the design and analysis of LLM-based +algorithms, i.e. algorithms that contain one or multiple calls of large +language models (LLMs) as sub-routines and critically rely on the capabilities +of LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt +engineering to complicated LLM-powered agent systems and compound AI systems, +have achieved remarkable empirical success, the design and optimization of them +have mostly relied on heuristics and trial-and-errors, which is largely due to +a lack of formal and analytical study for these algorithms. To fill this gap, +we start by identifying the computational-graph representation of LLM-based +algorithms, the design principle of task decomposition, and some key +abstractions, which then facilitate our formal analysis for the accuracy and +efficiency of LLM-based algorithms, despite the black-box nature of LLMs. +Through extensive analytical and empirical investigation in a series of case +studies, we demonstrate that the proposed framework is broadly applicable to a +wide range of scenarios and diverse patterns of LLM-based algorithms, such as +parallel, hierarchical and recursive task decomposition. Our proposed framework +holds promise for advancing LLM-based algorithms, by revealing the reasons +behind curious empirical phenomena, guiding the choices of hyperparameters, +predicting the empirical performance of algorithms, and inspiring new algorithm +design. To promote further study of LLM-based algorithms, we release our source +code at +https://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm. + +
+
+
+
+
+ + ♻ ☆ Fine Tuning vs. Retrieval Augmented Generation for Less Popular + Knowledge + + +
+ Language Models (LMs) memorize a vast amount of factual knowledge, exhibiting +strong performance across diverse tasks and domains. However, it has been +observed that the performance diminishes when dealing with less-popular or +low-frequency concepts and entities, for example in domain specific +applications. The two prominent approaches to enhance the performance of LMs on +low-frequent topics are: Retrieval Augmented Generation (RAG) and fine-tuning +(FT) over synthetic data. This paper explores and evaluates the impact of RAG +and FT on customizing LMs in handling low-frequency entities on question +answering tasks. We conduct extensive experiments on twelve LMs of varying size +and type and different fine tuning, data augmentation, and retrieval models. +Our findings indicate that while FT boosts the performance across entities of +varying popularity, RAG surpasses FT by a large margin particularly for least +popular factual knowledge. Additionally, the success of both RAG and FT +approaches is amplified by improving retrieval and data augmentation +techniques. Fine tuning, while beneficial for small LMs, requires extensive +resources. To address this issue, we propose the new Stimulus RAG approach that +surpasses the effectiveness of fine tuning based approaches, thereby +eliminating the need for the costly data augmentation and fine tuning step for +enriching LMs with less popular factual knowledge. + +
+
+
+
+
+ + ♻ ☆ J2N -- Nominal Adjective Identification and its Application + + +
+ This paper explores the challenges posed by nominal adjectives (NAs) in +natural language processing (NLP) tasks, particularly in part-of-speech (POS) +tagging. We propose treating NAs as a distinct POS tag, "JN," and investigate +its impact on POS tagging, BIO chunking, and coreference resolution. Our study +shows that reclassifying NAs can improve the accuracy of syntactic analysis and +structural understanding in NLP. We present experimental results using Hidden +Markov Models (HMMs), Maximum Entropy (MaxEnt) models, and Spacy, demonstrating +the feasibility and potential benefits of this approach. Additionally we +trained a bert model to identify the NA in untagged text. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ NumeroLogic: Number Encoding for Enhanced LLMs' Numerical Reasoning + + +
+ Language models struggle with handling numerical data and performing +arithmetic operations. We hypothesize that this limitation can be partially +attributed to non-intuitive textual numbers representation. When a digit is +read or generated by a causal language model it does not know its place value +(e.g. thousands vs. hundreds) until the entire number is processed. To address +this issue, we propose a simple adjustment to how numbers are represented by +including the count of digits before each number. For instance, instead of +"42", we suggest using "{2:42}" as the new format. This approach, which we term +NumeroLogic, offers an added advantage in number generation by serving as a +Chain of Thought (CoT). By requiring the model to consider the number of digits +first, it enhances the reasoning process before generating the actual number. +We use arithmetic tasks to demonstrate the effectiveness of the NumeroLogic +formatting. We further demonstrate NumeroLogic applicability to general natural +language modeling, improving language understanding performance in the MMLU +benchmark. + +
+
+
+
+
+ + ♻ ☆ Leveraging summary of radiology reports with transformers + + +
+ Two fundamental problems in health-care stem from patient handoff and triage. +Doctors are often required to perform complex findings summarization to +facilitate efficient communication with specialists and decision making on the +urgency of each case. To address these challenges, we present a state of the +art radiology report summarization model utilizing adjusted bidirectional +encoder representation from transformers BERTtoBERT encoder and decoder +architecture. We also provide a data processing pipeline for future models +developed on the the MIMIC CXR dataset. Our approach includes a novel method +for augmenting medical data and a comprehensive performance analysis. Our best +performing model achieved a recall oriented understudy for gisting evaluation L +F1 score of 58.75/100, outperforming specialized checkpoints with more +sophisticated attention mechanisms. We also provide a data processing pipeline +for future models developed on the MIMIC chest X-ray dataset. The model +introduced in this paper demonstrates significantly improved capacity in +radiology report summarization, highlighting the potential for ensuring better +clinical workflows and enhanced patient care. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ QRMeM: Unleash the Length Limitation through Question then Reflection + Memory Mechanism EMNLP 2024 + + +
+ While large language models (LLMs) have made notable advancements in natural +language processing, they continue to struggle with processing extensive text. +Memory mechanism offers a flexible solution for managing long contexts, +utilizing techniques such as compression, summarization, and structuring to +facilitate nuanced and efficient handling of large volumes of text. However, +existing techniques face challenges with static knowledge integration, leading +to insufficient adaptation to task-specific needs and missing +multi-segmentation relationships, which hinders the dynamic reorganization and +logical combination of relevant segments during the response process. To +address these issues, we introduce a novel strategy, Question then Reflection +Memory Mechanism (QRMeM), incorporating a dual-structured memory pool. This +pool synergizes static textual content with structured graph guidance, +fostering a reflective trial-and-error approach for navigating and identifying +relevant segments. Our evaluation across multiple-choice questions (MCQ) and +multi-document question answering (Multi-doc QA) benchmarks showcases QRMeM +enhanced performance compared to existing approaches. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ MMCode: Benchmarking Multimodal Large Language Models for Code + Generation with Visually Rich Programming Problems EMNLP 2024 + + +
+ Programming often involves converting detailed and complex specifications +into code, a process during which developers typically utilize visual aids to +more effectively convey concepts. While recent developments in Large Multimodal +Models have demonstrated remarkable abilities in visual reasoning and +mathematical tasks, there is little work on investigating whether these models +can effectively interpret visual elements for code generation. To this end, we +present MMCode, the first multi-modal coding dataset for evaluating algorithmic +problem-solving skills in visually rich contexts. MMCode contains 3,548 +questions and 6,620 images collected from real-world programming challenges +harvested from 10 code competition websites, presenting significant challenges +due to the extreme demand for reasoning abilities. Our experiment results show +that current state-of-the-art models struggle to solve these problems. The +results highlight the lack of powerful vision-code models, and we hope MMCode +can serve as an inspiration for future works in this domain. The data and code +are publicly available at https://github.com/likaixin2000/MMCode. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Explanation Regularisation through the Lens of Attributions + + +
+ Explanation regularisation (ER) has been introduced as a way to guide text +classifiers to form their predictions relying on input tokens that humans +consider plausible. This is achieved by introducing an auxiliary explanation +loss that measures how well the output of an input attribution technique for +the model agrees with human-annotated rationales. The guidance appears to +benefit performance in out-of-domain (OOD) settings, presumably due to an +increased reliance on "plausible" tokens. However, previous work has +under-explored the impact of guidance on that reliance, particularly when +reliance is measured using attribution techniques different from those used to +guide the model. In this work, we seek to close this gap, and also explore the +relationship between reliance on plausible features and OOD performance. We +find that the connection between ER and the ability of a classifier to rely on +plausible features has been overstated and that a stronger reliance on +plausible tokens does not seem to be the cause for OOD improvements. + +
+
+ comment: 22 pages, 14 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ AutoScraper: A Progressive Understanding Web Agent for Web Scraper + Generation EMNLP 2024 + + +
+ Web scraping is a powerful technique that extracts data from websites, +enabling automated data collection, enhancing data analysis capabilities, and +minimizing manual data entry efforts. Existing methods, wrappers-based methods +suffer from limited adaptability and scalability when faced with a new website, +while language agents, empowered by large language models (LLMs), exhibit poor +reusability in diverse web environments. In this work, we introduce the +paradigm of generating web scrapers with LLMs and propose AutoScraper, a +two-stage framework that can handle diverse and changing web environments more +efficiently. AutoScraper leverages the hierarchical structure of HTML and +similarity across different web pages for generating web scrapers. Besides, we +propose a new executability metric for better measuring the performance of web +scraper generation tasks. We conduct comprehensive experiments with multiple +LLMs and demonstrate the effectiveness of our framework. Resources of this +paper can be found at \url{https://github.com/EZ-hwh/AutoScraper} + +
+
+ comment: 19 pages, 4 figures, 18 tables. Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Can Large Language Models Faithfully Express Their Intrinsic Uncertainty + in Words? EMNLP 2024 + + +
+ We posit that large language models (LLMs) should be capable of expressing +their intrinsic uncertainty in natural language. For example, if the LLM is +equally likely to output two contradicting answers to the same question, then +its generated response should reflect this uncertainty by hedging its answer +(e.g., "I'm not sure, but I think..."). We formalize faithful response +uncertainty based on the gap between the model's intrinsic confidence in the +assertions it makes and the decisiveness by which they are conveyed. This +example-level metric reliably indicates whether the model reflects its +uncertainty, as it penalizes both excessive and insufficient hedging. We +evaluate a variety of aligned LLMs at faithfully communicating uncertainty on +several knowledge-intensive question answering tasks. Our results provide +strong evidence that modern LLMs are poor at faithfully conveying their +uncertainty, and that better alignment is necessary to improve their +trustworthiness. + +
+
+ comment: To appear in EMNLP 2024 (main conference) +
+
+
+
+
+ + ♻ ☆ Soda-Eval: Open-Domain Dialogue Evaluation in the age of LLMs EMNLP2024 + + +
+ Although human evaluation remains the gold standard for open-domain dialogue +evaluation, the growing popularity of automated evaluation using Large Language +Models (LLMs) has also extended to dialogue. However, most frameworks leverage +benchmarks that assess older chatbots on aspects such as fluency and relevance, +which are not reflective of the challenges associated with contemporary models. +In fact, a qualitative analysis on Soda, a GPT-3.5 generated dialogue dataset, +suggests that current chatbots may exhibit several recurring issues related to +coherence and commonsense knowledge, but generally produce highly fluent and +relevant responses. + Noting the aforementioned limitations, this paper introduces Soda-Eval, an +annotated dataset based on Soda that covers over 120K turn-level assessments +across 10K dialogues, where the annotations were generated by GPT-4. Using +Soda-Eval as a benchmark, we then study the performance of several open-access +instruction-tuned LLMs, finding that dialogue evaluation remains challenging. +Fine-tuning these models improves performance over few-shot inferences, both in +terms of correlation and explanation. + +
+
+ comment: Accepted to EMNLP2024 (findings) +
+
+
+
+
+ + ♻ ☆ Neuro-Symbolic Integration Brings Causal and Reliable Reasoning Proofs + + +
+ Two lines of approaches are adopted for complex reasoning with LLMs. One line +of work prompts LLMs with various reasoning structures, while the structural +outputs can be naturally regarded as intermediate reasoning steps. Another line +of work adopt LLM-free declarative solvers to do the reasoning task, rendering +higher reasoning accuracy but lacking interpretability due to the black-box +nature of the solvers. Aiming to resolve the trade-off between answer accuracy +and interpretability, we present a simple extension to the latter line of work. +Specifically, we showcase that the intermediate search logs generated by Prolog +interpreters can be accessed and interpreted into human-readable reasoning +proofs. As long as LLMs correctly translate problem descriptions into Prolog +representations, the corresponding reasoning proofs are ensured to be causal +and reliable. On two logical reasoning and one arithmetic reasoning datasets, +our framework obtains significant improvements in terms of both answer accuracy +and reasoning proof accuracy. Our code is released at +https://github.com/DAMO-NLP-SG/CaRing + +
+
+
+
+
+ + ♻ ☆ Archon: An Architecture Search Framework for Inference-Time Techniques + + +
+ Inference-time techniques are emerging as highly effective tools to increase +large language model (LLM) capabilities. However, there is still limited +understanding of the best practices for developing systems that combine +inference-time techniques with one or more LLMs, with challenges including: (1) +effectively allocating inference compute budget, (2) understanding the +interactions between different combinations of inference-time techniques and +their impact on downstream performance, and 3) efficiently searching over the +large space of model choices, inference-time techniques, and their +compositions. To address these challenges, we introduce Archon, an automated +framework for designing inference-time architectures. Archon defines an +extensible design space, encompassing methods such as generation ensembling, +multi-sampling, ranking, fusion, critiquing, verification, and unit testing. It +then transforms the problem of selecting and combining LLMs and inference-time +techniques into a hyperparameter optimization objective. To optimize this +objective, we introduce automated Inference-Time Architecture Search (ITAS) +algorithms. Given target benchmark(s), an inference compute budget, and +available LLMs, ITAS outputs optimized architectures. We evaluate Archon +architectures across a wide range of instruction-following and reasoning +benchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval, +MixEval Hard, MATH, and CodeContests. We show that automatically designed +inference-time architectures by Archon outperform strong models such as GPT-4o +and Claude 3.5 Sonnet on these benchmarks, achieving an average increase of +15.1 and 11.2 percentage points with all-source models and open-source models, +respectively. We make our code and datasets available publicly on Github: +https://github.com/ScalingIntelligence/Archon. + +
+
+
+
+
+ + ♻ ☆ Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs + + +
+ Training large language models (LLMs) for external tool usage is a rapidly +expanding field, with recent research focusing on generating synthetic data to +address the shortage of available data. However, the absence of systematic data +quality checks poses complications for properly training and testing models. To +that end, we propose two approaches for assessing the reliability of data for +training LLMs to use external tools. The first approach uses intuitive, +human-defined correctness criteria. The second approach uses a model-driven +assessment with in-context evaluation. We conduct a thorough evaluation of data +quality on two popular benchmarks, followed by an extrinsic evaluation that +showcases the impact of data quality on model performance. Our results +demonstrate that models trained on high-quality data outperform those trained +on unvalidated data, even when trained with a smaller quantity of data. These +findings empirically support the significance of assessing and ensuring the +reliability of training data for tool-using LLMs. + +
+
+
+
+
+ + ♻ ☆ Unused information in token probability distribution of generative LLM: + improving LLM reading comprehension through calculation of expected values + + +
+ LLM text decoding is key component for perceived LLM quality. We demonstrate +two experiments showing that decoding methods could be improved by manipulation +of token probabilities. First, we test few LLM on SummEval summary scoring +dataset, to measure reading comprehension. We compare scores from greedy +decoding to expected values over the next token distribution. We scale logits +by large temperature to increase the entropy of scores. This allows strong +improvement of performance on SummEval (in terms of correlations to human +judgement). We see improvement from 6-8% to 13-28% for 7B Mistral and from +20%-46% to 37%-56% for Mixtral, beating GPT 4 0314 result on two metrics. Part +of the gain seems related to positional bias. Secondly, we use +probability-based tree sampling algorithm, to examine all most probable +generations for given prompt. + +
+
+ comment: 7 pages, 1 figure, presented at FEDCSIS 2024 conference, +
+
+
+
+
+ + ♻ ☆ How does Architecture Influence the Base Capabilities of Pre-trained + Language Models? A Case Study Based on FFN-Wider and MoE Transformers + + +
+ Pre-trained language models have been proven to possess strong base +capabilities, which not only excel in in-distribution language modeling but +also show powerful abilities in out-of-distribution language modeling, transfer +learning and few-shot learning. Unlike existing work focusing on the influence +of scale on base capabilities, our work examines the influence of architecture +on those. Specifically, our concern is: How does architecture influence the +base capabilities of pre-trained language models? In this work, we attempt to +explain and reverse the decline in base capabilities caused by the architecture +of FFN-Wider Transformers, seeking to provide some insights. Through analysis, +we found the contribution ratio of Multi-Head Attention (a combination +function) to pre-trained language modeling is a key factor affecting base +capabilities. FFN-Wider Transformers reduce the contribution ratio of this +combination function, leading to a decline in base capabilities. We confirmed +this by experiments and proposed Combination Enhanced Architecture (CEA) to +address the decline in base capabilities of such models. Significantly, we +extended our explanation and CEA to Mixture of Experts (MoE) Transformers. We +successfully achieved significant improvements in base capabilities on a 14B +parameter MoE model, demonstrating the practical application value of our work. +This also indicates that our analysis has a certain guiding significance for +architecture analysis, architecture improvement and architecture design. + +
+
+
+
+
+ + ♻ ☆ CHIQ: Contextual History Enhancement for Improving Query Rewriting in + Conversational Search EMNLP 2024 + + +
+ In this paper, we study how open-source large language models (LLMs) can be +effectively deployed for improving query rewriting in conversational search, +especially for ambiguous queries. We introduce CHIQ, a two-step method that +leverages the capabilities of LLMs to resolve ambiguities in the conversation +history before query rewriting. This approach contrasts with prior studies that +predominantly use closed-source LLMs to directly generate search queries from +conversation history. We demonstrate on five well-established benchmarks that +CHIQ leads to state-of-the-art results across most settings, showing highly +competitive performances with systems leveraging closed-source LLMs. Our study +provides a first step towards leveraging open-source LLMs in conversational +search, as a competitive alternative to the prevailing reliance on commercial +LLMs. Data, models, and source code will be publicly available upon acceptance +at https://github.com/fengranMark/CHIQ. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ MPCODER: Multi-user Personalized Code Generator with Explicit and + Implicit Style Representation Learning ACL 2024 + + +
+ Large Language Models (LLMs) have demonstrated great potential for assisting +developers in their daily development. However, most research focuses on +generating correct code, how to use LLMs to generate personalized code has +seldom been investigated. To bridge this gap, we proposed MPCoder (Multi-user +Personalized Code Generator) to generate personalized code for multiple users. +To better learn coding style features, we utilize explicit coding style +residual learning to capture the syntax code style standards and implicit style +learning to capture the semantic code style conventions. We train a multi-user +style adapter to better differentiate the implicit feature representations of +different users through contrastive learning, ultimately enabling personalized +code generation for multiple users. We further propose a novel evaluation +metric for estimating similarities between codes of different coding styles. +The experimental results show the effectiveness of our approach for this novel +task. + +
+
+ comment: Accepted by ACL 2024, Main Conference +
+
+
+
+
+ + ♻ ☆ TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and + Multi-Level Style Control EMNLP 2024 + + +
+ Zero-shot singing voice synthesis (SVS) with style transfer and style control +aims to generate high-quality singing voices with unseen timbres and styles +(including singing method, emotion, rhythm, technique, and pronunciation) from +audio and text prompts. However, the multifaceted nature of singing styles +poses a significant challenge for effective modeling, transfer, and control. +Furthermore, current SVS models often fail to generate singing voices rich in +stylistic nuances for unseen singers. To address these challenges, we introduce +TCSinger, the first zero-shot SVS model for style transfer across cross-lingual +speech and singing styles, along with multi-level style control. Specifically, +TCSinger proposes three primary modules: 1) the clustering style encoder +employs a clustering vector quantization model to stably condense style +information into a compact latent space; 2) the Style and Duration Language +Model (S\&D-LM) concurrently predicts style information and phoneme duration, +which benefits both; 3) the style adaptive decoder uses a novel mel-style +adaptive normalization method to generate singing voices with enhanced details. +Experimental results show that TCSinger outperforms all baseline models in +synthesis quality, singer similarity, and style controllability across various +tasks, including zero-shot style transfer, multi-level style control, +cross-lingual style transfer, and speech-to-singing style transfer. Singing +voice samples can be accessed at https://tcsinger.github.io/. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Humans or LLMs as the Judge? A Study on Judgement Biases EMNLP2024 + + +
+ Adopting human and large language models (LLM) as judges (a.k.a human- and +LLM-as-a-judge) for evaluating the performance of LLMs has recently gained +attention. Nonetheless, this approach concurrently introduces potential biases +from human and LLMs, questioning the reliability of the evaluation results. In +this paper, we propose a novel framework that is free from referencing +groundtruth annotations for investigating Misinformation Oversight Bias, Gender +Bias, Authority Bias and Beauty Bias on LLM and human judges. We curate a +dataset referring to the revised Bloom's Taxonomy and conduct thousands of +evaluations. Results show that human and LLM judges are vulnerable to +perturbations to various degrees, and that even the cutting-edge judges possess +considerable biases. We further exploit these biases to conduct attacks on LLM +judges. We hope that our work can notify the community of the bias and +vulnerability of human- and LLM-as-a-judge, as well as the urgency of +developing robust evaluation systems. + +
+
+ comment: EMNLP2024 +
+
+
+
+
+ + ♻ ☆ Can AI writing be salvaged? Mitigating Idiosyncrasies and Improving + Human-AI Alignment in the Writing Process through Edits + + +
+ LLM-based applications are helping people write, and LLM-generated text is +making its way into social media, journalism, and our classrooms. However, the +differences between LLM-generated and human-written text remain unclear. To +explore this, we hired professional writers to edit paragraphs in several +creative domains. We first found these writers agree on undesirable +idiosyncrasies in LLM-generated text, formalizing it into a seven-category +taxonomy (e.g. cliches, unnecessary exposition). Second, we curated the LAMP +corpus: 1,057 LLM-generated paragraphs edited by professional writers according +to our taxonomy. Analysis of LAMP reveals that none of the LLMs used in our +study (GPT4o, Claude-3.5-Sonnet, Llama-3.1-70b) outperform each other in terms +of writing quality, revealing common limitations across model families. Third, +we explored automatic editing methods to improve LLM-generated text. A +large-scale preference annotation confirms that although experts largely prefer +text edited by other experts, automatic editing methods show promise in +improving alignment between LLM-generated and human-written text. + +
+
+ comment: NLP+HCI, Behavioral Science +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Emojis in Texts via Self-supervised Graph + Pre-Training EMNLP 2024 + + +
+ Emojis have gained immense popularity on social platforms, serving as a +common means to supplement or replace text. However, existing data mining +approaches generally either completely ignore or simply treat emojis as +ordinary Unicode characters, which may limit the model's ability to grasp the +rich semantic information in emojis and the interaction between emojis and +texts. Thus, it is necessary to release the emoji's power in social media data +mining. To this end, we first construct a heterogeneous graph consisting of +three types of nodes, i.e. post, word and emoji nodes to improve the +representation of different elements in posts. The edges are also well-defined +to model how these three elements interact with each other. To facilitate the +sharing of information among post, word and emoji nodes, we propose a graph +pre-train framework for text and emoji co-modeling, which contains two graph +pre-training tasks: node-level graph contrastive learning and edge-level link +reconstruction learning. Extensive experiments on the Xiaohongshu and Twitter +datasets with two types of downstream tasks demonstrate that our approach +proves significant improvement over previous strong baseline methods. + +
+
+ comment: Accepted by EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ LAViTeR: Learning Aligned Visual and Textual Representations Assisted by + Image and Caption Generation + + +
+ Pre-training visual and textual representations from large-scale image-text +pairs is becoming a standard approach for many downstream vision-language +tasks. The transformer-based models learn inter and intra-modal attention +through a list of self-supervised learning tasks. This paper proposes LAViTeR, +a novel architecture for visual and textual representation learning. The main +module, Visual Textual Alignment (VTA) will be assisted by two auxiliary tasks, +GAN-based image synthesis and Image Captioning. We also propose a new +evaluation metric measuring the similarity between the learnt visual and +textual embedding. The experimental results on two public datasets, CUB and +MS-COCO, demonstrate superior visual and textual representation alignment in +the joint feature embedding space + +
+
+ comment: 15 pages, 10 Figures, 5 Tables. Oral Presentation at Irish Machine + Vision and Image Processing Conference Proceedings, 2024 +
+
+
+
+
+ + ♻ ☆ View From Above: A Framework for Evaluating Distribution Shifts in Model + Behavior + + +
+ When large language models (LLMs) are asked to perform certain tasks, how can +we be sure that their learned representations align with reality? We propose a +domain-agnostic framework for systematically evaluating distribution shifts in +LLMs decision-making processes, where they are given control of mechanisms +governed by pre-defined rules. While individual LLM actions may appear +consistent with expected behavior, across a large number of trials, +statistically significant distribution shifts can emerge. To test this, we +construct a well-defined environment with known outcome logic: blackjack. In +more than 1,000 trials, we uncover statistically significant evidence +suggesting behavioral misalignment in the learned representations of LLM. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ FlowTurbo: Towards Real-time Flow-Based Image Generation with Velocity + Refiner NeurIPS 2024 + + +
+ Building on the success of diffusion models in visual generation, flow-based +models reemerge as another prominent family of generative models that have +achieved competitive or better performance in terms of both visual quality and +inference speed. By learning the velocity field through flow-matching, +flow-based models tend to produce a straighter sampling trajectory, which is +advantageous during the sampling process. However, unlike diffusion models for +which fast samplers are well-developed, efficient sampling of flow-based +generative models has been rarely explored. In this paper, we propose a +framework called FlowTurbo to accelerate the sampling of flow-based models +while still enhancing the sampling quality. Our primary observation is that the +velocity predictor's outputs in the flow-based models will become stable during +the sampling, enabling the estimation of velocity via a lightweight velocity +refiner. Additionally, we introduce several techniques including a pseudo +corrector and sample-aware compilation to further reduce inference time. Since +FlowTurbo does not change the multi-step sampling paradigm, it can be +effectively applied for various tasks such as image editing, inpainting, etc. +By integrating FlowTurbo into different flow-based models, we obtain an +acceleration ratio of 53.1%$\sim$58.3% on class-conditional generation and +29.8%$\sim$38.5% on text-to-image generation. Notably, FlowTurbo reaches an FID +of 2.12 on ImageNet with 100 (ms / img) and FID of 3.93 with 38 (ms / img), +achieving the real-time image generation and establishing the new +state-of-the-art. Code is available at https://github.com/shiml20/FlowTurbo. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ EgoLM: Multi-Modal Language Model of Egocentric Motions + + +
+ As the prevalence of wearable devices, learning egocentric motions becomes +essential to develop contextual AI. In this work, we present EgoLM, a versatile +framework that tracks and understands egocentric motions from multi-modal +inputs, e.g., egocentric videos and motion sensors. EgoLM exploits rich +contexts for the disambiguation of egomotion tracking and understanding, which +are ill-posed under single modality conditions. To facilitate the versatile and +multi-modal framework, our key insight is to model the joint distribution of +egocentric motions and natural languages using large language models (LLM). +Multi-modal sensor inputs are encoded and projected to the joint latent space +of language models, and used to prompt motion generation or text generation for +egomotion tracking or understanding, respectively. Extensive experiments on +large-scale multi-modal human motion dataset validate the effectiveness of +EgoLM as a generalist model for universal egocentric learning. + +
+
+ comment: Project Page: https://hongfz16.github.io/projects/EgoLM +
+
+
+
+
+ + ☆ LLaVA-3D: A Simple yet Effective Pathway to Empowering LMMs with + 3D-awareness + + +
+ Recent advancements in Large Multimodal Models (LMMs) have greatly enhanced +their proficiency in 2D visual understanding tasks, enabling them to +effectively process and understand images and videos. However, the development +of LMMs with 3D-awareness for 3D scene understanding has been hindered by the +lack of large-scale 3D vision-language datasets and powerful 3D encoders. In +this paper, we introduce a simple yet effective framework called LLaVA-3D. +Leveraging the strong 2D understanding priors from LLaVA, our LLaVA-3D +efficiently adapts LLaVA for 3D scene understanding without compromising 2D +understanding capabilities. To achieve this, we employ a simple yet effective +representation, 3D Patch, which connects 2D CLIP patch features with their +corresponding positions in 3D space. By integrating the 3D Patches into 2D LMMs +and employing joint 2D and 3D vision-language instruction tuning, we establish +a unified architecture for both 2D image understanding and 3D scene +understanding. Experimental results show that LLaVA-3D converges 3.5x faster +than existing 3D LMMs when trained on 3D vision-language datasets. Moreover, +LLaVA-3D not only achieves state-of-the-art performance across various 3D tasks +but also maintains comparable 2D image understanding and vision-language +conversation capabilities with LLaVA. + +
+
+ comment: Project page: https://zcmax.github.io/projects/LLaVA-3D/ +
+
+
+
+
+ + ☆ Lotus: Diffusion-based Visual Foundation Model for High-quality Dense + Prediction + + +
+ Leveraging the visual priors of pre-trained text-to-image diffusion models +offers a promising solution to enhance zero-shot generalization in dense +prediction tasks. However, existing methods often uncritically use the original +diffusion formulation, which may not be optimal due to the fundamental +differences between dense prediction and image generation. In this paper, we +provide a systemic analysis of the diffusion formulation for the dense +prediction, focusing on both quality and efficiency. And we find that the +original parameterization type for image generation, which learns to predict +noise, is harmful for dense prediction; the multi-step noising/denoising +diffusion process is also unnecessary and challenging to optimize. Based on +these insights, we introduce Lotus, a diffusion-based visual foundation model +with a simple yet effective adaptation protocol for dense prediction. +Specifically, Lotus is trained to directly predict annotations instead of +noise, thereby avoiding harmful variance. We also reformulate the diffusion +process into a single-step procedure, simplifying optimization and +significantly boosting inference speed. Additionally, we introduce a novel +tuning strategy called detail preserver, which achieves more accurate and +fine-grained predictions. Without scaling up the training data or model +capacity, Lotus achieves SoTA performance in zero-shot depth and normal +estimation across various datasets. It also significantly enhances efficiency, +being hundreds of times faster than most existing diffusion-based methods. + +
+
+ comment: Project page: https://lotus3d.github.io/ +
+
+
+
+
+ + ☆ Robot See Robot Do: Imitating Articulated Object Manipulation with + Monocular 4D Reconstruction + + +
+ Humans can learn to manipulate new objects by simply watching others; +providing robots with the ability to learn from such demonstrations would +enable a natural interface specifying new behaviors. This work develops Robot +See Robot Do (RSRD), a method for imitating articulated object manipulation +from a single monocular RGB human demonstration given a single static +multi-view object scan. We first propose 4D Differentiable Part Models +(4D-DPM), a method for recovering 3D part motion from a monocular video with +differentiable rendering. This analysis-by-synthesis approach uses part-centric +feature fields in an iterative optimization which enables the use of geometric +regularizers to recover 3D motions from only a single video. Given this 4D +reconstruction, the robot replicates object trajectories by planning bimanual +arm motions that induce the demonstrated object part motion. By representing +demonstrations as part-centric trajectories, RSRD focuses on replicating the +demonstration's intended behavior while considering the robot's own +morphological limits, rather than attempting to reproduce the hand's motion. We +evaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part +trajectories and RSRD's physical execution performance on 9 objects across 10 +trials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of +87% success rate, for a total end-to-end success rate of 60% across 90 trials. +Notably, this is accomplished using only feature fields distilled from large +pretrained vision models -- without any task-specific training, fine-tuning, +dataset collection, or annotation. Project page: +https://robot-see-robot-do.github.io + +
+
+ comment: CoRL 2024, Project page: https://robot-see-robot-do.github.io +
+
+
+
+
+ + ☆ EvMAPPER: High Altitude Orthomapping with Event Cameras + + +
+ Traditionally, unmanned aerial vehicles (UAVs) rely on CMOS-based cameras to +collect images about the world below. One of the most successful applications +of UAVs is to generate orthomosaics or orthomaps, in which a series of images +are integrated together to develop a larger map. However, the use of CMOS-based +cameras with global or rolling shutters mean that orthomaps are vulnerable to +challenging light conditions, motion blur, and high-speed motion of +independently moving objects under the camera. Event cameras are less sensitive +to these issues, as their pixels are able to trigger asynchronously on +brightness changes. This work introduces the first orthomosaic approach using +event cameras. In contrast to existing methods relying only on CMOS cameras, +our approach enables map generation even in challenging light conditions, +including direct sunlight and after sunset. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Multi-View and Multi-Scale Alignment for Contrastive Language-Image + Pre-training in Mammography MICCAI 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) shows promise in medical image +analysis but requires substantial data and computational resources. Due to +these restrictions, existing CLIP applications in medical imaging focus mainly +on modalities like chest X-rays that have abundant image-report data available, +leaving many other important modalities under-explored. Here, we propose the +first adaptation of the full CLIP model to mammography, which presents +significant challenges due to labeled data scarcity, high-resolution images +with small regions of interest, and data imbalance. We first develop a +specialized supervision framework for mammography that leverages its multi-view +nature. Furthermore, we design a symmetric local alignment module to better +focus on detailed features in high-resolution images. Lastly, we incorporate a +parameter-efficient fine-tuning approach for large language models pre-trained +with medical knowledge to address data limitations. Our multi-view and +multi-scale alignment (MaMA) method outperforms state-of-the-art baselines for +three different tasks on two large real-world mammography datasets, EMBED and +RSNA-Mammo, with only 52% model size compared with the largest baseline. + +
+
+ comment: This work is also the basis of the overall best solution for the + MICCAI 2024 CXR-LT Challenge +
+
+
+
+
+ + ☆ EdgeRunner: Auto-regressive Auto-encoder for Artistic Mesh Generation + + +
+ Current auto-regressive mesh generation methods suffer from issues such as +incompleteness, insufficient detail, and poor generalization. In this paper, we +propose an Auto-regressive Auto-encoder (ArAE) model capable of generating +high-quality 3D meshes with up to 4,000 faces at a spatial resolution of +$512^3$. We introduce a novel mesh tokenization algorithm that efficiently +compresses triangular meshes into 1D token sequences, significantly enhancing +training efficiency. Furthermore, our model compresses variable-length +triangular meshes into a fixed-length latent space, enabling training latent +diffusion models for better generalization. Extensive experiments demonstrate +the superior quality, diversity, and generalization capabilities of our model +in both point cloud and image-conditioned mesh generation tasks. + +
+
+ comment: Project Page: https://research.nvidia.com/labs/dir/edgerunner/ +
+
+
+
+
+ + ☆ E.T. Bench: Towards Open-Ended Event-Level Video-Language Understanding NeurIPS 2024 + + +
+ Recent advances in Video Large Language Models (Video-LLMs) have demonstrated +their great potential in general-purpose video understanding. To verify the +significance of these models, a number of benchmarks have been proposed to +diagnose their capabilities in different scenarios. However, existing +benchmarks merely evaluate models through video-level question-answering, +lacking fine-grained event-level assessment and task diversity. To fill this +gap, we introduce E.T. Bench (Event-Level & Time-Sensitive Video Understanding +Benchmark), a large-scale and high-quality benchmark for open-ended event-level +video understanding. Categorized within a 3-level task taxonomy, E.T. Bench +encompasses 7.3K samples under 12 tasks with 7K videos (251.4h total length) +under 8 domains, providing comprehensive evaluations. We extensively evaluated +8 Image-LLMs and 12 Video-LLMs on our benchmark, and the results reveal that +state-of-the-art models for coarse-level (video-level) understanding struggle +to solve our fine-grained tasks, e.g., grounding event-of-interests within +videos, largely due to the short video context length, improper time +representations, and lack of multi-event training data. Focusing on these +issues, we further propose a strong baseline model, E.T. Chat, together with an +instruction-tuning dataset E.T. Instruct 164K tailored for fine-grained +event-level understanding. Our simple but effective solution demonstrates +superior performance in multiple scenarios. + +
+
+ comment: Accepted to NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Find Rhinos without Finding Rhinos: Active Learning with Multimodal + Imagery of South African Rhino Habitats IJCAI 2023 + + +
+ Much of Earth's charismatic megafauna is endangered by human activities, +particularly the rhino, which is at risk of extinction due to the poaching +crisis in Africa. Monitoring rhinos' movement is crucial to their protection +but has unfortunately proven difficult because rhinos are elusive. Therefore, +instead of tracking rhinos, we propose the novel approach of mapping communal +defecation sites, called middens, which give information about rhinos' spatial +behavior valuable to anti-poaching, management, and reintroduction efforts. +This paper provides the first-ever mapping of rhino midden locations by +building classifiers to detect them using remotely sensed thermal, RGB, and +LiDAR imagery in passive and active learning settings. As existing active +learning methods perform poorly due to the extreme class imbalance in our +dataset, we design MultimodAL, an active learning system employing a ranking +technique and multimodality to achieve competitive performance with passive +learning models with 94% fewer labels. Our methods could therefore save over 76 +hours in labeling time when used on a similarly-sized dataset. Unexpectedly, +our midden map reveals that rhino middens are not randomly distributed +throughout the landscape; rather, they are clustered. Consequently, rangers +should be targeted at areas with high midden densities to strengthen +anti-poaching efforts, in line with UN Target 15.7. + +
+
+ comment: 9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good +
+
+
+
+
+ + ☆ MALPOLON: A Framework for Deep Species Distribution Modeling + + +
+ This paper describes a deep-SDM framework, MALPOLON. Written in Python and +built upon the PyTorch library, this framework aims to facilitate training and +inferences of deep species distribution models (deep-SDM) and sharing for users +with only general Python language skills (e.g., modeling ecologists) who are +interested in testing deep learning approaches to build new SDMs. More advanced +users can also benefit from the framework's modularity to run more specific +experiments by overriding existing classes while taking advantage of +press-button examples to train neural networks on multiple classification tasks +using custom or provided raw and pre-processed datasets. The framework is +open-sourced on GitHub and PyPi along with extensive documentation and examples +of use in various scenarios. MALPOLON offers straightforward installation, +YAML-based configuration, parallel computing, multi-GPU utilization, baseline +and foundational models for benchmarking, and extensive +tutorials/documentation, aiming to enhance accessibility and performance +scalability for ecologists and researchers. + +
+
+
+
+
+ + ☆ AI-Powered Augmented Reality for Satellite Assembly, Integration and + Test + + +
+ The integration of Artificial Intelligence (AI) and Augmented Reality (AR) is +set to transform satellite Assembly, Integration, and Testing (AIT) processes +by enhancing precision, minimizing human error, and improving operational +efficiency in cleanroom environments. This paper presents a technical +description of the European Space Agency's (ESA) project "AI for AR in +Satellite AIT," which combines real-time computer vision and AR systems to +assist technicians during satellite assembly. Leveraging Microsoft HoloLens 2 +as the AR interface, the system delivers context-aware instructions and +real-time feedback, tackling the complexities of object recognition and 6D pose +estimation in AIT workflows. All AI models demonstrated over 70% accuracy, with +the detection model exceeding 95% accuracy, indicating a high level of +performance and reliability. A key contribution of this work lies in the +effective use of synthetic data for training AI models in AR applications, +addressing the significant challenges of obtaining real-world datasets in +highly dynamic satellite environments, as well as the creation of the Segmented +Anything Model for Automatic Labelling (SAMAL), which facilitates the automatic +annotation of real data, achieving speeds up to 20 times faster than manual +human annotation. The findings demonstrate the efficacy of AI-driven AR systems +in automating critical satellite assembly tasks, setting a foundation for +future innovations in the space industry. + +
+
+
+
+
+ + ☆ Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine + Segmentation MICCAI 2024 + + +
+ Self-supervised pretraining (SSP) has shown promising results in learning +from large unlabeled datasets and, thus, could be useful for automated +cardiovascular magnetic resonance (CMR) short-axis cine segmentation. However, +inconsistent reports of the benefits of SSP for segmentation have made it +difficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP +methods for CMR cine segmentation. + To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were +used for unlabeled pretraining with four SSP methods; SimCLR, positional +contrastive learning, DINO, and masked image modeling (MIM). Subsets of varying +numbers of subjects were used for supervised fine-tuning of 2D models for each +SSP method, as well as to train a 2D baseline model from scratch. The +fine-tuned models were compared to the baseline using the 3D Dice similarity +coefficient (DSC) in a test dataset of 140 subjects. + The SSP methods showed no performance gains with the largest supervised +fine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects +(231 2D slices) are available for supervised training, SSP using MIM (DSC = +0.86) improves over training from scratch (DSC = 0.82). + This study found that SSP is valuable for CMR cine segmentation when labeled +training data is scarce, but does not aid state-of-the-art deep learning +methods when ample labeled data is available. Moreover, the choice of SSP +method is important. The code is publicly available at: +https://github.com/q-cardIA/ssp-cmr-cine-segmentation + +
+
+ comment: Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at + MICCAI 2024 +
+
+
+
+
+ + ☆ EfficientCrackNet: A Lightweight Model for Crack Segmentation + + +
+ Crack detection, particularly from pavement images, presents a formidable +challenge in the domain of computer vision due to several inherent complexities +such as intensity inhomogeneity, intricate topologies, low contrast, and noisy +backgrounds. Automated crack detection is crucial for maintaining the +structural integrity of essential infrastructures, including buildings, +pavements, and bridges. Existing lightweight methods often face challenges +including computational inefficiency, complex crack patterns, and difficult +backgrounds, leading to inaccurate detection and impracticality for real-world +applications. To address these limitations, we propose EfficientCrackNet, a +lightweight hybrid model combining Convolutional Neural Networks (CNNs) and +transformers for precise crack segmentation. EfficientCrackNet integrates +depthwise separable convolutions (DSC) layers and MobileViT block to capture +both global and local features. The model employs an Edge Extraction Method +(EEM) and for efficient crack edge detection without pretraining, and +Ultra-Lightweight Subspace Attention Module (ULSAM) to enhance feature +extraction. Extensive experiments on three benchmark datasets Crack500, +DeepCrack, and GAPs384 demonstrate that EfficientCrackNet achieves superior +performance compared to existing lightweight models, while requiring only 0.26M +parameters, and 0.483 FLOPs (G). The proposed model offers an optimal balance +between accuracy and computational efficiency, outperforming state-of-the-art +lightweight models, and providing a robust and adaptable solution for +real-world crack segmentation. + +
+
+
+
+
+ + ☆ DiffSSC: Semantic LiDAR Scan Completion using Denoising Diffusion + Probabilistic Models + + +
+ Perception systems play a crucial role in autonomous driving, incorporating +multiple sensors and corresponding computer vision algorithms. 3D LiDAR sensors +are widely used to capture sparse point clouds of the vehicle's surroundings. +However, such systems struggle to perceive occluded areas and gaps in the scene +due to the sparsity of these point clouds and their lack of semantics. To +address these challenges, Semantic Scene Completion (SSC) jointly predicts +unobserved geometry and semantics in the scene given raw LiDAR measurements, +aiming for a more complete scene representation. Building on promising results +of diffusion models in image generation and super-resolution tasks, we propose +their extension to SSC by implementing the noising and denoising diffusion +processes in the point and semantic spaces individually. To control the +generation, we employ semantic LiDAR point clouds as conditional input and +design local and global regularization losses to stabilize the denoising +process. We evaluate our approach on autonomous driving datasets and our +approach outperforms the state-of-the-art for SSC. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Stable Video Portraits ECCV 2024 + + +
+ Rapid advances in the field of generative AI and text-to-image methods in +particular have transformed the way we interact with and perceive +computer-generated imagery today. In parallel, much progress has been made in +3D face reconstruction, using 3D Morphable Models (3DMM). In this paper, we +present SVP, a novel hybrid 2D/3D generation method that outputs photorealistic +videos of talking faces leveraging a large pre-trained text-to-image prior +(2D), controlled via a 3DMM (3D). Specifically, we introduce a person-specific +fine-tuning of a general 2D stable diffusion model which we lift to a video +model by providing temporal 3DMM sequences as conditioning and by introducing a +temporal denoising procedure. As an output, this model generates temporally +smooth imagery of a person with 3DMM-based controls, i.e., a person-specific +avatar. The facial appearance of this person-specific avatar can be edited and +morphed to text-defined celebrities, without any fine-tuning at test time. The +method is analyzed quantitatively and qualitatively, and we show that our +method outperforms state-of-the-art monocular head avatar methods. + +
+
+ comment: Accepted at ECCV 2024, Project: https://svp.is.tue.mpg.de +
+
+
+
+
+ + ☆ SKT: Integrating State-Aware Keypoint Trajectories with Vision-Language + Models for Robotic Garment Manipulation + + +
+ Automating garment manipulation poses a significant challenge for assistive +robotics due to the diverse and deformable nature of garments. Traditional +approaches typically require separate models for each garment type, which +limits scalability and adaptability. In contrast, this paper presents a unified +approach using vision-language models (VLMs) to improve keypoint prediction +across various garment categories. By interpreting both visual and semantic +information, our model enables robots to manage different garment states with a +single model. We created a large-scale synthetic dataset using advanced +simulation techniques, allowing scalable training without extensive real-world +data. Experimental results indicate that the VLM-based method significantly +enhances keypoint detection accuracy and task success rates, providing a more +flexible and general solution for robotic garment manipulation. In addition, +this research also underscores the potential of VLMs to unify various garment +manipulation tasks within a single framework, paving the way for broader +applications in home automation and assistive robotics for future. + +
+
+
+
+
+ + ☆ FreeEdit: Mask-free Reference-based Image Editing with Multi-modal + Instruction + + +
+ Introducing user-specified visual concepts in image editing is highly +practical as these concepts convey the user's intent more precisely than +text-based descriptions. We propose FreeEdit, a novel approach for achieving +such reference-based image editing, which can accurately reproduce the visual +concept from the reference image based on user-friendly language instructions. +Our approach leverages the multi-modal instruction encoder to encode language +instructions to guide the editing process. This implicit way of locating the +editing area eliminates the need for manual editing masks. To enhance the +reconstruction of reference details, we introduce the Decoupled Residual +ReferAttention (DRRA) module. This module is designed to integrate fine-grained +reference features extracted by a detail extractor into the image editing +process in a residual way without interfering with the original self-attention. +Given that existing datasets are unsuitable for reference-based image editing +tasks, particularly due to the difficulty in constructing image triplets that +include a reference image, we curate a high-quality dataset, FreeBench, using a +newly developed twice-repainting scheme. FreeBench comprises the images before +and after editing, detailed editing instructions, as well as a reference image +that maintains the identity of the edited object, encompassing tasks such as +object addition, replacement, and deletion. By conducting phased training on +FreeBench followed by quality tuning, FreeEdit achieves high-quality zero-shot +editing through convenient language instructions. We conduct extensive +experiments to evaluate the effectiveness of FreeEdit across multiple task +types, demonstrating its superiority over existing methods. The code will be +available at: https://freeedit.github.io/. + +
+
+ comment: 14 pages, 14 figures, project website: https://freeedit.github.io/ +
+
+
+
+
+ + ☆ LightAvatar: Efficient Head Avatar as Dynamic Neural Light Field ECCV'24 + + +
+ Recent works have shown that neural radiance fields (NeRFs) on top of +parametric models have reached SOTA quality to build photorealistic head +avatars from a monocular video. However, one major limitation of the NeRF-based +avatars is the slow rendering speed due to the dense point sampling of NeRF, +preventing them from broader utility on resource-constrained devices. We +introduce LightAvatar, the first head avatar model based on neural light fields +(NeLFs). LightAvatar renders an image from 3DMM parameters and a camera pose +via a single network forward pass, without using mesh or volume rendering. The +proposed approach, while being conceptually appealing, poses a significant +challenge towards real-time efficiency and training stability. To resolve them, +we introduce dedicated network designs to obtain proper representations for the +NeLF model and maintain a low FLOPs budget. Meanwhile, we tap into a +distillation-based training strategy that uses a pretrained avatar model as +teacher to synthesize abundant pseudo data for training. A warping field +network is introduced to correct the fitting error in the real data so that the +model can learn better. Extensive experiments suggest that our method can +achieve new SOTA image quality quantitatively or qualitatively, while being +significantly faster than the counterparts, reporting 174.1 FPS (512x512 +resolution) on a consumer-grade GPU (RTX3090) with no customized optimization. + +
+
+ comment: Appear in ECCV'24 CADL Workshop. Code: + https://github.com/MingSun-Tse/LightAvatar-TensorFlow +
+
+
+
+
+ + ☆ Visual Data Diagnosis and Debiasing with Concept Graphs + + +
+ The widespread success of deep learning models today is owed to the curation +of extensive datasets significant in size and complexity. However, such models +frequently pick up inherent biases in the data during the training process, +leading to unreliable predictions. Diagnosing and debiasing datasets is thus a +necessity to ensure reliable model performance. In this paper, we present +CONBIAS, a novel framework for diagnosing and mitigating Concept co-occurrence +Biases in visual datasets. CONBIAS represents visual datasets as knowledge +graphs of concepts, enabling meticulous analysis of spurious concept +co-occurrences to uncover concept imbalances across the whole dataset. +Moreover, we show that by employing a novel clique-based concept balancing +strategy, we can mitigate these imbalances, leading to enhanced performance on +downstream tasks. Extensive experiments show that data augmentation based on a +balanced concept distribution augmented by CONBIAS improves generalization +performance across multiple datasets compared to state-of-the-art methods. We +will make our code and data publicly available. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ EMOVA: Empowering Language Models to See, Hear and Speak with Vivid + Emotions + + +
+ GPT-4o, an omni-modal model that enables vocal conversations with diverse +emotions and tones, marks a milestone for omni-modal foundation models. +However, empowering Large Language Models to perceive and generate images, +texts, and speeches end-to-end with publicly available data remains challenging +in the open-source community. Existing vision-language models rely on external +tools for the speech processing, while speech-language models still suffer from +limited or even without vision-understanding abilities. To address this gap, we +propose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large +Language Models with end-to-end speech capabilities while maintaining the +leading vision-language performance. With a semantic-acoustic disentangled +speech tokenizer, we notice surprisingly that omni-modal alignment can further +enhance vision-language and speech abilities compared with the corresponding +bi-modal aligned counterparts. Moreover, a lightweight style module is proposed +for flexible speech style controls (e.g., emotions and pitches). For the first +time, EMOVA achieves state-of-the-art performance on both the vision-language +and speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue +with vivid emotions. + +
+
+ comment: Project Page: https://emova-ollm.github.io/ +
+
+
+
+
+ + ☆ ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty + Learning + + +
+ Vision-centric semantic occupancy prediction plays a crucial role in +autonomous driving, which requires accurate and reliable predictions from +low-cost sensors. Although having notably narrowed the accuracy gap with LiDAR, +there is still few research effort to explore the reliability in predicting +semantic occupancy from camera. In this paper, we conduct a comprehensive +evaluation of existing semantic occupancy prediction models from a reliability +perspective for the first time. Despite the gradual alignment of camera-based +models with LiDAR in term of accuracy, a significant reliability gap persists. +To addresses this concern, we propose ReliOcc, a method designed to enhance the +reliability of camera-based occupancy networks. ReliOcc provides a +plug-and-play scheme for existing models, which integrates hybrid uncertainty +from individual voxels with sampling-based noise and relative voxels through +mix-up learning. Besides, an uncertainty-aware calibration strategy is devised +to further enhance model reliability in offline mode. Extensive experiments +under various settings demonstrate that ReliOcc significantly enhances model +reliability while maintaining the accuracy of both geometric and semantic +predictions. Importantly, our proposed approach exhibits robustness to sensor +failures and out of domain noises during inference. + +
+
+ comment: Technical report. Work in progress +
+
+
+
+
+ + ☆ Transferring disentangled representations: bridging the gap between + synthetic and real images + + +
+ Developing meaningful and efficient representations that separate the +fundamental structure of the data generation mechanism is crucial in +representation learning. However, Disentangled Representation Learning has not +fully shown its potential on real images, because of correlated generative +factors, their resolution and limited access to ground truth labels. +Specifically on the latter, we investigate the possibility of leveraging +synthetic data to learn general-purpose disentangled representations applicable +to real data, discussing the effect of fine-tuning and what properties of +disentanglement are preserved after the transfer. We provide an extensive +empirical study to address these issues. In addition, we propose a new +interpretable intervention-based metric, to measure the quality of factors +encoding in the representation. Our results indicate that some level of +disentanglement, transferring a representation from synthetic to real data, is +possible and effective. + +
+
+
+
+
+ + ☆ PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless + Imaging NeurIPS 2024 + + +
+ Lensless cameras offer significant advantages in size, weight, and cost +compared to traditional lens-based systems. Without a focusing lens, lensless +cameras rely on computational algorithms to recover the scenes from multiplexed +measurements. However, current algorithms struggle with inaccurate forward +imaging models and insufficient priors to reconstruct high-quality images. To +overcome these limitations, we introduce a novel two-stage approach for +consistent and photorealistic lensless image reconstruction. The first stage of +our approach ensures data consistency by focusing on accurately reconstructing +the low-frequency content with a spatially varying deconvolution method that +adjusts to changes in the Point Spread Function (PSF) across the camera's field +of view. The second stage enhances photorealism by incorporating a generative +prior from pre-trained diffusion models. By conditioning on the low-frequency +content retrieved in the first stage, the diffusion model effectively +reconstructs the high-frequency details that are typically lost in the lensless +imaging process, while also maintaining image fidelity. Our method achieves a +superior balance between data fidelity and visual quality compared to existing +methods, as demonstrated with two popular lensless systems, PhlatCam and +DiffuserCam. Project website: https://phocolens.github.io/. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ InterNet: Unsupervised Cross-modal Homography Estimation Based on + Interleaved Modality Transfer and Self-supervised Homography Prediction + + +
+ We propose a novel unsupervised cross-modal homography estimation framework, +based on interleaved modality transfer and self-supervised homography +prediction, named InterNet. InterNet integrates modality transfer and +self-supervised homography estimation, introducing an innovative interleaved +optimization framework to alternately promote both components. The modality +transfer gradually narrows the modality gaps, facilitating the self-supervised +homography estimation to fully leverage the synthetic intra-modal data. The +self-supervised homography estimation progressively achieves reliable +predictions, thereby providing robust cross-modal supervision for the modality +transfer. To further boost the estimation accuracy, we also formulate a +fine-grained homography feature loss to improve the connection between two +components. Furthermore, we employ a simple yet effective distillation training +technique to reduce model parameters and improve cross-domain generalization +ability while maintaining comparable performance. Experiments reveal that +InterNet achieves the state-of-the-art (SOTA) performance among unsupervised +methods, and even outperforms many supervised methods such as MHN and +LocalTrans. + +
+
+
+
+
+ + ☆ Deblur e-NeRF: NeRF from Motion-Blurred Events under High-speed or + Low-light Conditions ECCV 2024 + + +
+ The stark contrast in the design philosophy of an event camera makes it +particularly ideal for operating under high-speed, high dynamic range and +low-light conditions, where standard cameras underperform. Nonetheless, event +cameras still suffer from some amount of motion blur, especially under these +challenging conditions, in contrary to what most think. This is attributed to +the limited bandwidth of the event sensor pixel, which is mostly proportional +to the light intensity. Thus, to ensure that event cameras can truly excel in +such conditions where it has an edge over standard cameras, it is crucial to +account for event motion blur in downstream applications, especially +reconstruction. However, none of the recent works on reconstructing Neural +Radiance Fields (NeRFs) from events, nor event simulators, have considered the +full effects of event motion blur. To this end, we propose, Deblur e-NeRF, a +novel method to directly and effectively reconstruct blur-minimal NeRFs from +motion-blurred events generated under high-speed motion or low-light +conditions. The core component of this work is a physically-accurate pixel +bandwidth model proposed to account for event motion blur under arbitrary speed +and lighting conditions. We also introduce a novel threshold-normalized total +variation loss to improve the regularization of large textureless patches. +Experiments on real and novel realistically simulated sequences verify our +effectiveness. Our code, event simulator and synthetic event dataset will be +open-sourced. + +
+
+ comment: Accepted to ECCV 2024. Project website is accessible at + https://wengflow.github.io/deblur-e-nerf. arXiv admin note: text overlap with + arXiv:2006.07722 by other authors +
+
+
+
+
+ + ☆ LLM4Brain: Training a Large Language Model for Brain Video Understanding ECCV2024 + + +
+ Decoding visual-semantic information from brain signals, such as functional +MRI (fMRI), across different subjects poses significant challenges, including +low signal-to-noise ratio, limited data availability, and cross-subject +variability. Recent advancements in large language models (LLMs) show +remarkable effectiveness in processing multimodal information. In this study, +we introduce an LLM-based approach for reconstructing visual-semantic +information from fMRI signals elicited by video stimuli. Specifically, we +employ fine-tuning techniques on an fMRI encoder equipped with adaptors to +transform brain responses into latent representations aligned with the video +stimuli. Subsequently, these representations are mapped to textual modality by +LLM. In particular, we integrate self-supervised domain adaptation methods to +enhance the alignment between visual-semantic information and brain responses. +Our proposed method achieves good results using various quantitative semantic +metrics, while yielding similarity with ground-truth information. + +
+
+ comment: ECCV2024 Workshop +
+
+
+
+
+ + ☆ BlinkTrack: Feature Tracking over 100 FPS via Events and Images + + +
+ Feature tracking is crucial for, structure from motion (SFM), simultaneous +localization and mapping (SLAM), object tracking and various computer vision +tasks. Event cameras, known for their high temporal resolution and ability to +capture asynchronous changes, have gained significant attention for their +potential in feature tracking, especially in challenging conditions. However, +event cameras lack the fine-grained texture information that conventional +cameras provide, leading to error accumulation in tracking. To address this, we +propose a novel framework, BlinkTrack, which integrates event data with RGB +images for high-frequency feature tracking. Our method extends the traditional +Kalman filter into a learning-based framework, utilizing differentiable Kalman +filters in both event and image branches. This approach improves +single-modality tracking, resolves ambiguities, and supports asynchronous data +fusion. We also introduce new synthetic and augmented datasets to better +evaluate our model. Experimental results indicate that BlinkTrack significantly +outperforms existing event-based methods, exceeding 100 FPS with preprocessed +event data and 80 FPS with multi-modality data. + +
+
+
+
+
+ + ☆ HydraViT: Stacking Heads for a Scalable ViT + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+
+
+
+ + ☆ Cross-Modality Attack Boosted by Gradient-Evolutionary Multiform + Optimization + + +
+ In recent years, despite significant advancements in adversarial attack +research, the security challenges in cross-modal scenarios, such as the +transferability of adversarial attacks between infrared, thermal, and RGB +images, have been overlooked. These heterogeneous image modalities collected by +different hardware devices are widely prevalent in practical applications, and +the substantial differences between modalities pose significant challenges to +attack transferability. In this work, we explore a novel cross-modal +adversarial attack strategy, termed multiform attack. We propose a dual-layer +optimization framework based on gradient-evolution, facilitating efficient +perturbation transfer between modalities. In the first layer of optimization, +the framework utilizes image gradients to learn universal perturbations within +each modality and employs evolutionary algorithms to search for shared +perturbations with transferability across different modalities through +secondary optimization. Through extensive testing on multiple heterogeneous +datasets, we demonstrate the superiority and robustness of Multiform Attack +compared to existing techniques. This work not only enhances the +transferability of cross-modal adversarial attacks but also provides a new +perspective for understanding security vulnerabilities in cross-modal systems. + +
+
+
+
+
+ + ☆ CNCA: Toward Customizable and Natural Generation of Adversarial + Camouflage for Vehicle Detectors + + +
+ Prior works on physical adversarial camouflage against vehicle detectors +mainly focus on the effectiveness and robustness of the attack. The current +most successful methods optimize 3D vehicle texture at a pixel level. However, +this results in conspicuous and attention-grabbing patterns in the generated +camouflage, which humans can easily identify. To address this issue, we propose +a Customizable and Natural Camouflage Attack (CNCA) method by leveraging an +off-the-shelf pre-trained diffusion model. By sampling the optimal texture +image from the diffusion model with a user-specific text prompt, our method can +generate natural and customizable adversarial camouflage while maintaining high +attack performance. With extensive experiments on the digital and physical +worlds and user studies, the results demonstrate that our proposed method can +generate significantly more natural-looking camouflage than the +state-of-the-art baselines while achieving competitive attack performance. Our +code is available at +\href{https://anonymous.4open.science/r/CNCA-1D54}{https://anonymous.4open.science/r/CNCA-1D54} + +
+
+
+
+
+ + ☆ The Hard Positive Truth about Vision-Language Compositionality ECCV 2024 + + +
+ Several benchmarks have concluded that our best vision-language models (e.g., +CLIP) are lacking in compositionality. Given an image, these benchmarks probe a +model's ability to identify its associated caption amongst a set of +compositional distractors. In response, a surge of recent proposals show +improvements by finetuning CLIP with distractors as hard negatives. Our +investigations reveal that these improvements have, in fact, been significantly +overstated -- because existing benchmarks do not probe whether finetuned +vision-language models remain invariant to hard positives. By curating an +evaluation dataset with 112,382 hard negatives and hard positives, we uncover +that including hard positives decreases CLIP's performance by 12.9%, while +humans perform effortlessly at 99%. CLIP finetuned with hard negatives results +in an even larger decrease, up to 38.7%. With this finding, we then produce a +1,775,259 image-text training set with both hard negative and hard positive +captions. By training with both, we see improvements on existing benchmarks +while simultaneously improving performance on hard positives, indicating a more +robust improvement in compositionality. Our work suggests the need for future +research to rigorously test and improve CLIP's understanding of semantic +relationships between related "positive" concepts. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Spatial Hierarchy and Temporal Attention Guided Cross Masking for + Self-supervised Skeleton-based Action Recognition + + +
+ In self-supervised skeleton-based action recognition, the mask reconstruction +paradigm is gaining interest in enhancing model refinement and robustness +through effective masking. However, previous works primarily relied on a single +masking criterion, resulting in the model overfitting specific features and +overlooking other effective information. In this paper, we introduce a +hierarchy and attention guided cross-masking framework (HA-CM) that applies +masking to skeleton sequences from both spatial and temporal perspectives. +Specifically, in spatial graphs, we utilize hyperbolic space to maintain joint +distinctions and effectively preserve the hierarchical structure of +high-dimensional skeletons, employing joint hierarchy as the masking criterion. +In temporal flows, we substitute traditional distance metrics with the global +attention of joints for masking, addressing the convergence of distances in +high-dimensional space and the lack of a global perspective. Additionally, we +incorporate cross-contrast loss based on the cross-masking framework into the +loss function to enhance the model's learning of instance-level features. HA-CM +shows efficiency and universality on three public large-scale datasets, NTU-60, +NTU-120, and PKU-MMD. The source code of our HA-CM is available at +https://github.com/YinxPeng/HA-CM-main. + +
+
+ comment: 12 pages,6 figures,IEEE Trans +
+
+
+
+
+ + ☆ Perturb, Attend, Detect and Localize (PADL): Robust Proactive Image + Defense + + +
+ Image manipulation detection and localization have received considerable +attention from the research community given the blooming of Generative Models +(GMs). Detection methods that follow a passive approach may overfit to specific +GMs, limiting their application in real-world scenarios, due to the growing +diversity of generative models. Recently, approaches based on a proactive +framework have shown the possibility of dealing with this limitation. However, +these methods suffer from two main limitations, which raises concerns about +potential vulnerabilities: i) the manipulation detector is not robust to noise +and hence can be easily fooled; ii) the fact that they rely on fixed +perturbations for image protection offers a predictable exploit for malicious +attackers, enabling them to reverse-engineer and evade detection. To overcome +this issue we propose PADL, a new solution able to generate image-specific +perturbations using a symmetric scheme of encoding and decoding based on +cross-attention, which drastically reduces the possibility of reverse +engineering, even when evaluated with adaptive attack [31]. Additionally, PADL +is able to pinpoint manipulated areas, facilitating the identification of +specific regions that have undergone alterations, and has more generalization +power than prior art on held-out generative models. Indeed, although being +trained only on an attribute manipulation GAN model [15], our method +generalizes to a range of unseen models with diverse architectural designs, +such as StarGANv2, BlendGAN, DiffAE, StableDiffusion and StableDiffusionXL. +Additionally, we introduce a novel evaluation protocol, which offers a fair +evaluation of localisation performance in function of detection accuracy and +better captures real-world scenarios. + +
+
+
+
+
+ + ☆ Neural Light Spheres for Implicit Image Stitching and View Synthesis + + +
+ Challenging to capture, and challenging to display on a cellphone screen, the +panorama paradoxically remains both a staple and underused feature of modern +mobile camera applications. In this work we address both of these challenges +with a spherical neural light field model for implicit panoramic image +stitching and re-rendering; able to accommodate for depth parallax, +view-dependent lighting, and local scene motion and color changes during +capture. Fit during test-time to an arbitrary path panoramic video capture -- +vertical, horizontal, random-walk -- these neural light spheres jointly +estimate the camera path and a high-resolution scene reconstruction to produce +novel wide field-of-view projections of the environment. Our single-layer model +avoids expensive volumetric sampling, and decomposes the scene into compact +view-dependent ray offset and color components, with a total model size of 80 +MB per scene, and real-time (50 FPS) rendering at 1080p resolution. We +demonstrate improved reconstruction quality over traditional image stitching +and radiance field methods, with significantly higher tolerance to scene motion +and non-ideal capture settings. + +
+
+ comment: Project site: https://light.princeton.edu/publication/neuls/ +
+
+
+
+
+ + ☆ Resolving Multi-Condition Confusion for Finetuning-Free Personalized + Image Generation + + +
+ Personalized text-to-image generation methods can generate customized images +based on the reference images, which have garnered wide research interest. +Recent methods propose a finetuning-free approach with a decoupled +cross-attention mechanism to generate personalized images requiring no +test-time finetuning. However, when multiple reference images are provided, the +current decoupled cross-attention mechanism encounters the object confusion +problem and fails to map each reference image to its corresponding object, +thereby seriously limiting its scope of application. To address the object +confusion problem, in this work we investigate the relevance of different +positions of the latent image features to the target object in diffusion model, +and accordingly propose a weighted-merge method to merge multiple reference +image features into the corresponding objects. Next, we integrate this +weighted-merge method into existing pre-trained models and continue to train +the model on a multi-object dataset constructed from the open-sourced SA-1B +dataset. To mitigate object confusion and reduce training costs, we propose an +object quality score to estimate the image quality for the selection of +high-quality training samples. Furthermore, our weighted-merge training +framework can be employed on single-object generation when a single object has +multiple reference images. The experiments verify that our method achieves +superior performance to the state-of-the-arts on the Concept101 dataset and +DreamBooth dataset of multi-object personalized image generation, and +remarkably improves the performance on single-object personalized image +generation. Our code is available at https://github.com/hqhQAQ/MIP-Adapter. + +
+
+
+
+
+ + ☆ WaSt-3D: Wasserstein-2 Distance for Scene-to-Scene Stylization on 3D + Gaussians + + +
+ While style transfer techniques have been well-developed for 2D image +stylization, the extension of these methods to 3D scenes remains relatively +unexplored. Existing approaches demonstrate proficiency in transferring colors +and textures but often struggle with replicating the geometry of the scenes. In +our work, we leverage an explicit Gaussian Splatting (GS) representation and +directly match the distributions of Gaussians between style and content scenes +using the Earth Mover's Distance (EMD). By employing the entropy-regularized +Wasserstein-2 distance, we ensure that the transformation maintains spatial +smoothness. Additionally, we decompose the scene stylization problem into +smaller chunks to enhance efficiency. This paradigm shift reframes stylization +from a pure generative process driven by latent space losses to an explicit +matching of distributions between two Gaussian representations. Our method +achieves high-resolution 3D stylization by faithfully transferring details from +3D style scenes onto the content scene. Furthermore, WaSt-3D consistently +delivers results across diverse content and style scenes without necessitating +any training, as it relies solely on optimization-based techniques. See our +project page for additional results and source code: +$\href{https://compvis.github.io/wast3d/}{https://compvis.github.io/wast3d/}$. + +
+
+
+
+
+ + ☆ LKA-ReID:Vehicle Re-Identification with Large Kernel Attention ICASSP 2025 + + +
+ With the rapid development of intelligent transportation systems and the +popularity of smart city infrastructure, Vehicle Re-ID technology has become an +important research field. The vehicle Re-ID task faces an important challenge, +which is the high similarity between different vehicles. Existing methods use +additional detection or segmentation models to extract differentiated local +features. However, these methods either rely on additional annotations or +greatly increase the computational cost. Using attention mechanism to capture +global and local features is crucial to solve the challenge of high similarity +between classes in vehicle Re-ID tasks. In this paper, we propose LKA-ReID with +large kernel attention. Specifically, the large kernel attention (LKA) utilizes +the advantages of self-attention and also benefits from the advantages of +convolution, which can extract the global and local features of the vehicle +more comprehensively. We also introduce hybrid channel attention (HCA) combines +channel attention with spatial information, so that the model can better focus +on channels and feature regions, and ignore background and other disturbing +information. Experiments on VeRi-776 dataset demonstrated the effectiveness of +LKA-ReID, with mAP reaches 86.65% and Rank-1 reaches 98.03%. + +
+
+ comment: The paper is under consideration at 2025 IEEE International + Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025) +
+
+
+
+
+ + ☆ Self-supervised Monocular Depth Estimation with Large Kernel Attention ICASSP 2025 + + +
+ Self-supervised monocular depth estimation has emerged as a promising +approach since it does not rely on labeled training data. Most methods combine +convolution and Transformer to model long-distance dependencies to estimate +depth accurately. However, Transformer treats 2D image features as 1D +sequences, and positional encoding somewhat mitigates the loss of spatial +information between different feature blocks, tending to overlook channel +features, which limit the performance of depth estimation. In this paper, we +propose a self-supervised monocular depth estimation network to get finer +details. Specifically, we propose a decoder based on large kernel attention, +which can model long-distance dependencies without compromising the +two-dimension structure of features while maintaining feature channel +adaptivity. In addition, we introduce a up-sampling module to accurately +recover the fine details in the depth map. Our method achieves competitive +results on the KITTI dataset. + +
+
+ comment: The paper is under consideration at 2025 IEEE International + Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025) +
+
+
+
+
+ + ☆ Upper-Body Pose-based Gaze Estimation for Privacy-Preserving 3D Gaze + Target Detection ECCV 2024 + + +
+ Gaze Target Detection (GTD), i.e., determining where a person is looking +within a scene from an external viewpoint, is a challenging task, particularly +in 3D space. Existing approaches heavily rely on analyzing the person's +appearance, primarily focusing on their face to predict the gaze target. This +paper presents a novel approach to tackle this problem by utilizing the +person's upper-body pose and available depth maps to extract a 3D gaze +direction and employing a multi-stage or an end-to-end pipeline to predict the +gazed target. When predicted accurately, the human body pose can provide +valuable information about the head pose, which is a good approximation of the +gaze direction, as well as the position of the arms and hands, which are linked +to the activity the person is performing and the objects they are likely +focusing on. Consequently, in addition to performing gaze estimation in 3D, we +are also able to perform GTD simultaneously. We demonstrate state-of-the-art +results on the most comprehensive publicly accessible 3D gaze target detection +dataset without requiring images of the person's face, thus promoting privacy +preservation in various application contexts. The code is available at +https://github.com/intelligolabs/privacy-gtd-3D. + +
+
+ comment: Accepted in the T-CAP workshop at ECCV 2024 +
+
+
+
+
+ + ☆ Self-Distilled Depth Refinement with Noisy Poisson Fusion NeurIPS 2024 + + +
+ Depth refinement aims to infer high-resolution depth with fine-grained edges +and details, refining low-resolution results of depth estimation models. The +prevailing methods adopt tile-based manners by merging numerous patches, which +lacks efficiency and produces inconsistency. Besides, prior arts suffer from +fuzzy depth boundaries and limited generalizability. Analyzing the fundamental +reasons for these limitations, we model depth refinement as a noisy Poisson +fusion problem with local inconsistency and edge deformation noises. We propose +the Self-distilled Depth Refinement (SDDR) framework to enforce robustness +against the noises, which mainly consists of depth edge representation and +edge-based guidance. With noisy depth predictions as input, SDDR generates +low-noise depth edge representations as pseudo-labels by coarse-to-fine +self-distillation. Edge-based guidance with edge-guided gradient loss and +edge-based fusion loss serves as the optimization objective equivalent to +Poisson fusion. When depth maps are better refined, the labels also become more +noise-free. Our model can acquire strong robustness to the noises, achieving +significant improvements in accuracy, edge quality, efficiency, and +generalizability on five different benchmarks. Moreover, directly training +another model with edge labels produced by SDDR brings improvements, suggesting +that our method could help with training robust refinement models in future +works. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Visualization of Age Distributions as Elements of Medical Data-Stories + + +
+ In various fields, including medicine, age distributions are crucial. Despite +widespread media coverage of health topics, there remains a need to enhance +health communication. Narrative medical visualization is promising for +improving information comprehension and retention. This study explores the most +effective ways to present age distributions of diseases through narrative +visualizations. We conducted a thorough analysis of existing visualizations, +held workshops with a broad audience, and reviewed relevant literature. From +this, we identified design choices focusing on comprehension, aesthetics, +engagement, and memorability. We specifically tested three pictogram variants: +pictograms as bars, stacked pictograms, and annotations. After evaluating 18 +visualizations with 72 participants and three expert reviews, we determined +that annotations were most effective for comprehension and aesthetics. However, +traditional bar charts were preferred for engagement, and other variants were +more memorable. The study provides a set of design recommendations based on +these insights. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ A New Dataset for Monocular Depth Estimation Under Viewpoint Shifts ECCV 2024 + + +
+ Monocular depth estimation is a critical task for autonomous driving and many +other computer vision applications. While significant progress has been made in +this field, the effects of viewpoint shifts on depth estimation models remain +largely underexplored. This paper introduces a novel dataset and evaluation +methodology to quantify the impact of different camera positions and +orientations on monocular depth estimation performance. We propose a ground +truth strategy based on homography estimation and object detection, eliminating +the need for expensive lidar sensors. We collect a diverse dataset of road +scenes from multiple viewpoints and use it to assess the robustness of a modern +depth estimation model to geometric shifts. After assessing the validity of our +strategy on a public dataset, we provide valuable insights into the limitations +of current models and highlight the importance of considering viewpoint +variations in real-world applications. + +
+
+ comment: 17 pages, 5 figures. Accepted at ECCV 2024 2nd Workshop on + Vision-Centric Autonomous Driving (VCAD) +
+
+
+
+
+ + ☆ Unsupervised Learning Based Multi-Scale Exposure Fusion + + +
+ Unsupervised learning based multi-scale exposure fusion (ULMEF) is efficient +for fusing differently exposed low dynamic range (LDR) images into a higher +quality LDR image for a high dynamic range (HDR) scene. Unlike supervised +learning, loss functions play a crucial role in the ULMEF. In this paper, novel +loss functions are proposed for the ULMEF and they are defined by using all the +images to be fused and other differently exposed images from the same HDR +scene. The proposed loss functions can guide the proposed ULMEF to learn more +reliable information from the HDR scene than existing loss functions which are +defined by only using the set of images to be fused. As such, the quality of +the fused image is significantly improved. The proposed ULMEF also adopts a +multi-scale strategy that includes a multi-scale attention module to +effectively preserve the scene depth and local contrast in the fused image. +Meanwhile, the proposed ULMEF can be adopted to achieve exposure interpolation +and exposure extrapolation. Extensive experiments show that the proposed ULMEF +algorithm outperforms state-of-the-art exposure fusion algorithms. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Kendall's $τ$ Coefficient for Logits Distillation + + +
+ Knowledge distillation typically employs the Kullback-Leibler (KL) divergence +to constrain the student model's output to match the soft labels provided by +the teacher model exactly. However, sometimes the optimization direction of the +KL divergence loss is not always aligned with the task loss, where a smaller KL +divergence could lead to erroneous predictions that diverge from the soft +labels. This limitation often results in suboptimal optimization for the +student. Moreover, even under temperature scaling, the KL divergence loss +function tends to overly focus on the larger-valued channels in the logits, +disregarding the rich inter-class information provided by the multitude of +smaller-valued channels. This hard constraint proves too challenging for +lightweight students, hindering further knowledge distillation. To address this +issue, we propose a plug-and-play ranking loss based on Kendall's $\tau$ +coefficient, called Rank-Kendall Knowledge Distillation (RKKD). RKKD balances +the attention to smaller-valued channels by constraining the order of channel +values in student logits, providing more inter-class relational information. +The rank constraint on the top-valued channels helps avoid suboptimal traps +during optimization. We also discuss different differentiable forms of +Kendall's $\tau$ coefficient and demonstrate that the proposed ranking loss +function shares a consistent optimization objective with the KL divergence. +Extensive experiments on the CIFAR-100 and ImageNet datasets show that our RKKD +can enhance the performance of various knowledge distillation baselines and +offer broad improvements across multiple teacher-student architecture +combinations. + +
+
+
+
+
+ + ☆ Cascade Prompt Learning for Vision-Language Model Adaptation ECCV2024 + + +
+ Prompt learning has surfaced as an effective approach to enhance the +performance of Vision-Language Models (VLMs) like CLIP when applied to +downstream tasks. However, current learnable prompt tokens are primarily used +for the single phase of adapting to tasks (i.e., adapting prompt), easily +leading to overfitting risks. In this work, we propose a novel Cascade Prompt +Learning CasPL framework to enable prompt learning to serve both generic and +specific expertise (i.e., boosting and adapting prompt) simultaneously. +Specifically, CasPL is a new learning paradigm comprising two distinct phases +of learnable prompts: the first boosting prompt is crafted to extract +domain-general knowledge from a senior larger CLIP teacher model by aligning +their predicted logits using extensive unlabeled domain images. The second +adapting prompt is then cascaded with the frozen first set to fine-tune the +downstream tasks, following the approaches employed in prior research. In this +manner, CasPL can effectively capture both domain-general and task-specific +representations into explicitly different gradual groups of prompts, thus +potentially alleviating overfitting issues in the target domain. It's worth +noting that CasPL serves as a plug-and-play module that can seamlessly +integrate into any existing prompt learning approach. CasPL achieves a +significantly better balance between performance and inference speed, which is +especially beneficial for deploying smaller VLM models in resource-constrained +environments. Compared to the previous state-of-the-art method PromptSRC, CasPL +shows an average improvement of 1.85% for base classes, 3.44% for novel +classes, and 2.72% for the harmonic mean over 11 image classification datasets. +Code is publicly available at: https://github.com/megvii-research/CasPL. + +
+
+ comment: ECCV2024 +
+
+
+
+
+ + ☆ Reblurring-Guided Single Image Defocus Deblurring: A Learning Framework + with Misaligned Training Pairs + + +
+ For single image defocus deblurring, acquiring well-aligned training pairs +(or training triplets), i.e., a defocus blurry image, an all-in-focus sharp +image (and a defocus blur map), is an intricate task for the development of +deblurring models. Existing image defocus deblurring methods typically rely on +training data collected by specialized imaging equipment, presupposing that +these pairs or triplets are perfectly aligned. However, in practical scenarios +involving the collection of real-world data, direct acquisition of training +triplets is infeasible, and training pairs inevitably encounter spatial +misalignment issues. In this work, we introduce a reblurring-guided learning +framework for single image defocus deblurring, enabling the learning of a +deblurring network even with misaligned training pairs. Specifically, we first +propose a baseline defocus deblurring network that utilizes spatially varying +defocus blur map as degradation prior to enhance the deblurring performance. +Then, to effectively learn the baseline defocus deblurring network with +misaligned training pairs, our reblurring module ensures spatial consistency +between the deblurred image, the reblurred image and the input blurry image by +reconstructing spatially variant isotropic blur kernels. Moreover, the +spatially variant blur derived from the reblurring module can serve as pseudo +supervision for defocus blur map during training, interestingly transforming +training pairs into training triplets. Additionally, we have collected a new +dataset specifically for single image defocus deblurring (SDD) with typical +misalignments, which not only substantiates our proposed method but also serves +as a benchmark for future research. + +
+
+ comment: The source code and dataset are available at + https://github.com/ssscrystal/Reblurring-guided-JDRL +
+
+
+
+
+ + ☆ CASPFormer: Trajectory Prediction from BEV Images with Deformable + Attention ICPR 2024 + + +
+ Motion prediction is an important aspect for Autonomous Driving (AD) and +Advance Driver Assistance Systems (ADAS). Current state-of-the-art motion +prediction methods rely on High Definition (HD) maps for capturing the +surrounding context of the ego vehicle. Such systems lack scalability in +real-world deployment as HD maps are expensive to produce and update in +real-time. To overcome this issue, we propose Context Aware Scene Prediction +Transformer (CASPFormer), which can perform multi-modal motion prediction from +rasterized Bird-Eye-View (BEV) images. Our system can be integrated with any +upstream perception module that is capable of generating BEV images. Moreover, +CASPFormer directly decodes vectorized trajectories without any postprocessing. +Trajectories are decoded recurrently using deformable attention, as it is +computationally efficient and provides the network with the ability to focus +its attention on the important spatial locations of the BEV images. In +addition, we also address the issue of mode collapse for generating multiple +scene-consistent trajectories by incorporating learnable mode queries. We +evaluate our model on the nuScenes dataset and show that it reaches +state-of-the-art across multiple metrics + +
+
+ comment: Under Review at ICPR 2024, Kolkata +
+
+
+
+
+ + ☆ Taming Diffusion Prior for Image Super-Resolution with Domain Shift SDEs NeurIPS 2024 + + +
+ Diffusion-based image super-resolution (SR) models have attracted substantial +interest due to their powerful image restoration capabilities. However, +prevailing diffusion models often struggle to strike an optimal balance between +efficiency and performance. Typically, they either neglect to exploit the +potential of existing extensive pretrained models, limiting their generative +capacity, or they necessitate a dozens of forward passes starting from random +noises, compromising inference efficiency. In this paper, we present DoSSR, a +Domain Shift diffusion-based SR model that capitalizes on the generative powers +of pretrained diffusion models while significantly enhancing efficiency by +initiating the diffusion process with low-resolution (LR) images. At the core +of our approach is a domain shift equation that integrates seamlessly with +existing diffusion models. This integration not only improves the use of +diffusion prior but also boosts inference efficiency. Moreover, we advance our +method by transitioning the discrete shift process to a continuous formulation, +termed as DoS-SDEs. This advancement leads to the fast and customized solvers +that further enhance sampling efficiency. Empirical results demonstrate that +our proposed method achieves state-of-the-art performance on synthetic and +real-world datasets, while notably requiring only 5 sampling steps. Compared to +previous diffusion prior based methods, our approach achieves a remarkable +speedup of 5-7 times, demonstrating its superior efficiency. Code: +https://github.com/QinpengCui/DoSSR. + +
+
+ comment: This paper is accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Harnessing Shared Relations via Multimodal Mixup Contrastive Learning + for Multimodal Classification + + +
+ Deep multimodal learning has shown remarkable success by leveraging +contrastive learning to capture explicit one-to-one relations across +modalities. However, real-world data often exhibits shared relations beyond +simple pairwise associations. We propose M3CoL, a Multimodal Mixup Contrastive +Learning approach to capture nuanced shared relations inherent in multimodal +data. Our key contribution is a Mixup-based contrastive loss that learns robust +representations by aligning mixed samples from one modality with their +corresponding samples from other modalities thereby capturing shared relations +between them. For multimodal classification tasks, we introduce a framework +that integrates a fusion module with unimodal prediction modules for auxiliary +supervision during training, complemented by our proposed Mixup-based +contrastive loss. Through extensive experiments on diverse datasets (N24News, +ROSMAP, BRCA, and Food-101), we demonstrate that M3CoL effectively captures +shared multimodal relations and generalizes across domains. It outperforms +state-of-the-art methods on N24News, ROSMAP, and BRCA, while achieving +comparable performance on Food-101. Our work highlights the significance of +learning shared relations for robust multimodal learning, opening up promising +avenues for future research. + +
+
+ comment: RK and RS contributed equally to this work, 20 Pages, 8 Figures, 9 + Tables +
+
+
+
+
+ + ☆ UNICORN: A Deep Learning Model for Integrating Multi-Stain Data in + Histopathology + + +
+ Background: The integration of multi-stain histopathology images through deep +learning poses a significant challenge in digital histopathology. Current +multi-modal approaches struggle with data heterogeneity and missing data. This +study aims to overcome these limitations by developing a novel transformer +model for multi-stain integration that can handle missing data during training +as well as inference. Methods: We propose UNICORN (UNiversal modality +Integration Network for CORonary classificatioN) a multi-modal transformer +capable of processing multi-stain histopathology for atherosclerosis severity +class prediction. The architecture comprises a two-stage, end-to-end trainable +model with specialized modules utilizing transformer self-attention blocks. The +initial stage employs domain-specific expert modules to extract features from +each modality. In the subsequent stage, an aggregation expert module integrates +these features by learning the interactions between the different data +modalities. Results: Evaluation was performed using a multi-class dataset of +atherosclerotic lesions from the Munich Cardiovascular Studies Biobank +(MISSION), using over 4,000 paired multi-stain whole slide images (WSIs) from +170 deceased individuals on 7 prespecified segments of the coronary tree, each +stained according to four histopathological protocols. UNICORN achieved a +classification accuracy of 0.67, outperforming other state-of-the-art models. +The model effectively identifies relevant tissue phenotypes across stainings +and implicitly models disease progression. Conclusion: Our proposed multi-modal +transformer model addresses key challenges in medical data analysis, including +data heterogeneity and missing modalities. Explainability and the model's +effectiveness in predicting atherosclerosis progression underscores its +potential for broader applications in medical research. + +
+
+
+
+
+ + ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50\% of papers do not +assess performance variability at all. Moreover, only one (0.5\%) paper +reported confidence intervals (CIs) for model performance. (2) To address the +reporting bottleneck, we show that the unreported standard deviation (SD) in +segmentation papers can be approximated by a second-order polynomial function +of the mean Dice similarity coefficient (DSC). Based on external validation +data from 56 previous MICCAI challenges, we demonstrate that this approximation +can accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95\% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60\% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ☆ LGFN: Lightweight Light Field Image Super-Resolution using Local + Convolution Modulation and Global Attention Feature Extraction + + +
+ Capturing different intensity and directions of light rays at the same scene +Light field (LF) can encode the 3D scene cues into a 4D LF image which has a +wide range of applications (i.e. post-capture refocusing and depth sensing). LF +image super-resolution (SR) aims to improve the image resolution limited by the +performance of LF camera sensor. Although existing methods have achieved +promising results the practical application of these models is limited because +they are not lightweight enough. In this paper we propose a lightweight model +named LGFN which integrates the local and global features of different views +and the features of different channels for LF image SR. Specifically owing to +neighboring regions of the same pixel position in different sub-aperture images +exhibit similar structural relationships we design a lightweight CNN-based +feature extraction module (namely DGCE) to extract local features better +through feature modulation. Meanwhile as the position beyond the boundaries in +the LF image presents a large disparity we propose an efficient spatial +attention module (namely ESAM) which uses decomposable large-kernel convolution +to obtain an enlarged receptive field and an efficient channel attention module +(namely ECAM). Compared with the existing LF image SR models with large +parameter our model has a parameter of 0.45M and a FLOPs of 19.33G which has +achieved a competitive effect. Extensive experiments with ablation studies +demonstrate the effectiveness of our proposed method which ranked the second +place in the Track 2 Fidelity & Efficiency of NTIRE2024 Light Field Super +Resolution Challenge and the seventh place in the Track 1 Fidelity. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Text Image Generation for Low-Resource Languages with Dual Translation + Learning + + +
+ Scene text recognition in low-resource languages frequently faces challenges +due to the limited availability of training datasets derived from real-world +scenes. This study proposes a novel approach that generates text images in +low-resource languages by emulating the style of real text images from +high-resource languages. Our approach utilizes a diffusion model that is +conditioned on binary states: ``synthetic'' and ``real.'' The training of this +model involves dual translation tasks, where it transforms plain text images +into either synthetic or real text images, based on the binary states. This +approach not only effectively differentiates between the two domains but also +facilitates the model's explicit recognition of characters in the target +language. Furthermore, to enhance the accuracy and variety of generated text +images, we introduce two guidance techniques: Fidelity-Diversity Balancing +Guidance and Fidelity Enhancement Guidance. Our experimental results +demonstrate that the text images generated by our proposed framework can +significantly improve the performance of scene text recognition models for +low-resource languages. + +
+
+ comment: 23 pages, 11 figures +
+
+
+
+
+ + ☆ AnyLogo: Symbiotic Subject-Driven Diffusion System with Gemini Status + + +
+ Diffusion models have made compelling progress on facilitating +high-throughput daily production. Nevertheless, the appealing customized +requirements are remain suffered from instance-level finetuning for authentic +fidelity. Prior zero-shot customization works achieve the semantic consistence +through the condensed injection of identity features, while addressing detailed +low-level signatures through complex model configurations and subject-specific +fabrications, which significantly break the statistical coherence within the +overall system and limit the applicability across various scenarios. To +facilitate the generic signature concentration with rectified efficiency, we +present \textbf{AnyLogo}, a zero-shot region customizer with remarkable detail +consistency, building upon the symbiotic diffusion system with eliminated +cumbersome designs. Streamlined as vanilla image generation, we discern that +the rigorous signature extraction and creative content generation are +promisingly compatible and can be systematically recycled within a single +denoising model. In place of the external configurations, the gemini status of +the denoising model promote the reinforced subject transmission efficiency and +disentangled semantic-signature space with continuous signature decoration. +Moreover, the sparse recycling paradigm is adopted to prevent the duplicated +risk with compressed transmission quota for diversified signature stimulation. +Extensive experiments on constructed logo-level benchmarks demonstrate the +effectiveness and practicability of our methods. + +
+
+ comment: 13 pages, 12 figures +
+
+
+
+
+ + ☆ Neural Implicit Representation for Highly Dynamic LiDAR Mapping and + Odometry + + +
+ Recent advancements in Simultaneous Localization and Mapping (SLAM) have +increasingly highlighted the robustness of LiDAR-based techniques. At the same +time, Neural Radiance Fields (NeRF) have introduced new possibilities for 3D +scene reconstruction, exemplified by SLAM systems. Among these, NeRF-LOAM has +shown notable performance in NeRF-based SLAM applications. However, despite its +strengths, these systems often encounter difficulties in dynamic outdoor +environments due to their inherent static assumptions. To address these +limitations, this paper proposes a novel method designed to improve +reconstruction in highly dynamic outdoor scenes. Based on NeRF-LOAM, the +proposed approach consists of two primary components. First, we separate the +scene into static background and dynamic foreground. By identifying and +excluding dynamic elements from the mapping process, this segmentation enables +the creation of a dense 3D map that accurately represents the static background +only. The second component extends the octree structure to support +multi-resolution representation. This extension not only enhances +reconstruction quality but also aids in the removal of dynamic objects +identified by the first module. Additionally, Fourier feature encoding is +applied to the sampled points, capturing high-frequency information and leading +to more complete reconstruction results. Evaluations on various datasets +demonstrate that our method achieves more competitive results compared to +current state-of-the-art approaches. + +
+
+
+
+
+ + ☆ AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with + Alternative Modality Masking NeurIPS 2024 + + +
+ Camera-LiDAR fusion models significantly enhance perception performance in +autonomous driving. The fusion mechanism leverages the strengths of each +modality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR +fusion models utilize pre-trained backbones for efficient training. However, we +argue that directly loading single-modal pre-trained camera and LiDAR backbones +into camera-LiDAR fusion models introduces similar feature redundancy across +modalities due to the nature of the fusion mechanism. Unfortunately, existing +pruning methods are developed explicitly for single-modal models, and thus, +they struggle to effectively identify these specific redundant parameters in +camera-LiDAR fusion models. In this paper, to address the issue above on +camera-LiDAR fusion models, we propose a novelty pruning framework Alternative +Modality Masking Pruning (AlterMOMA), which employs alternative masking on each +modality and identifies the redundant parameters. Specifically, when one +modality parameters are masked (deactivated), the absence of features from the +masked backbone compels the model to reactivate previous redundant features of +the other modality backbone. Therefore, these redundant features and relevant +redundant parameters can be identified via the reactivation process. The +redundant parameters can be pruned by our proposed importance score evaluation +function, Alternative Evaluation (AlterEva), which is based on the observation +of the loss changes when certain modality parameters are activated and +deactivated. Extensive experiments on the nuScene and KITTI datasets +encompassing diverse tasks, baseline models, and pruning algorithms showcase +that AlterMOMA outperforms existing pruning methods, attaining state-of-the-art +performance. + +
+
+ comment: 17 pages, 3 figures, Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Robotic-CLIP: Fine-tuning CLIP on Action Data for Robotic Applications + + +
+ Vision language models have played a key role in extracting meaningful +features for various robotic applications. Among these, Contrastive +Language-Image Pretraining (CLIP) is widely used in robotic tasks that require +both vision and natural language understanding. However, CLIP was trained +solely on static images paired with text prompts and has not yet been fully +adapted for robotic tasks involving dynamic actions. In this paper, we +introduce Robotic-CLIP to enhance robotic perception capabilities. We first +gather and label large-scale action data, and then build our Robotic-CLIP by +fine-tuning CLIP on 309,433 videos (~7.4 million frames) of action data using +contrastive learning. By leveraging action data, Robotic-CLIP inherits CLIP's +strong image performance while gaining the ability to understand actions in +robotic contexts. Intensive experiments show that our Robotic-CLIP outperforms +other CLIP-based models across various language-driven robotic tasks. +Additionally, we demonstrate the practical effectiveness of Robotic-CLIP in +real-world grasping applications. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Scene Understanding in Pick-and-Place Tasks: Analyzing Transformations + Between Initial and Final Scenes + + +
+ With robots increasingly collaborating with humans in everyday tasks, it is +important to take steps toward robotic systems capable of understanding the +environment. This work focuses on scene understanding to detect pick and place +tasks given initial and final images from the scene. To this end, a dataset is +collected for object detection and pick and place task detection. A YOLOv5 +network is subsequently trained to detect the objects in the initial and final +scenes. Given the detected objects and their bounding boxes, two methods are +proposed to detect the pick and place tasks which transform the initial scene +into the final scene. A geometric method is proposed which tracks objects' +movements in the two scenes and works based on the intersection of the bounding +boxes which moved within scenes. Contrarily, the CNN-based method utilizes a +Convolutional Neural Network to classify objects with intersected bounding +boxes into 5 classes, showing the spatial relationship between the involved +objects. The performed pick and place tasks are then derived from analyzing the +experiments with both scenes. Results show that the CNN-based method, using a +VGG16 backbone, outscores the geometric method by roughly 12 percentage points +in certain scenarios, with an overall success rate of 84.3%. + +
+
+ comment: Conference Paper, ICEE 2024, 7 pages, 5 figures +
+
+
+
+
+ + ☆ Behaviour4All: in-the-wild Facial Behaviour Analysis Toolkit + + +
+ In this paper, we introduce Behavior4All, a comprehensive, open-source +toolkit for in-the-wild facial behavior analysis, integrating Face +Localization, Valence-Arousal Estimation, Basic Expression Recognition and +Action Unit Detection, all within a single framework. Available in both +CPU-only and GPU-accelerated versions, Behavior4All leverages 12 large-scale, +in-the-wild datasets consisting of over 5 million images from diverse +demographic groups. It introduces a novel framework that leverages distribution +matching and label co-annotation to address tasks with non-overlapping +annotations, encoding prior knowledge of their relatedness. In the largest +study of its kind, Behavior4All outperforms both state-of-the-art and toolkits +in overall performance as well as fairness across all databases and tasks. It +also demonstrates superior generalizability on unseen databases and on compound +expression recognition. Finally, Behavior4All is way times faster than other +toolkits. + +
+
+
+
+
+ + ☆ MoGenTS: Motion Generation based on Spatial-Temporal Joint Modeling NeurIPS 2024 + + +
+ Motion generation from discrete quantization offers many advantages over +continuous regression, but at the cost of inevitable approximation errors. +Previous methods usually quantize the entire body pose into one code, which not +only faces the difficulty in encoding all joints within one vector but also +loses the spatial relationship between different joints. Differently, in this +work we quantize each individual joint into one vector, which i) simplifies the +quantization process as the complexity associated with a single joint is +markedly lower than that of the entire pose; ii) maintains a spatial-temporal +structure that preserves both the spatial relationships among joints and the +temporal movement patterns; iii) yields a 2D token map, which enables the +application of various 2D operations widely used in 2D images. Grounded in the +2D motion quantization, we build a spatial-temporal modeling framework, where +2D joint VQVAE, temporal-spatial 2D masking technique, and spatial-temporal 2D +attention are proposed to take advantage of spatial-temporal signals among the +2D tokens. Extensive experiments demonstrate that our method significantly +outperforms previous methods across different datasets, with a $26.6\%$ +decrease of FID on HumanML3D and a $29.9\%$ decrease on KIT-ML. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Dark Miner: Defend against unsafe generation for text-to-image diffusion + models + + +
+ Text-to-image diffusion models have been demonstrated with unsafe generation +due to unfiltered large-scale training data, such as violent, sexual, and +shocking images, necessitating the erasure of unsafe concepts. Most existing +methods focus on modifying the generation probabilities conditioned on the +texts containing unsafe descriptions. However, they fail to guarantee safe +generation for unseen texts in the training phase, especially for the prompts +from adversarial attacks. In this paper, we re-analyze the erasure task and +point out that existing methods cannot guarantee the minimization of the total +probabilities of unsafe generation. To tackle this problem, we propose Dark +Miner. It entails a recurring three-stage process that comprises mining, +verifying, and circumventing. It greedily mines embeddings with maximum +generation probabilities of unsafe concepts and reduces unsafe generation more +effectively. In the experiments, we evaluate its performance on two +inappropriate concepts, two objects, and two styles. Compared with 6 previous +state-of-the-art methods, our method achieves better erasure and defense +results in most cases, especially under 4 state-of-the-art attacks, while +preserving the model's native generation capability. Our code will be available +on GitHub. + +
+
+
+
+
+ + ☆ Event-based Stereo Depth Estimation: A Survey + + +
+ Stereopsis has widespread appeal in robotics as it is the predominant way by +which living beings perceive depth to navigate our 3D world. Event cameras are +novel bio-inspired sensors that detect per-pixel brightness changes +asynchronously, with very high temporal resolution and high dynamic range, +enabling machine perception in high-speed motion and broad illumination +conditions. The high temporal precision also benefits stereo matching, making +disparity (depth) estimation a popular research area for event cameras ever +since its inception. Over the last 30 years, the field has evolved rapidly, +from low-latency, low-power circuit design to current deep learning (DL) +approaches driven by the computer vision community. The bibliography is vast +and difficult to navigate for non-experts due its highly interdisciplinary +nature. Past surveys have addressed distinct aspects of this topic, in the +context of applications, or focusing only on a specific class of techniques, +but have overlooked stereo datasets. This survey provides a comprehensive +overview, covering both instantaneous stereo and long-term methods suitable for +simultaneous localization and mapping (SLAM), along with theoretical and +empirical comparisons. It is the first to extensively review DL methods as well +as stereo datasets, even providing practical suggestions for creating new +benchmarks to advance the field. The main advantages and challenges faced by +event-based stereo depth estimation are also discussed. Despite significant +progress, challenges remain in achieving optimal performance in not only +accuracy but also efficiency, a cornerstone of event-based computing. We +identify several gaps and propose future research directions. We hope this +survey inspires future research in this area, by serving as an accessible entry +point for newcomers, as well as a practical guide for seasoned researchers in +the community. + +
+
+ comment: 28 pages, 20 figures, 7 tables +
+
+
+
+
+ + ☆ EM-Net: Efficient Channel and Frequency Learning with Mamba for 3D + Medical Image Segmentation MICCAI 2024 + + +
+ Convolutional neural networks have primarily led 3D medical image +segmentation but may be limited by small receptive fields. Transformer models +excel in capturing global relationships through self-attention but are +challenged by high computational costs at high resolutions. Recently, Mamba, a +state space model, has emerged as an effective approach for sequential +modeling. Inspired by its success, we introduce a novel Mamba-based 3D medical +image segmentation model called EM-Net. It not only efficiently captures +attentive interaction between regions by integrating and selecting channels, +but also effectively utilizes frequency domain to harmonize the learning of +features across varying scales, while accelerating training speed. +Comprehensive experiments on two challenging multi-organ datasets with other +state-of-the-art (SOTA) algorithms show that our method exhibits better +segmentation accuracy while requiring nearly half the parameter size of SOTA +models and 2x faster training speed. + +
+
+ comment: 10 pages, 3 figures, accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ Self-Supervised Learning of Deviation in Latent Representation for + Co-speech Gesture Video Generation + + +
+ Gestures are pivotal in enhancing co-speech communication. While recent works +have mostly focused on point-level motion transformation or fully supervised +motion representations through data-driven approaches, we explore the +representation of gestures in co-speech, with a focus on self-supervised +representation and pixel-level motion deviation, utilizing a diffusion model +which incorporates latent motion features. Our approach leverages +self-supervised deviation in latent representation to facilitate hand gestures +generation, which are crucial for generating realistic gesture videos. Results +of our first experiment demonstrate that our method enhances the quality of +generated videos, with an improvement from 2.7 to 4.5% for FGD, DIV, and FVD, +and 8.1% for PSNR, 2.5% for SSIM over the current state-of-the-art methods. + +
+
+ comment: 5 pages, 5 figures, conference +
+
+
+
+
+ + ☆ Leveraging Anthropometric Measurements to Improve Human Mesh Estimation + and Ensure Consistent Body Shapes + + +
+ The basic body shape of a person does not change within a single video. +However, most SOTA human mesh estimation (HME) models output a slightly +different body shape for each video frame, which results in inconsistent body +shapes for the same person. In contrast, we leverage anthropometric +measurements like tailors are already obtaining from humans for centuries. We +create a model called A2B that converts such anthropometric measurements to +body shape parameters of human mesh models. Moreover, we find that finetuned +SOTA 3D human pose estimation (HPE) models outperform HME models regarding the +precision of the estimated keypoints. We show that applying inverse kinematics +(IK) to the results of such a 3D HPE model and combining the resulting body +pose with the A2B body shape leads to superior and consistent human meshes for +challenging datasets like ASPset or fit3D, where we can lower the MPJPE by over +30 mm compared to SOTA HME models. Further, replacing HME models estimates of +the body shape parameters with A2B model results not only increases the +performance of these HME models, but also leads to consistent body shapes. + +
+
+
+
+
+ + ☆ Explanation Bottleneck Models + + +
+ Recent concept-based interpretable models have succeeded in providing +meaningful explanations by pre-defined concept sets. However, the dependency on +the pre-defined concepts restricts the application because of the limited +number of concepts for explanations. This paper proposes a novel interpretable +deep neural network called explanation bottleneck models (XBMs). XBMs generate +a text explanation from the input without pre-defined concepts and then predict +a final task prediction based on the generated explanation by leveraging +pre-trained vision-language encoder-decoder models. To achieve both the target +task performance and the explanation quality, we train XBMs through the target +task loss with the regularization penalizing the explanation decoder via the +distillation from the frozen pre-trained decoder. Our experiments, including a +comparison to state-of-the-art concept bottleneck models, confirm that XBMs +provide accurate and fluent natural language explanations without pre-defined +concept sets. Code will be available at https://github.com/yshinya6/xbm/. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Provable Performance Guarantees of Copy Detection Patterns + + +
+ Copy Detection Patterns (CDPs) are crucial elements in modern security +applications, playing a vital role in safeguarding industries such as food, +pharmaceuticals, and cosmetics. Current performance evaluations of CDPs +predominantly rely on empirical setups using simplistic metrics like Hamming +distances or Pearson correlation. These methods are often inadequate due to +their sensitivity to distortions, degradation, and their limitations to +stationary statistics of printing and imaging. Additionally, machine +learning-based approaches suffer from distribution biases and fail to +generalize to unseen counterfeit samples. Given the critical importance of CDPs +in preventing counterfeiting, including the counterfeit vaccines issue +highlighted during the COVID-19 pandemic, there is an urgent need for provable +performance guarantees across various criteria. This paper aims to establish a +theoretical framework to derive optimal criteria for the analysis, +optimization, and future development of CDP authentication technologies, +ensuring their reliability and effectiveness in diverse security scenarios. + +
+
+
+
+
+ + ☆ MECD: Unlocking Multi-Event Causal Discovery in Video Reasoning NeurIPS 2024 + + +
+ Video causal reasoning aims to achieve a high-level understanding of video +content from a causal perspective. However, current video reasoning tasks are +limited in scope, primarily executed in a question-answering paradigm and +focusing on short videos containing only a single event and simple causal +relationships, lacking comprehensive and structured causality analysis for +videos with multiple events. To fill this gap, we introduce a new task and +dataset, Multi-Event Causal Discovery (MECD). It aims to uncover the causal +relationships between events distributed chronologically across long videos. +Given visual segments and textual descriptions of events, MECD requires +identifying the causal associations between these events to derive a +comprehensive, structured event-level video causal diagram explaining why and +how the final result event occurred. To address MECD, we devise a novel +framework inspired by the Granger Causality method, using an efficient +mask-based event prediction model to perform an Event Granger Test, which +estimates causality by comparing the predicted result event when premise events +are masked versus unmasked. Furthermore, we integrate causal inference +techniques such as front-door adjustment and counterfactual inference to +address challenges in MECD like causality confounding and illusory causality. +Experiments validate the effectiveness of our framework in providing causal +relationships in multi-event videos, outperforming GPT-4o and VideoLLaVA by +5.7% and 4.1%, respectively. + +
+
+ comment: Accepted at NeurIPS 2024 as a spotlight paper +
+
+
+
+
+ + ☆ P4Q: Learning to Prompt for Quantization in Visual-language Models + + +
+ Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence +in various visual and multimodal tasks, yet the deployment of VLMs on +downstream application platforms remains challenging due to their prohibitive +requirements of training samples and computing resources. Fine-tuning and +quantization of VLMs can substantially reduce the sample and computation costs, +which are in urgent need. There are two prevailing paradigms in quantization, +Quantization-Aware Training (QAT) can effectively quantize large-scale VLMs but +incur a huge training cost, while low-bit Post-Training Quantization (PTQ) +suffers from a notable performance drop. We propose a method that balances +fine-tuning and quantization named ``Prompt for Quantization'' (P4Q), in which +we design a lightweight architecture to leverage contrastive loss supervision +to enhance the recognition performance of a PTQ model. Our method can +effectively reduce the gap between image features and text features caused by +low-bit quantization, based on learnable prompts to reorganize textual +representations and a low-bit adapter to realign the distributions of image and +text features. We also introduce a distillation loss based on cosine similarity +predictions to distill the quantized model using a full-precision teacher. +Extensive experimental results demonstrate that our P4Q method outperforms +prior arts, even achieving comparable results to its full-precision +counterparts. For instance, our 8-bit P4Q can theoretically compress the +CLIP-ViT/B-32 by 4 $\times$ while achieving 66.94\% Top-1 accuracy, +outperforming the learnable prompt fine-tuned full-precision model by 2.24\% +with negligible additional parameters on the ImageNet dataset. + +
+
+
+
+
+ + ☆ Hand-object reconstruction via interaction-aware graph attention + mechanism ICIP 2024 + + +
+ Estimating the poses of both a hand and an object has become an important +area of research due to the growing need for advanced vision computing. The +primary challenge involves understanding and reconstructing how hands and +objects interact, such as contact and physical plausibility. Existing +approaches often adopt a graph neural network to incorporate spatial +information of hand and object meshes. However, these approaches have not fully +exploited the potential of graphs without modification of edges within and +between hand- and object-graphs. We propose a graph-based refinement method +that incorporates an interaction-aware graph-attention mechanism to account for +hand-object interactions. Using edges, we establish connections among closely +correlated nodes, both within individual graphs and across different graphs. +Experiments demonstrate the effectiveness of our proposed method with notable +improvements in the realm of physical plausibility. + +
+
+ comment: 7 pages, Accepted by ICIP 2024 +
+
+
+
+
+ + ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic dataset that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. + +
+
+
+
+
+ + ☆ ZALM3: Zero-Shot Enhancement of Vision-Language Alignment via In-Context + Information in Multi-Turn Multimodal Medical Dialogue + + +
+ The rocketing prosperity of large language models (LLMs) in recent years has +boosted the prevalence of vision-language models (VLMs) in the medical sector. +In our online medical consultation scenario, a doctor responds to the texts and +images provided by a patient in multiple rounds to diagnose her/his health +condition, forming a multi-turn multimodal medical dialogue format. Unlike +high-quality images captured by professional equipment in traditional medical +visual question answering (Med-VQA), the images in our case are taken by +patients' mobile phones. These images have poor quality control, with issues +such as excessive background elements and the lesion area being significantly +off-center, leading to degradation of vision-language alignment in the model +training phase. In this paper, we propose ZALM3, a Zero-shot strategy to +improve vision-language ALignment in Multi-turn Multimodal Medical dialogue. +Since we observe that the preceding text conversations before an image can +infer the regions of interest (RoIs) in the image, ZALM3 employs an LLM to +summarize the keywords from the preceding context and a visual grounding model +to extract the RoIs. The updated images eliminate unnecessary background noise +and provide more effective vision-language alignment. To better evaluate our +proposed method, we design a new subjective assessment metric for multi-turn +unimodal/multimodal medical dialogue to provide a fine-grained performance +comparison. Our experiments across three different clinical departments +remarkably demonstrate the efficacy of ZALM3 with statistical significance. + +
+
+
+
+
+ + ☆ Appearance Blur-driven AutoEncoder and Motion-guided Memory Module for + Video Anomaly Detection + + +
+ Video anomaly detection (VAD) often learns the distribution of normal samples +and detects the anomaly through measuring significant deviations, but the +undesired generalization may reconstruct a few anomalies thus suppressing the +deviations. Meanwhile, most VADs cannot cope with cross-dataset validation for +new target domains, and few-shot methods must laboriously rely on model-tuning +from the target domain to complete domain adaptation. To address these +problems, we propose a novel VAD method with a motion-guided memory module to +achieve cross-dataset validation with zero-shot. First, we add Gaussian blur to +the raw appearance images, thereby constructing the global pseudo-anomaly, +which serves as the input to the network. Then, we propose multi-scale residual +channel attention to deblur the pseudo-anomaly in normal samples. Next, memory +items are obtained by recording the motion features in the training phase, +which are used to retrieve the motion features from the raw information in the +testing phase. Lastly, our method can ignore the blurred real anomaly through +attention and rely on motion memory items to increase the normality gap between +normal and abnormal motion. Extensive experiments on three benchmark datasets +demonstrate the effectiveness of the proposed method. Compared with +cross-domain methods, our method achieves competitive performance without +adaptation during testing. + +
+
+ comment: 13 pages, 11 figures +
+
+
+
+
+ + ☆ Good Data Is All Imitation Learning Needs + + +
+ In this paper, we address the limitations of traditional teacher-student +models, imitation learning, and behaviour cloning in the context of +Autonomous/Automated Driving Systems (ADS), where these methods often struggle +with incomplete coverage of real-world scenarios. To enhance the robustness of +such models, we introduce the use of Counterfactual Explanations (CFEs) as a +novel data augmentation technique for end-to-end ADS. CFEs, by generating +training samples near decision boundaries through minimal input modifications, +lead to a more comprehensive representation of expert driver strategies, +particularly in safety-critical scenarios. This approach can therefore help +improve the model's ability to handle rare and challenging driving events, such +as anticipating darting out pedestrians, ultimately leading to safer and more +trustworthy decision-making for ADS. Our experiments in the CARLA simulator +demonstrate that CF-Driver outperforms the current state-of-the-art method, +achieving a higher driving score and lower infraction rates. Specifically, +CF-Driver attains a driving score of 84.2, surpassing the previous best model +by 15.02 percentage points. These results highlight the effectiveness of +incorporating CFEs in training end-to-end ADS. To foster further research, the +CF-Driver code is made publicly available. + +
+
+
+
+
+ + ☆ TA-Cleaner: A Fine-grained Text Alignment Backdoor Defense Strategy for + Multimodal Contrastive Learning + + +
+ Pre-trained large models for multimodal contrastive learning, such as CLIP, +have been widely recognized in the industry as highly susceptible to +data-poisoned backdoor attacks. This poses significant risks to downstream +model training. In response to such potential threats, finetuning offers a +simpler and more efficient defense choice compared to retraining large models +with augmented data. In the supervised learning domain, fine-tuning defense +strategies can achieve excellent defense performance. However, in the +unsupervised and semi-supervised domain, we find that when CLIP faces some +complex attack techniques, the existing fine-tuning defense strategy, +CleanCLIP, has some limitations on defense performance. The synonym +substitution of its text-augmentation is insufficient to enhance the text +feature space. To compensate for this weakness, we improve it by proposing a +fine-grained \textbf{T}ext \textbf{A}lignment \textbf{C}leaner (TA-Cleaner) to +cut off feature connections of backdoor triggers. We randomly select a few +samples for positive and negative subtext generation at each epoch of +CleanCLIP, and align the subtexts to the images to strengthen the text +self-supervision. We evaluate the effectiveness of our TA-Cleaner against six +attack algorithms and conduct comprehensive zero-shot classification tests on +ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves +state-of-the-art defensiveness among finetuning-based defense techniques. Even +when faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms +CleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\% and 63.88\%, +respectively. + +
+
+
+
+
+ + ☆ Unifying Dimensions: A Linear Adaptive Approach to Lightweight Image + Super-Resolution + + +
+ Window-based transformers have demonstrated outstanding performance in +super-resolution tasks due to their adaptive modeling capabilities through +local self-attention (SA). However, they exhibit higher computational +complexity and inference latency than convolutional neural networks. In this +paper, we first identify that the adaptability of the Transformers is derived +from their adaptive spatial aggregation and advanced structural design, while +their high latency results from the computational costs and memory layout +transformations associated with the local SA. To simulate this aggregation +approach, we propose an effective convolution-based linear focal separable +attention (FSA), allowing for long-range dynamic modeling with linear +complexity. Additionally, we introduce an effective dual-branch structure +combined with an ultra-lightweight information exchange module (IEM) to enhance +the aggregation of information by the Token Mixer. Finally, with respect to the +structure, we modify the existing spatial-gate-based feedforward neural +networks by incorporating a self-gate mechanism to preserve high-dimensional +channel information, enabling the modeling of more complex relationships. With +these advancements, we construct a convolution-based Transformer framework +named the linear adaptive mixer network (LAMNet). Extensive experiments +demonstrate that LAMNet achieves better performance than existing SA-based +Transformer methods while maintaining the computational efficiency of +convolutional neural networks, which can achieve a \(3\times\) speedup of +inference time. The code will be publicly available at: +https://github.com/zononhzy/LAMNet. + +
+
+
+
+
+ + ☆ Improving Fast Adversarial Training via Self-Knowledge Guidance + + +
+ Adversarial training has achieved remarkable advancements in defending +against adversarial attacks. Among them, fast adversarial training (FAT) is +gaining attention for its ability to achieve competitive robustness with fewer +computing resources. Existing FAT methods typically employ a uniform strategy +that optimizes all training data equally without considering the influence of +different examples, which leads to an imbalanced optimization. However, this +imbalance remains unexplored in the field of FAT. In this paper, we conduct a +comprehensive study of the imbalance issue in FAT and observe an obvious class +disparity regarding their performances. This disparity could be embodied from a +perspective of alignment between clean and robust accuracy. Based on the +analysis, we mainly attribute the observed misalignment and disparity to the +imbalanced optimization in FAT, which motivates us to optimize different +training data adaptively to enhance robustness. Specifically, we take disparity +and misalignment into consideration. First, we introduce self-knowledge guided +regularization, which assigns differentiated regularization weights to each +class based on its training state, alleviating class disparity. Additionally, +we propose self-knowledge guided label relaxation, which adjusts label +relaxation according to the training accuracy, alleviating the misalignment and +improving robustness. By combining these methods, we formulate the +Self-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge +during training to enhance the adversarial robustness without compromising +training efficiency. Extensive experiments on four standard datasets +demonstrate that the SKG-FAT improves the robustness and preserves competitive +clean accuracy, outperforming the state-of-the-art methods. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Let the Quantum Creep In: Designing Quantum Neural Network Models by + Gradually Swapping Out Classical Components + + +
+ Artificial Intelligence (AI), with its multiplier effect and wide +applications in multiple areas, could potentially be an important application +of quantum computing. Since modern AI systems are often built on neural +networks, the design of quantum neural networks becomes a key challenge in +integrating quantum computing into AI. To provide a more fine-grained +characterisation of the impact of quantum components on the performance of +neural networks, we propose a framework where classical neural network layers +are gradually replaced by quantum layers that have the same type of input and +output while keeping the flow of information between layers unchanged, +different from most current research in quantum neural network, which favours +an end-to-end quantum model. We start with a simple three-layer classical +neural network without any normalisation layers or activation functions, and +gradually change the classical layers to the corresponding quantum versions. We +conduct numerical experiments on image classification datasets such as the +MNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of +performance brought by the systematic introduction of quantum components. +Through this framework, our research sheds new light on the design of future +quantum neural network models where it could be more favourable to search for +methods and frameworks that harness the advantages from both the classical and +quantum worlds. + +
+
+ comment: 50 pages (including Appendix), many figures, accepted as a poster on + QTML2024. Code available at + https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In +
+
+
+
+
+ + ☆ ID$^3$: Identity-Preserving-yet-Diversified Diffusion Models for + Synthetic Face Recognition NeurIPS 2024 + + +
+ Synthetic face recognition (SFR) aims to generate synthetic face datasets +that mimic the distribution of real face data, which allows for training face +recognition models in a privacy-preserving manner. Despite the remarkable +potential of diffusion models in image generation, current diffusion-based SFR +models struggle with generalization to real-world faces. To address this +limitation, we outline three key objectives for SFR: (1) promoting diversity +across identities (inter-class diversity), (2) ensuring diversity within each +identity by injecting various facial attributes (intra-class diversity), and +(3) maintaining identity consistency within each identity group (intra-class +identity preservation). Inspired by these goals, we introduce a +diffusion-fueled SFR model termed $\text{ID}^3$. $\text{ID}^3$ employs an +ID-preserving loss to generate diverse yet identity-consistent facial +appearances. Theoretically, we show that minimizing this loss is equivalent to +maximizing the lower bound of an adjusted conditional log-likelihood over +ID-preserving data. This equivalence motivates an ID-preserving sampling +algorithm, which operates over an adjusted gradient vector field, enabling the +generation of fake face recognition datasets that approximate the distribution +of real-world faces. Extensive experiments across five challenging benchmarks +validate the advantages of $\text{ID}^3$. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Flexiffusion: Segment-wise Neural Architecture Search for Flexible + Denoising Schedule + + +
+ Diffusion models are cutting-edge generative models adept at producing +diverse, high-quality images. Despite their effectiveness, these models often +require significant computational resources owing to their numerous sequential +denoising steps and the significant inference cost of each step. Recently, +Neural Architecture Search (NAS) techniques have been employed to automatically +search for faster generation processes. However, NAS for diffusion is +inherently time-consuming as it requires estimating thousands of diffusion +models to search for the optimal one. In this paper, we introduce Flexiffusion, +a novel training-free NAS paradigm designed to accelerate diffusion models by +concurrently optimizing generation steps and network structures. Specifically, +we partition the generation process into isometric step segments, each +sequentially composed of a full step, multiple partial steps, and several null +steps. The full step computes all network blocks, while the partial step +involves part of the blocks, and the null step entails no computation. +Flexiffusion autonomously explores flexible step combinations for each segment, +substantially reducing search costs and enabling greater acceleration compared +to the state-of-the-art (SOTA) method for diffusion models. Our searched models +reported speedup factors of $2.6\times$ and $1.5\times$ for the original +LDM-4-G and the SOTA, respectively. The factors for Stable Diffusion V1.5 and +the SOTA are $5.1\times$ and $2.0\times$. We also verified the performance of +Flexiffusion on multiple datasets, and positive experiment results indicate +that Flexiffusion can effectively reduce redundancy in diffusion models. + +
+
+
+
+
+ + ☆ Pixel-Space Post-Training of Latent Diffusion Models + + +
+ Latent diffusion models (LDMs) have made significant advancements in the +field of image generation in recent years. One major advantage of LDMs is their +ability to operate in a compressed latent space, allowing for more efficient +training and deployment. However, despite these advantages, challenges with +LDMs still remain. For example, it has been observed that LDMs often generate +high-frequency details and complex compositions imperfectly. We hypothesize +that one reason for these flaws is due to the fact that all pre- and +post-training of LDMs are done in latent space, which is typically $8 \times 8$ +lower spatial-resolution than the output images. To address this issue, we +propose adding pixel-space supervision in the post-training process to better +preserve high-frequency details. Experimentally, we show that adding a +pixel-space objective significantly improves both supervised quality +fine-tuning and preference-based post-training by a large margin on a +state-of-the-art DiT transformer and U-Net diffusion models in both visual +quality and visual flaw metrics, while maintaining the same text alignment +quality. + +
+
+
+
+
+ + ☆ General Compression Framework for Efficient Transformer Object Tracking + + +
+ Transformer-based trackers have established a dominant role in the field of +visual object tracking. While these trackers exhibit promising performance, +their deployment on resource-constrained devices remains challenging due to +inefficiencies. To improve the inference efficiency and reduce the computation +cost, prior approaches have aimed to either design lightweight trackers or +distill knowledge from larger teacher models into more compact student +trackers. However, these solutions often sacrifice accuracy for speed. Thus, we +propose a general model compression framework for efficient transformer object +tracking, named CompressTracker, to reduce the size of a pre-trained tracking +model into a lightweight tracker with minimal performance degradation. Our +approach features a novel stage division strategy that segments the transformer +layers of the teacher model into distinct stages, enabling the student model to +emulate each corresponding teacher stage more effectively. Additionally, we +also design a unique replacement training technique that involves randomly +substituting specific stages in the student model with those from the teacher +model, as opposed to training the student model in isolation. Replacement +training enhances the student model's ability to replicate the teacher model's +behavior. To further forcing student model to emulate teacher model, we +incorporate prediction guidance and stage-wise feature mimicking to provide +additional supervision during the teacher model's compression process. Our +framework CompressTracker is structurally agnostic, making it compatible with +any transformer architecture. We conduct a series of experiment to verify the +effectiveness and generalizability of CompressTracker. Our CompressTracker-4 +with 4 transformer layers, which is compressed from OSTrack, retains about 96% +performance on LaSOT (66.1% AUC) while achieves 2.17x speed up. + +
+
+
+
+
+ + ☆ Dynamic Subframe Splitting and Spatio-Temporal Motion Entangled Sparse + Attention for RGB-E Tracking + + +
+ Event-based bionic camera asynchronously captures dynamic scenes with high +temporal resolution and high dynamic range, offering potential for the +integration of events and RGB under conditions of illumination degradation and +fast motion. Existing RGB-E tracking methods model event characteristics +utilising attention mechanism of Transformer before integrating both +modalities. Nevertheless, these methods involve aggregating the event stream +into a single event frame, lacking the utilisation of the temporal information +inherent in the event stream.Moreover, the traditional attention mechanism is +well-suited for dense semantic features, while the attention mechanism for +sparse event features require revolution. In this paper, we propose a dynamic +event subframe splitting strategy to split the event stream into more +fine-grained event clusters, aiming to capture spatio-temporal features that +contain motion cues. Based on this, we design an event-based sparse attention +mechanism to enhance the interaction of event features in temporal and spatial +dimensions. The experimental results indicate that our method outperforms +existing state-of-the-art methods on the FE240 and COESOT datasets, providing +an effective processing manner for the event data. + +
+
+ comment: 15 pages, 8 figures, conference +
+
+
+
+
+ + ☆ Advancing Open-Set Domain Generalization Using Evidential Bi-Level + Hardest Domain Scheduler NeurIPS 2024 + + +
+ In Open-Set Domain Generalization (OSDG), the model is exposed to both new +variations of data appearance (domains) and open-set conditions, where both +known and novel categories are present at test time. The challenges of this +task arise from the dual need to generalize across diverse domains and +accurately quantify category novelty, which is critical for applications in +dynamic environments. Recently, meta-learning techniques have demonstrated +superior results in OSDG, effectively orchestrating the meta-train and -test +tasks by employing varied random categories and predefined domain partition +strategies. These approaches prioritize a well-designed training schedule over +traditional methods that focus primarily on data augmentation and the +enhancement of discriminative feature learning. The prevailing meta-learning +models in OSDG typically utilize a predefined sequential domain scheduler to +structure data partitions. However, a crucial aspect that remains inadequately +explored is the influence brought by strategies of domain schedulers during +training. In this paper, we observe that an adaptive domain scheduler benefits +more in OSDG compared with prefixed sequential and random domain schedulers. We +propose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve +an adaptive domain scheduler. This method strategically sequences domains by +assessing their reliabilities in utilizing a follower network, trained with +confidence scores learned in an evidential manner, regularized by max rebiasing +discrepancy, and optimized in a bi-level manner. The results show that our +method substantially improves OSDG performance and achieves more discriminative +embeddings for both the seen and unseen categories. The source code will be +available at https://github.com/KPeng9510/EBiL-HaDS. + +
+
+ comment: Accepted to NeurIPS 2024. The source code will be available at + https://github.com/KPeng9510/EBiL-HaDS +
+
+
+
+
+ + ☆ Triple Point Masking + + +
+ Existing 3D mask learning methods encounter performance bottlenecks under +limited data, and our objective is to overcome this limitation. In this paper, +we introduce a triple point masking scheme, named TPM, which serves as a +scalable framework for pre-training of masked autoencoders to achieve +multi-mask learning for 3D point clouds. Specifically, we augment the baselines +with two additional mask choices (i.e., medium mask and low mask) as our core +insight is that the recovery process of an object can manifest in diverse ways. +Previous high-masking schemes focus on capturing the global representation but +lack the fine-grained recovery capability, so that the generated pre-trained +weights tend to play a limited role in the fine-tuning process. With the +support of the proposed TPM, available methods can exhibit more flexible and +accurate completion capabilities, enabling the potential autoencoder in the +pre-training stage to consider multiple representations of a single 3D object. +In addition, an SVM-guided weight selection module is proposed to fill the +encoder parameters for downstream networks with the optimal weight during the +fine-tuning stage, maximizing linear accuracy and facilitating the acquisition +of intricate representations for new objects. Extensive experiments show that +the four baselines equipped with the proposed TPM achieve comprehensive +performance improvements on various downstream tasks. + +
+
+
+
+
+ + ☆ CAMOT: Camera Angle-aware Multi-Object Tracking + + +
+ This paper proposes CAMOT, a simple camera angle estimator for multi-object +tracking to tackle two problems: 1) occlusion and 2) inaccurate distance +estimation in the depth direction. Under the assumption that multiple objects +are located on a flat plane in each video frame, CAMOT estimates the camera +angle using object detection. In addition, it gives the depth of each object, +enabling pseudo-3D MOT. We evaluated its performance by adding it to various 2D +MOT methods on the MOT17 and MOT20 datasets and confirmed its effectiveness. +Applying CAMOT to ByteTrack, we obtained 63.8% HOTA, 80.6% MOTA, and 78.5% IDF1 +in MOT17, which are state-of-the-art results. Its computational cost is +significantly lower than the existing deep-learning-based depth estimators for +tracking. + +
+
+
+
+
+ + ☆ SimVG: A Simple Framework for Visual Grounding with Decoupled + Multi-modal Fusion NeurIPS2024 + + +
+ Visual grounding is a common vision task that involves grounding descriptive +sentences to the corresponding regions of an image. Most existing methods use +independent image-text encoding and apply complex hand-crafted modules or +encoder-decoder architectures for modal interaction and query reasoning. +However, their performance significantly drops when dealing with complex +textual expressions. This is because the former paradigm only utilizes limited +downstream data to fit the multi-modal feature fusion. Therefore, it is only +effective when the textual expressions are relatively simple. In contrast, +given the wide diversity of textual expressions and the uniqueness of +downstream training data, the existing fusion module, which extracts multimodal +content from a visual-linguistic context, has not been fully investigated. In +this paper, we present a simple yet robust transformer-based framework, SimVG, +for visual grounding. Specifically, we decouple visual-linguistic feature +fusion from downstream tasks by leveraging existing multimodal pre-trained +models and incorporating additional object tokens to facilitate deep +integration of downstream and pre-training tasks. Furthermore, we design a +dynamic weight-balance distillation method in the multi-branch synchronous +learning process to enhance the representation capability of the simpler +branch. This branch only consists of a lightweight MLP, which simplifies the +structure and improves reasoning speed. Experiments on six widely used VG +datasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the +superiority of SimVG. Finally, the proposed method not only achieves +improvements in efficiency and convergence speed but also attains new +state-of-the-art performance on these benchmarks. Codes and models will be +available at \url{https://github.com/Dmmm1997/SimVG}. + +
+
+ comment: 21pages, 11figures, NeurIPS2024 +
+
+
+
+
+ + ☆ Drone Stereo Vision for Radiata Pine Branch Detection and Distance + Measurement: Integrating SGBM and Segmentation Models + + +
+ Manual pruning of radiata pine trees presents significant safety risks due to +their substantial height and the challenging terrains in which they thrive. To +address these risks, this research proposes the development of a drone-based +pruning system equipped with specialized pruning tools and a stereo vision +camera, enabling precise detection and trimming of branches. Deep learning +algorithms, including YOLO and Mask R-CNN, are employed to ensure accurate +branch detection, while the Semi-Global Matching algorithm is integrated to +provide reliable distance estimation. The synergy between these techniques +facilitates the precise identification of branch locations and enables +efficient, targeted pruning. Experimental results demonstrate that the combined +implementation of YOLO and SGBM enables the drone to accurately detect branches +and measure their distances from the drone. This research not only improves the +safety and efficiency of pruning operations but also makes a significant +contribution to the advancement of drone technology in the automation of +agricultural and forestry practices, laying a foundational framework for +further innovations in environmental management. + +
+
+
+
+
+ + ☆ JoyType: A Robust Design for Multilingual Visual Text Creation AAAI 2025 + + +
+ Generating images with accurately represented text, especially in non-Latin +languages, poses a significant challenge for diffusion models. Existing +approaches, such as the integration of hint condition diagrams via auxiliary +networks (e.g., ControlNet), have made strides towards addressing this issue. +However, diffusion models often fall short in tasks requiring controlled text +generation, such as specifying particular fonts or producing text in small +fonts. In this paper, we introduce a novel approach for multilingual visual +text creation, named JoyType, designed to maintain the font style of text +during the image generation process. Our methodology begins with assembling a +training dataset, JoyType-1M, comprising 1 million pairs of data. Each pair +includes an image, its description, and glyph instructions corresponding to the +font style within the image. We then developed a text control network, Font +ControlNet, tasked with extracting font style information to steer the image +generation. To further enhance our model's ability to maintain font style, +notably in generating small-font text, we incorporated a multi-layer OCR-aware +loss into the diffusion process. This enhancement allows JoyType to direct text +rendering using low-level descriptors. Our evaluations, based on both visual +and accuracy metrics, demonstrate that JoyType significantly outperforms +existing state-of-the-art methods. Additionally, JoyType can function as a +plugin, facilitating the creation of varied image styles in conjunction with +other stable diffusion models on HuggingFace and CivitAI. Our project is +open-sourced on https://jdh-algo.github.io/JoyType/. + +
+
+ comment: Under Review at AAAI 2025 +
+
+
+
+
+ + ☆ EAGLE: Egocentric AGgregated Language-video Engine + + +
+ The rapid evolution of egocentric video analysis brings new insights into +understanding human activities and intentions from a first-person perspective. +Despite this progress, the fragmentation in tasks like action recognition, +procedure learning, and moment retrieval, \etc, coupled with inconsistent +annotations and isolated model development, hinders a holistic interpretation +of video content. In response, we introduce the EAGLE (Egocentric AGgregated +Language-video Engine) model and the EAGLE-400K dataset to provide a unified +framework that integrates various egocentric video understanding tasks. +EAGLE-400K, the \textit{first} large-scale instruction-tuning dataset tailored +for egocentric video, features 400K diverse samples to enhance a broad spectrum +of tasks from activity recognition to procedure knowledge learning. Moreover, +EAGLE, a strong video multimodal large language model (MLLM), is designed to +effectively capture both spatial and temporal information. In addition, we +propose a set of evaluation metrics designed to facilitate a thorough +assessment of MLLM for egocentric video understanding. Our extensive +experiments demonstrate EAGLE's superior performance over existing models, +highlighting its ability to balance task-specific understanding with holistic +video interpretation. With EAGLE, we aim to pave the way for research +opportunities and practical applications in real-world scenarios. + +
+
+ comment: Accepted by ACMMM 24 +
+
+
+
+
+ + ☆ Robotic Environmental State Recognition with Pre-Trained Vision-Language + Models and Black-Box Optimization + + +
+ In order for robots to autonomously navigate and operate in diverse +environments, it is essential for them to recognize the state of their +environment. On the other hand, the environmental state recognition has +traditionally involved distinct methods tailored to each state to be +recognized. In this study, we perform a unified environmental state recognition +for robots through the spoken language with pre-trained large-scale +vision-language models. We apply Visual Question Answering and Image-to-Text +Retrieval, which are tasks of Vision-Language Models. We show that with our +method, it is possible to recognize not only whether a room door is +open/closed, but also whether a transparent door is open/closed and whether +water is running in a sink, without training neural networks or manual +programming. In addition, the recognition accuracy can be improved by selecting +appropriate texts from the set of prepared texts based on black-box +optimization. For each state recognition, only the text set and its weighting +need to be changed, eliminating the need to prepare multiple different models +and programs, and facilitating the management of source code and computer +resource. We experimentally demonstrate the effectiveness of our method and +apply it to the recognition behavior on a mobile robot, Fetch. + +
+
+ comment: Accepted at Advanced Robotics, website - + https://haraduka.github.io/vlm-bbo/ +
+
+
+
+
+ + ☆ SCOMatch: Alleviating Overtrusting in Open-set Semi-supervised Learning ECCV 2024 + + +
+ Open-set semi-supervised learning (OSSL) leverages practical open-set +unlabeled data, comprising both in-distribution (ID) samples from seen classes +and out-of-distribution (OOD) samples from unseen classes, for semi-supervised +learning (SSL). Prior OSSL methods initially learned the decision boundary +between ID and OOD with labeled ID data, subsequently employing self-training +to refine this boundary. These methods, however, suffer from the tendency to +overtrust the labeled ID data: the scarcity of labeled data caused the +distribution bias between the labeled samples and the entire ID data, which +misleads the decision boundary to overfit. The subsequent self-training +process, based on the overfitted result, fails to rectify this problem. In this +paper, we address the overtrusting issue by treating OOD samples as an +additional class, forming a new SSL process. + Specifically, we propose SCOMatch, a novel OSSL method that 1) selects +reliable OOD samples as new labeled data with an OOD memory queue and a +corresponding update strategy and 2) integrates the new SSL process into the +original task through our Simultaneous Close-set and Open-set self-training. +SCOMatch refines the decision boundary of ID and OOD classes across the entire +dataset, thereby leading to improved results. Extensive experimental results +show that SCOMatch significantly outperforms the state-of-the-art methods on +various benchmarks. The effectiveness is further verified through ablation +studies and visualization. + +
+
+ comment: ECCV 2024 accepted +
+
+
+
+
+ + ☆ NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human + Connectomes NeurIPS 2024 + + +
+ Although modern imaging technologies allow us to study connectivity between +two distinct brain regions in-vivo, an in-depth understanding of how anatomical +structure supports brain function and how spontaneous functional fluctuations +emerge remarkable cognition is still elusive. Meanwhile, tremendous efforts +have been made in the realm of machine learning to establish the nonlinear +mapping between neuroimaging data and phenotypic traits. However, the absence +of neuroscience insight in the current approaches poses significant challenges +in understanding cognitive behavior from transient neural activities. To +address this challenge, we put the spotlight on the coupling mechanism of +structural connectivity (SC) and functional connectivity (FC) by formulating +such network neuroscience question into an expressive graph representation +learning problem for high-order topology. Specifically, we introduce the +concept of topological detour to characterize how a ubiquitous instance of FC +(direct link) is supported by neural pathways (detour) physically wired by SC, +which forms a cyclic loop interacted by brain structure and function. In the +clich\'e of machine learning, the multi-hop detour pathway underlying SC-FC +coupling allows us to devise a novel multi-head self-attention mechanism within +Transformer to capture multi-modal feature representation from paired graphs of +SC and FC. Taken together, we propose a biological-inspired deep model, coined +as NeuroPath, to find putative connectomic feature representations from the +unprecedented amount of neuroimages, which can be plugged into various +downstream applications such as task recognition and disease diagnosis. We have +evaluated NeuroPath on large-scale public datasets including HCP and UK Biobank +under supervised and zero-shot learning, where the state-of-the-art performance +by our NeuroPath indicates great potential in network neuroscience. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization, recent advances primarily focus on improving the LLM +components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE that leverages a well-designed router with a mixture of +projection experts at the connector, Uni-Med achieves efficient solution to the +tug-of-war problem and can perform six different medical tasks including +question answering, visual question answering, report generation, referring +expression comprehension, referring expression generation and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector. Extensive ablation experiments +validate the effectiveness of introducing CMoE under any configuration, with up +to an average 8% performance gains. We further provide interpretation analysis +of the tug-of-war problem from the perspective of gradient optimization and +parameter statistics. Compared to previous state-of-the-art medical MLLMs, +Uni-Med achieves competitive or superior evaluation metrics on diverse tasks. +Code, data and model will be soon available at GitHub. + +
+
+
+
+
+ + ☆ Shape-intensity knowledge distillation for robust medical image + segmentation + + +
+ Many medical image segmentation methods have achieved impressive results. +Yet, most existing methods do not take into account the shape-intensity prior +information. This may lead to implausible segmentation results, in particular +for images of unseen datasets. In this paper, we propose a novel approach to +incorporate joint shape-intensity prior information into the segmentation +network. Specifically, we first train a segmentation network (regarded as the +teacher network) on class-wise averaged training images to extract valuable +shape-intensity information, which is then transferred to a student +segmentation network with the same network architecture as the teacher via +knowledge distillation. In this way, the student network regarded as the final +segmentation model can effectively integrate the shape-intensity prior +information, yielding more accurate segmentation results. Despite its +simplicity, experiments on five medical image segmentation tasks of different +modalities demonstrate that the proposed Shape-Intensity Knowledge Distillation +(SIKD) consistently improves several baseline models (including recent MaxStyle +and SAMed) under intra-dataset evaluation, and significantly improves the +cross-dataset generalization ability. The code is available at +https://github.com/whdong-whu/SIKD. + +
+
+
+
+
+ + ☆ Learning Quantized Adaptive Conditions for Diffusion Models + + +
+ The curvature of ODE trajectories in diffusion models hinders their ability +to generate high-quality images in a few number of function evaluations (NFE). +In this paper, we propose a novel and effective approach to reduce trajectory +curvature by utilizing adaptive conditions. By employing a extremely +light-weight quantized encoder, our method incurs only an additional 1% of +training parameters, eliminates the need for extra regularization terms, yet +achieves significantly better sample quality. Our approach accelerates ODE +sampling while preserving the downstream task image editing capabilities of SDE +techniques. Extensive experiments verify that our method can generate high +quality results under extremely limited sampling costs. With only 6 NFE, we +achieve 5.14 FID on CIFAR-10, 6.91 FID on FFHQ 64x64 and 3.10 FID on AFHQv2. + +
+
+
+
+
+ + ☆ Global-Local Medical SAM Adaptor Based on Full Adaption + + +
+ Emerging of visual language models, such as the segment anything model (SAM), +have made great breakthroughs in the field of universal semantic segmentation +and significantly aid the improvements of medical image segmentation, in +particular with the help of Medical SAM adaptor (Med-SA). However, Med-SA still +can be improved, as it fine-tunes SAM in a partial adaption manner. To resolve +this problem, we present a novel global medical SAM adaptor (GMed-SA) with full +adaption, which can adapt SAM globally. We further combine GMed-SA and Med-SA +to propose a global-local medical SAM adaptor (GLMed-SA) to adapt SAM both +globally and locally. Extensive experiments have been performed on the +challenging public 2D melanoma segmentation dataset. The results show that +GLMed-SA outperforms several state-of-the-art semantic segmentation methods on +various evaluation metrics, demonstrating the superiority of our methods. + +
+
+
+
+
+ + ☆ Revisiting Deep Ensemble Uncertainty for Enhanced Medical Anomaly + Detection MICCAI2024 + + +
+ Medical anomaly detection (AD) is crucial in pathological identification and +localization. Current methods typically rely on uncertainty estimation in deep +ensembles to detect anomalies, assuming that ensemble learners should agree on +normal samples while exhibiting disagreement on unseen anomalies in the output +space. However, these methods may suffer from inadequate disagreement on +anomalies or diminished agreement on normal samples. To tackle these issues, we +propose D2UE, a Diversified Dual-space Uncertainty Estimation framework for +medical anomaly detection. To effectively balance agreement and disagreement +for anomaly detection, we propose Redundancy-Aware Repulsion (RAR), which uses +a similarity kernel that remains invariant to both isotropic scaling and +orthogonal transformations, explicitly promoting diversity in learners' feature +space. Moreover, to accentuate anomalous regions, we develop Dual-Space +Uncertainty (DSU), which utilizes the ensemble's uncertainty in input and +output spaces. In input space, we first calculate gradients of reconstruction +error with respect to input images. The gradients are then integrated with +reconstruction outputs to estimate uncertainty for inputs, enabling effective +anomaly discrimination even when output space disagreement is minimal. We +conduct a comprehensive evaluation of five medical benchmarks with different +backbones. Experimental results demonstrate the superiority of our method to +state-of-the-art methods and the effectiveness of each component in our +framework. Our code is available at https://github.com/Rubiscol/D2UE. + +
+
+ comment: Early accepted by MICCAI2024 +
+
+
+
+
+ + ♻ ☆ Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with + Enhanced Generalization and Personalization Abilities WACV 2025 + + +
+ Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant +potential for modeling 3D head avatars, providing greater flexibility than +mesh-based methods and more efficient rendering compared to NeRF-based +approaches. Despite these advancements, the creation of controllable 3DGS-based +head avatars remains time-intensive, often requiring tens of minutes to hours. +To expedite this process, we here introduce the ``Gaussian D\'ej\`a-vu" +framework, which first obtains a generalized model of the head avatar and then +personalizes the result. The generalized model is trained on large 2D +(synthetic and real) image datasets. This model provides a well-initialized 3D +Gaussian head that is further refined using a monocular video to achieve the +personalized head avatar. For personalizing, we propose learnable +expression-aware rectification blendmaps to correct the initial 3D Gaussians, +ensuring rapid convergence without the reliance on neural networks. Experiments +demonstrate that the proposed method meets its objectives. It outperforms +state-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as +well as reduces training time consumption to at least a quarter of the existing +methods, producing the avatar in minutes. + +
+
+ comment: 11 pages, Accepted by WACV 2025 in Round 1 +
+
+
+
+
+ + ♻ ☆ Chat-Scene: Bridging 3D Scene and Large Language Models with Object + Identifiers + + +
+ Recent advancements in 3D Large Language Models (LLMs) have demonstrated +promising capabilities for 3D scene understanding. However, previous methods +exhibit deficiencies in general referencing and grounding capabilities for +intricate scene comprehension. In this paper, we introduce the use of object +identifiers and object-centric representations to interact with scenes at the +object level. Specifically, we decompose the input 3D scene into a set of +object proposals, each assigned a unique identifier token, which enables +efficient object referencing and grounding during user-assistant interactions. +Given the scarcity of scene-language data, we model the scene embeddings as a +sequence of explicit object-level embeddings, derived from semantic-rich 2D or +3D representations. By employing object identifiers, we transform diverse 3D +scene-language tasks into a unified question-answering format, facilitating +joint training without the need for additional task-specific heads. With +minimal fine-tuning on all downstream tasks, our model significantly +outperforms existing methods on benchmarks including ScanRefer, Multi3DRefer, +Scan2Cap, ScanQA, and SQA3D. + +
+
+
+
+
+ + ♻ ☆ Exploring Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Although traditional cameras are commonly applied, their reliability decreases +in scenarios under high dynamic range or heavy motion blur, where event cameras +offer a robust solution. Predominant event-based methods accumulate events into +frames, ignoring the asynchronous and high temporal resolution that is crucial +for distinguishing distinct actions. To address this issue and to unlock the 3D +potential of event information, we introduce two 3D event representations: the +Rasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The +RasEPC aggregates events within concise temporal slices at identical positions, +preserving their 3D attributes along with statistical information, thereby +significantly reducing memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD +dataset, and our EV-3DPW dataset, with further qualitative validation via a +derived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + +
+
+ comment: Accepted to Computer Vision and Image Understanding (CVPU). Extended + version of arXiv:2206.04511. The code and dataset are available at + https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Synthesizing Environment-Specific People in Photographs ECCV 2024 + + +
+ We present ESP, a novel method for context-aware full-body generation, that +enables photo-realistic synthesis and inpainting of people wearing clothing +that is semantically appropriate for the scene depicted in an input photograph. +ESP is conditioned on a 2D pose and contextual cues that are extracted from the +photograph of the scene and integrated into the generation process, where the +clothing is modeled explicitly with human parsing masks (HPM). Generated HPMs +are used as tight guiding masks for inpainting, such that no changes are made +to the original background. Our models are trained on a dataset containing a +set of in-the-wild photographs of people covering a wide range of different +environments. The method is analyzed quantitatively and qualitatively, and we +show that ESP outperforms the state-of-the-art on the task of contextual +full-body generation. + +
+
+ comment: Accepted at ECCV 2024, Project: https://esp.is.tue.mpg.de +
+
+
+
+
+ + ♻ ☆ Valeo4Cast: A Modular Approach to End-to-End Forecasting ECCV + + +
+ Motion forecasting is crucial in autonomous driving systems to anticipate the +future trajectories of surrounding agents such as pedestrians, vehicles, and +traffic signals. In end-to-end forecasting, the model must jointly detect and +track from sensor data (cameras or LiDARs) the past trajectories of the +different elements of the scene and predict their future locations. We depart +from the current trend of tackling this task via end-to-end training from +perception to forecasting, and instead use a modular approach. We individually +build and train detection, tracking and forecasting modules. We then only use +consecutive finetuning steps to integrate the modules better and alleviate +compounding errors. We conduct an in-depth study on the finetuning strategies +and it reveals that our simple yet effective approach significantly improves +performance on the end-to-end forecasting benchmark. Consequently, our solution +ranks first in the Argoverse 2 End-to-end Forecasting Challenge, with 63.82 +mAPf. We surpass forecasting results by +17.1 points over last year's winner +and by +13.3 points over this year's runner-up. This remarkable performance in +forecasting can be explained by our modular paradigm, which integrates +finetuning strategies and significantly outperforms the end-to-end-trained +counterparts. The code, model weights and results are made available +https://github.com/valeoai/valeo4cast. + +
+
+ comment: Winning solution of the Argoverse 2 "Unified Detection, Tracking, and + Forecasting" challenge; work accepted at Road++ ECCVW 2024 +
+
+
+
+
+ + ♻ ☆ Disentangled Clothed Avatar Generation from Text Descriptions + + +
+ In this paper, we introduce a novel text-to-avatar generation method that +separately generates the human body and the clothes and allows high-quality +animation on the generated avatar. While recent advancements in text-to-avatar +generation have yielded diverse human avatars from text prompts, these methods +typically combine all elements-clothes, hair, and body-into a single 3D +representation. Such an entangled approach poses challenges for downstream +tasks like editing or animation. To overcome these limitations, we propose a +novel disentangled 3D avatar representation named Sequentially Offset-SMPL +(SO-SMPL), building upon the SMPL model. SO-SMPL represents the human body and +clothes with two separate meshes but associates them with offsets to ensure the +physical alignment between the body and the clothes. Then, we design a Score +Distillation Sampling (SDS)-based distillation framework to generate the +proposed SO-SMPL representation from text prompts. Our approach not only +achieves higher texture and geometry quality and better semantic alignment with +text prompts, but also significantly improves the visual quality of character +animation, virtual try-on, and avatar editing. Project page: +https://shanemankiw.github.io/SO-SMPL/. + +
+
+ comment: Project page: https://shanemankiw.github.io/SO-SMPL/ +
+
+
+
+
+ + ♻ ☆ Jumping through Local Minima: Quantization in the Loss Landscape of + Vision Transformers + + +
+ Quantization scale and bit-width are the most important parameters when +considering how to quantize a neural network. Prior work focuses on optimizing +quantization scales in a global manner through gradient methods (gradient +descent \& Hessian analysis). Yet, when applying perturbations to quantization +scales, we observe a very jagged, highly non-smooth test loss landscape. In +fact, small perturbations in quantization scale can greatly affect accuracy, +yielding a $0.5-0.8\%$ accuracy boost in 4-bit quantized vision transformers +(ViTs). In this regime, gradient methods break down, since they cannot reliably +reach local minima. In our work, dubbed Evol-Q, we use evolutionary search to +effectively traverse the non-smooth landscape. Additionally, we propose using +an infoNCE loss, which not only helps combat overfitting on the small +calibration dataset ($1,000$ images) but also makes traversing such a highly +non-smooth surface easier. Evol-Q improves the top-1 accuracy of a fully +quantized ViT-Base by $10.30\%$, $0.78\%$, and $0.15\%$ for $3$-bit, $4$-bit, +and $8$-bit weight quantization levels. Extensive experiments on a variety of +CNN and ViT architectures further demonstrate its robustness in extreme +quantization scenarios. Our code is available at +https://github.com/enyac-group/evol-q + +
+
+ comment: arXiv admin note: text overlap with arXiv:2211.09643 +
+
+
+
+
+ + ♻ ☆ LingoQA: Visual Question Answering for Autonomous Driving ECCV 2024 + + +
+ We introduce LingoQA, a novel dataset and benchmark for visual question +answering in autonomous driving. The dataset contains 28K unique short video +scenarios, and 419K annotations. Evaluating state-of-the-art vision-language +models on our benchmark shows that their performance is below human +capabilities, with GPT-4V responding truthfully to 59.6% of the questions +compared to 96.6% for humans. For evaluation, we propose a truthfulness +classifier, called Lingo-Judge, that achieves a 0.95 Spearman correlation +coefficient to human evaluations, surpassing existing techniques like METEOR, +BLEU, CIDEr, and GPT-4. We establish a baseline vision-language model and run +extensive ablation studies to understand its performance. We release our +dataset and benchmark as an evaluation platform for vision-language models in +autonomous driving. + +
+
+ comment: Accepted to ECCV 2024. Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ♻ ☆ Manydepth2: Motion-Aware Self-Supervised Monocular Depth Estimation in + Dynamic Scenes + + +
+ Despite advancements in self-supervised monocular depth estimation, +challenges persist in dynamic scenarios due to the dependence on assumptions +about a static world. In this paper, we present Manydepth2, a Motion-Guided +Cost Volume Depth Net, to achieve precise depth estimation for both dynamic +objects and static backgrounds, all while maintaining computational efficiency. +To tackle the challenges posed by dynamic content, we incorporate optical flow +and coarse monocular depth to create a novel static reference frame. This frame +is then utilized to build a motion-guided cost volume in collaboration with the +target frame. Additionally, to enhance the accuracy and resilience of the +network structure, we introduce an attention-based depth net architecture to +effectively integrate information from feature maps with varying resolutions. +Compared to methods with similar computational costs, Manydepth2 achieves a +significant reduction of approximately five percent in root-mean-square error +for self-supervised monocular depth estimation on the KITTI-2015 dataset. The +code could be found: https://github.com/kaichen-z/Manydepth2 + +
+
+ comment: Monocular Depth Estimation, Self-Supervised, Optical Flow +
+
+
+
+
+ + ♻ ☆ CollaMamba: Efficient Collaborative Perception with Cross-Agent + Spatial-Temporal State Space Model AAAI 2025 + + +
+ By sharing complementary perceptual information, multi-agent collaborative +perception fosters a deeper understanding of the environment. Recent studies on +collaborative perception mostly utilize CNNs or Transformers to learn feature +representation and fusion in the spatial dimension, which struggle to handle +long-range spatial-temporal features under limited computing and communication +resources. Holistically modeling the dependencies over extensive spatial areas +and extended temporal frames is crucial to enhancing feature quality. To this +end, we propose a resource efficient cross-agent spatial-temporal collaborative +state space model (SSM), named CollaMamba. Initially, we construct a +foundational backbone network based on spatial SSM. This backbone adeptly +captures positional causal dependencies from both single-agent and cross-agent +views, yielding compact and comprehensive intermediate features while +maintaining linear complexity. Furthermore, we devise a history-aware feature +boosting module based on temporal SSM, extracting contextual cues from extended +historical frames to refine vague features while preserving low overhead. +Extensive experiments across several datasets demonstrate that CollaMamba +outperforms state-of-the-art methods, achieving higher model accuracy while +reducing computational and communication overhead by up to 71.9% and 1/64, +respectively. This work pioneers the exploration of the Mamba's potential in +collaborative perception. The source code will be made available. + +
+
+ comment: Submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Computational Trichromacy Reconstruction: Empowering the Color-Vision + Deficient to Recognize Colors Using Augmented Reality + + +
+ We propose an assistive technology that helps individuals with Color Vision +Deficiencies (CVD) to recognize/name colors. A dichromat's color perception is +a reduced two-dimensional (2D) subset of a normal trichromat's three +dimensional color (3D) perception, leading to confusion when visual stimuli +that appear identical to the dichromat are referred to by different color +names. Using our proposed system, CVD individuals can interactively induce +distinct perceptual changes to originally confusing colors via a computational +color space transformation. By combining their original 2D precepts for colors +with the discriminative changes, a three dimensional color space is +reconstructed, where the dichromat can learn to resolve color name confusions +and accurately recognize colors. Our system is implemented as an Augmented +Reality (AR) interface on smartphones, where users interactively control the +rotation through swipe gestures and observe the induced color shifts in the +camera view or in a displayed image. Through psychophysical experiments and a +longitudinal user study, we demonstrate that such rotational color shifts have +discriminative power (initially confusing colors become distinct under +rotation) and exhibit structured perceptual shifts dichromats can learn with +modest training. The AR App is also evaluated in two real-world scenarios +(building with lego blocks and interpreting artistic works); users all report +positive experience in using the App to recognize object colors that they +otherwise could not. + +
+
+
+
+
+ + ♻ ☆ EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS + + +
+ Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view +scene synthesis. It addresses the challenges of lengthy training times and slow +rendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid, +differentiable rasterization of 3D Gaussians, 3D-GS achieves real-time +rendering and accelerated training. They, however, demand substantial memory +resources for both training and storage, as they require millions of Gaussians +in their point cloud representation for each scene. We present a technique +utilizing quantized embeddings to significantly reduce per-point memory storage +requirements and a coarse-to-fine training strategy for a faster and more +stable optimization of the Gaussian point clouds. Our approach develops a +pruning stage which results in scene representations with fewer Gaussians, +leading to faster training times and rendering speeds for real-time rendering +of high resolution scenes. We reduce storage memory by more than an order of +magnitude all while preserving the reconstruction quality. We validate the +effectiveness of our approach on a variety of datasets and scenes preserving +the visual quality while consuming 10-20x lesser memory and faster +training/inference speed. Project page and code is available +https://efficientgaussian.github.io + +
+
+ comment: Website: https://efficientgaussian.github.io Code: + https://github.com/Sharath-girish/efficientgaussian +
+
+
+
+
+ + ♻ ☆ Low-Rank Interconnected Adaptation across Layers + + +
+ Low-rank adaptation (LoRA) is a powerful parameter-efficient fine-tuning +method that utilizes low-rank projectors $A$ and $B$ to learn weight updates +$\Delta W$ for adaptation targets $W$. Previous research has shown that LoRA is +essentially a gradient compressor, performing random projections on the +gradient using a fixed projection matrix $A_0$. However, this setup restricts +the overall weight update to be low-rank, which limits the adaptation +performance. In this paper, we propose low-rank interconnected adaptation +across layers (Lily). Specifically, we employ a hierarchical framework where +low-dimensional projectors (LPs) retained for downward projection at a +particular level, while globally-shared high-dimensional projector (HP) experts +perform upward projection across all levels of layers. Lily uniquely connects +each LP to all HP experts, therefore the gradient projections are no longer +dominated by fixed projection matrices, but rather by selective combinations of +all the projectors, thereby breaking the low-rank constraint of LoRA. +Furthermore, Lily's cross-layer connections facilitate the capture of intricate +information and dependencies across different layers, thereby enhancing the +model's representational capabilities. Experiments across various modalities, +architectures, and model sizes underscore Lily's great performance and +efficiency. Code is available on github https://github.com/yibozhong/lily. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera + Fusion for Colorizing Point Clouds ICRA + + +
+ A Colored point cloud, as a simple and efficient 3D representation, has many +advantages in various fields, including robotic navigation and scene +reconstruction. This representation is now commonly used in 3D reconstruction +tasks relying on cameras and LiDARs. However, fusing data from these two types +of sensors is poorly performed in many existing frameworks, leading to +unsatisfactory mapping results, mainly due to inaccurate camera poses. This +paper presents OmniColor, a novel and efficient algorithm to colorize point +clouds using an independent 360-degree camera. Given a LiDAR-based point cloud +and a sequence of panorama images with initial coarse camera poses, our +objective is to jointly optimize the poses of all frames for mapping images +onto geometric reconstructions. Our pipeline works in an off-the-shelf manner +that does not require any feature extraction or matching process. Instead, we +find optimal poses by directly maximizing the photometric consistency of LiDAR +maps. In experiments, we show that our method can overcome the severe visual +distortion of omnidirectional images and greatly benefit from the wide field of +view (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy +and stability. The code will be released at +https://github.com/liubonan123/OmniColor/. + +
+
+ comment: 2024 IEEE International Conference on Robotics and Automation (ICRA) +
+
+
+
+
+ + ♻ ☆ SF-MMCN: Low-Power Sever Flow Multi-Mode Diffusion Model Accelerator + + +
+ Generative Artificial Intelligence (AI) has become incredibly popular in +recent years, and the significance of traditional accelerators in dealing with +large-scale parameters is urgent. With the diffusion model's parallel +structure, the hardware design challenge has skyrocketed because of the +multiple layers operating simultaneously. Convolution Neural Network (CNN) +accelerators have been designed and developed rapidly, especially for +high-speed inference. Often, CNN models with parallel structures are deployed. +In these CNN accelerators, many Processing Elements (PE) are required to +perform parallel computations, mainly the multiply and accumulation (MAC) +operation, resulting in high power consumption and a large silicon area. In +this work, a Server Flow Multi-Mode CNN Unit (SF-MMCN) is proposed to reduce +the number of PE while improving the operation efficiency of the CNN +accelerator. The pipelining technique is introduced into Server Flow to process +parallel computations. The proposed SF-MMCN is implemented with TSMC 90-nm CMOS +technology. It is evaluated with VGG-16, ResNet-18, and U-net. The evaluation +results show that the proposed SF-MMCN can reduce the power consumption by 92%, +and the silicon area by 70%, while improving the efficiency of operation by +nearly 81 times. A new FoM, area efficiency (GOPs/mm^2) is also introduced to +evaluate the performance of the accelerator in terms of the ratio throughput +(GOPs) and silicon area (mm^2). In this FoM, SF-MMCN improves area efficiency +by 18 times (18.42). + +
+
+ comment: 16 pages, 16 figures; extend the CNN to process Diffusion Model + (possible this is the first reported hardware Diffusion Model implementation) +
+
+
+
+
+ + ♻ ☆ 2D and 3D Deep Learning Models for MRI-based Parkinson's Disease + Classification: A Comparative Analysis of Convolutional Kolmogorov-Arnold + Networks, Convolutional Neural Networks, and Graph Convolutional Networks + + +
+ Parkinson's Disease (PD) diagnosis remains challenging. This study applies +Convolutional Kolmogorov-Arnold Networks (ConvKANs), integrating learnable +spline-based activation functions into convolutional layers, for PD +classification using structural MRI. The first 3D implementation of ConvKANs +for medical imaging is presented, comparing their performance to Convolutional +Neural Networks (CNNs) and Graph Convolutional Networks (GCNs) across three +open-source datasets. Isolated analyses assessed performance within individual +datasets, using cross-validation techniques. Holdout analyses evaluated +cross-dataset generalizability by training models on two datasets and testing +on the third, mirroring real-world clinical scenarios. In isolated analyses, 2D +ConvKANs achieved the highest AUC of 0.99 (95% CI: 0.98-0.99) on the PPMI +dataset, outperforming 2D CNNs (AUC: 0.97, p = 0.0092). 3D models showed +promise, with 3D CNN and 3D ConvKAN reaching an AUC of 0.85 on PPMI. In holdout +analyses, 3D ConvKAN demonstrated superior generalization, achieving an AUC of +0.85 on early-stage PD data. GCNs underperformed in 2D but improved in 3D +implementations. These findings highlight ConvKANs' potential for PD detection, +emphasize the importance of 3D analysis in capturing subtle brain changes, and +underscore cross-dataset generalization challenges. This study advances +AI-assisted PD diagnosis using structural MRI and emphasizes the need for +larger-scale validation. + +
+
+ comment: 7 figures +
+
+
+
+
+ + ♻ ☆ Diffusion-based Generative Image Outpainting for Recovery of + FOV-Truncated CT Images + + +
+ Field-of-view (FOV) recovery of truncated chest CT scans is crucial for +accurate body composition analysis, which involves quantifying skeletal muscle +and subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables +disease prognostication. Here, we present a method for recovering truncated CT +slices using generative image outpainting. We train a diffusion model and apply +it to truncated CT slices generated by simulating a small FOV. Our model +reliably recovers the truncated anatomy and outperforms the previous +state-of-the-art despite being trained on 87% less data. + +
+
+ comment: Shared last authorship: Florian J. Fintelmann and Philip M\"uller +
+
+
+
+
+ + ♻ ☆ Enhanced Unsupervised Image-to-Image Translation Using Contrastive + Learning and Histogram of Oriented Gradients + + +
+ Image-to-Image Translation is a vital area of computer vision that focuses on +transforming images from one visual domain to another while preserving their +core content and structure. However, this field faces two major challenges: +first, the data from the two domains are often unpaired, making it difficult to +train generative adversarial networks effectively; second, existing methods +tend to produce artifacts or hallucinations during image generation, leading to +a decline in image quality. To address these issues, this paper proposes an +enhanced unsupervised image-to-image translation method based on the +Contrastive Unpaired Translation (CUT) model, incorporating Histogram of +Oriented Gradients (HOG) features. This novel approach ensures the preservation +of the semantic structure of images, even without semantic labels, by +minimizing the loss between the HOG features of input and generated images. The +method was tested on translating synthetic game environments from GTA5 dataset +to realistic urban scenes in cityscapes dataset, demonstrating significant +improvements in reducing hallucinations and enhancing image quality. + +
+
+ comment: Critical Errors in Data or Analysis +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous + Driving ECCV 2024 + + +
+ The scale-up of autonomous vehicles depends heavily on their ability to deal +with anomalies, such as rare objects on the road. In order to handle such +situations, it is necessary to detect anomalies in the first place. Anomaly +detection for autonomous driving has made great progress in the past years but +suffers from poorly designed benchmarks with a strong focus on camera data. In +this work, we propose AnoVox, the largest benchmark for ANOmaly detection in +autonomous driving to date. AnoVox incorporates large-scale multimodal sensor +data and spatial VOXel ground truth, allowing for the comparison of methods +independent of their used sensor. We propose a formal definition of normality +and provide a compliant training dataset. AnoVox is the first benchmark to +contain both content and temporal anomalies. + +
+
+ comment: Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\"o{\ss}ler + contributed equally. Accepted for publication at ECCV 2024 W-CODA workshop +
+
+
+
+
+ + ♻ ☆ Interpretable Vision-Language Survival Analysis with Ordinal Inductive + Bias for Computational Pathology + + +
+ Histopathology Whole-Slide Images (WSIs) provide an important tool to assess +cancer prognosis in computational pathology (CPATH). While existing survival +analysis (SA) approaches have made exciting progress, they are generally +limited to adopting highly-expressive architectures and only coarse-grained +patient-level labels to learn prognostic visual representations from gigapixel +WSIs. Such learning paradigm suffers from important performance bottlenecks, +when facing present scarce training data and standard multi-instance learning +(MIL) framework in CPATH. To overcome it, this paper, for the first time, +proposes a new Vision-Language-based SA (VLSA) paradigm. Concretely, (1) VLSA +is driven by pathology VL foundation models. It no longer relies on +high-capability networks and shows the advantage of data efficiency. (2) In +vision-end, VLSA encodes prognostic language prior and then employs it as +auxiliary signals to guide the aggregating of prognostic visual features at +instance level, thereby compensating for the weak supervision in MIL. Moreover, +given the characteristics of SA, we propose i) ordinal survival prompt learning +to transform continuous survival labels into textual prompts; and ii) ordinal +incidence function as prediction target to make SA compatible with VL-based +prediction. Notably, VLSA's predictions can be interpreted intuitively by our +Shapley values-based method. The extensive experiments on five datasets confirm +the effectiveness of our scheme. Our VLSA could pave a new way for SA in CPATH +by offering weakly-supervised MIL an effective means to learn valuable +prognostic clues from gigapixel WSIs. Our source code is available at +https://github.com/liupei101/VLSA. + +
+
+ comment: 24 pages, 11 tables, 6 figures +
+
+
+
+
+ + ♻ ☆ Fast Sampling Through The Reuse Of Attention Maps In Diffusion Models + + +
+ Text-to-image diffusion models have demonstrated unprecedented capabilities +for flexible and realistic image synthesis. Nevertheless, these models rely on +a time-consuming sampling procedure, which has motivated attempts to reduce +their latency. When improving efficiency, researchers often use the original +diffusion model to train an additional network designed specifically for fast +image generation. In contrast, our approach seeks to reduce latency directly, +without any retraining, fine-tuning, or knowledge distillation. In particular, +we find the repeated calculation of attention maps to be costly yet redundant, +and instead suggest reusing them during sampling. Our specific reuse strategies +are based on ODE theory, which implies that the later a map is reused, the +smaller the distortion in the final image. We empirically compare these reuse +strategies with few-step sampling procedures of comparable latency, finding +that reuse generates images that are closer to those produced by the original +high-latency diffusion model. + +
+
+
+
+
+ + ♻ ☆ ICON: Improving Inter-Report Consistency in Radiology Report Generation + via Lesion-aware Mixup Augmentation + + +
+ Previous research on radiology report generation has made significant +progress in terms of increasing the clinical accuracy of generated reports. In +this paper, we emphasize another crucial quality that it should possess, i.e., +inter-report consistency, which refers to the capability of generating +consistent reports for semantically equivalent radiographs. This quality is +even of greater significance than the overall report accuracy in terms of +ensuring the system's credibility, as a system prone to providing conflicting +results would severely erode users' trust. Regrettably, existing approaches +struggle to maintain inter-report consistency, exhibiting biases towards common +patterns and susceptibility to lesion variants. To address this issue, we +propose ICON, which improves the inter-report consistency of radiology report +generation. Aiming to enhance the system's ability to capture similarities in +semantically equivalent lesions, our approach first involves extracting lesions +from input images and examining their characteristics. Then, we introduce a +lesion-aware mixup technique to ensure that the representations of the +semantically equivalent lesions align with the same attributes, achieved +through a linear combination during the training phase. Extensive experiments +on three publicly available chest X-ray datasets verify the effectiveness of +our approach, both in terms of improving the consistency and accuracy of the +generated reports. + +
+
+
+
+
+ + ♻ ☆ Direct Learning of Mesh and Appearance via 3D Gaussian Splatting + + +
+ Accurately reconstructing a 3D scene including explicit geometry information +is both attractive and challenging. Geometry reconstruction can benefit from +incorporating differentiable appearance models, such as Neural Radiance Fields +and 3D Gaussian Splatting (3DGS). However, existing methods encounter +efficiency issues due to indirect geometry learning and the paradigm of +separately modeling geometry and surface appearance. In this work, we propose a +learnable scene model that incorporates 3DGS with an explicit geometry +representation, namely a mesh. Our model learns the mesh and appearance in an +end-to-end manner, where we bind 3D Gaussians to the mesh faces and perform +differentiable rendering of 3DGS to obtain photometric supervision. The model +creates an effective information pathway to supervise the learning of both 3DGS +and mesh. Experimental results demonstrate that the learned scene model not +only achieves state-of-the-art efficiency and rendering quality but also +supports manipulation using the explicit mesh. In addition, our model has a +unique advantage in adapting to scene updates, thanks to the end-to-end +learning of both mesh and appearance. + +
+
+
+
+
+ + ♻ ☆ Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space + + +
+ Watermarking is a tool for actively identifying and attributing the images +generated by latent diffusion models. Existing methods face the dilemma of +image quality and watermark robustness. Watermarks with superior image quality +usually have inferior robustness against attacks such as blurring and JPEG +compression, while watermarks with superior robustness usually significantly +damage image quality. This dilemma stems from the traditional paradigm where +watermarks are injected and detected in pixel space, relying on pixel +perturbation for watermark detection and resilience against attacks. In this +paper, we highlight that an effective solution to the problem is to both inject +and detect watermarks in the latent diffusion space, and propose Latent +Watermark with a progressive training strategy. It weakens the direct +connection between quality and robustness and thus alleviates their +contradiction. We conduct evaluations on two datasets and against 10 watermark +attacks. Six metrics measure the image quality and watermark robustness. +Results show that compared to the recently proposed methods such as +StableSignature, StegaStamp, RoSteALS, LaWa, TreeRing, and DiffuseTrace, LW not +only surpasses them in terms of robustness but also offers superior image +quality. Our code will be available at +https://github.com/RichardSunnyMeng/LatentWatermark. + +
+
+
+
+
+ + ♻ ☆ Deep Self-Cleansing for Medical Image Segmentation with Noisy Labels + + +
+ Medical image segmentation is crucial in the field of medical imaging, aiding +in disease diagnosis and surgical planning. Most established segmentation +methods rely on supervised deep learning, in which clean and precise labels are +essential for supervision and significantly impact the performance of models. +However, manually delineated labels often contain noise, such as missing labels +and inaccurate boundary delineation, which can hinder networks from correctly +modeling target characteristics. In this paper, we propose a deep +self-cleansing segmentation framework that can preserve clean labels while +cleansing noisy ones in the training phase. To achieve this, we devise a +gaussian mixture model-based label filtering module that distinguishes noisy +labels from clean labels. Additionally, we develop a label cleansing module to +generate pseudo low-noise labels for identified noisy samples. The preserved +clean labels and pseudo-labels are then used jointly to supervise the network. +Validated on a clinical liver tumor dataset and a public cardiac diagnosis +dataset, our method can effectively suppress the interference from noisy labels +and achieve prominent segmentation performance. + +
+
+ comment: 31 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ MMCode: Benchmarking Multimodal Large Language Models for Code + Generation with Visually Rich Programming Problems EMNLP 2024 + + +
+ Programming often involves converting detailed and complex specifications +into code, a process during which developers typically utilize visual aids to +more effectively convey concepts. While recent developments in Large Multimodal +Models have demonstrated remarkable abilities in visual reasoning and +mathematical tasks, there is little work on investigating whether these models +can effectively interpret visual elements for code generation. To this end, we +present MMCode, the first multi-modal coding dataset for evaluating algorithmic +problem-solving skills in visually rich contexts. MMCode contains 3,548 +questions and 6,620 images collected from real-world programming challenges +harvested from 10 code competition websites, presenting significant challenges +due to the extreme demand for reasoning abilities. Our experiment results show +that current state-of-the-art models struggle to solve these problems. The +results highlight the lack of powerful vision-code models, and we hope MMCode +can serve as an inspiration for future works in this domain. The data and code +are publicly available at https://github.com/likaixin2000/MMCode. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ♻ ☆ Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal + Transport + + +
+ Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images +sharing the same category across diverse domains without relying on labeled +data. Prior approaches have typically decomposed the UCIR problem into two +distinct tasks: intra-domain representation learning and cross-domain feature +alignment. However, these segregated strategies overlook the potential +synergies between these tasks. This paper introduces ProtoOT, a novel Optimal +Transport formulation explicitly tailored for UCIR, which integrates +intra-domain feature representation learning and cross-domain alignment into a +unified framework. ProtoOT leverages the strengths of the K-means clustering +method to effectively manage distribution imbalances inherent in UCIR. By +utilizing K-means for generating initial prototypes and approximating class +marginal distributions, we modify the constraints in Optimal Transport +accordingly, significantly enhancing its performance in UCIR scenarios. +Furthermore, we incorporate contrastive learning into the ProtoOT framework to +further improve representation learning. This encourages local semantic +consistency among features with similar semantics, while also explicitly +enforcing separation between features and unmatched prototypes, thereby +enhancing global discriminativeness. ProtoOT surpasses existing +state-of-the-art methods by a notable margin across benchmark datasets. +Notably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 18.17%, +and on Office-Home, it demonstrates a P@15 improvement of 3.83%. + +
+
+
+
+
+ + ♻ ☆ Fixed-length Dense Descriptor for Efficient Fingerprint Matching + + +
+ In fingerprint matching, fixed-length descriptors generally offer greater +efficiency compared to minutiae set, but the recognition accuracy is not as +good as that of the latter. Although much progress has been made in deep +learning based fixed-length descriptors recently, they often fall short when +dealing with incomplete or partial fingerprints, diverse fingerprint poses, and +significant background noise. In this paper, we propose a three-dimensional +representation called Fixed-length Dense Descriptor (FDD) for efficient +fingerprint matching. FDD features great spatial properties, enabling it to +capture the spatial relationships of the original fingerprints, thereby +enhancing interpretability and robustness. Our experiments on various +fingerprint datasets reveal that FDD outperforms other fixed-length +descriptors, especially in matching fingerprints of different areas, +cross-modal fingerprint matching, and fingerprint matching with background +noise. + +
+
+ comment: Accepted by WIFS 2024 +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems play a crucial role in achieving Sustainable +Development Goals by collecting and analyzing vital global data through +satellite networks. These systems are essential for tasks like mapping, +disaster monitoring, and resource management, but they face challenges in +processing and transmitting large volumes of EO data, especially in specialized +fields such as agriculture and real-time disaster response. Domain-adapted +Large Language Models (LLMs) provide a promising solution by facilitating data +fusion between extensive EO data and semantic EO data. By improving integration +and interpretation of diverse datasets, LLMs address the challenges of +processing specialized information in agriculture and disaster response +applications. This fusion enhances the accuracy and relevance of transmitted +data. This paper presents a framework for semantic communication in EO +satellite networks, aimed at improving data transmission efficiency and overall +system performance through cognitive processing techniques. The proposed system +employs Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic +Data Augmentation (SA) to focus on relevant information while minimizing +communication overhead. By integrating cognitive semantic processing and +inter-satellite links, the framework enhances the analysis and transmission of +multispectral satellite imagery, improving object detection, pattern +recognition, and real-time decision-making. The introduction of Cognitive +Semantic Augmentation (CSA) allows satellites to process and transmit semantic +information, boosting adaptability to changing environments and application +needs. This end-to-end architecture is tailored for next-generation satellite +networks, such as those supporting 6G, and demonstrates significant +improvements in efficiency and accuracy. + +
+
+ comment: 18 pages, 10 figures, magazine +
+
+
+
+
+ + ♻ ☆ EAGLE: Towards Efficient Arbitrary Referring Visual Prompts + Comprehension for Multimodal Large Language Models + + +
+ Recently, Multimodal Large Language Models (MLLMs) have sparked great +research interests owing to their exceptional content-reasoning and +instruction-following capabilities. To effectively instruct an MLLM, in +addition to conventional language expressions, the practice of referring to +objects by painting with brushes on images has emerged as a prevalent tool +(referred to as "referring visual prompts") due to its efficacy in aligning the +user's intention with specific image regions. To accommodate the most common +referring visual prompts, namely points, boxes, and masks, existing approaches +initially utilize specialized feature encoding modules to capture the semantics +of the highlighted areas indicated by these prompts. Subsequently, these +encoded region features are adapted to MLLMs through fine-tuning on a +meticulously curated multimodal instruction dataset. However, such designs +suffer from redundancy in architecture. Moreover, they face challenges in +effectively generalizing when encountering a diverse range of arbitrary +referring visual prompts in real-life scenarios. To address the above issues, +we propose EAGLE, a novel MLLM that empowers comprehension of arbitrary +referring visual prompts with less training efforts than existing approaches. +Specifically, our EAGLE maintains the innate format of the referring visual +prompts as colored patches rendered on the given image for conducting the +instruction tuning. Our approach embeds referring visual prompts as spatial +concepts conveying specific spatial areas comprehensible to the MLLM, with the +semantic comprehension of these regions originating from the MLLM itself. +Besides, we also propose a Geometry-Agnostic Learning paradigm (GAL) to further +disentangle the MLLM's region-level comprehension with the specific formats of +referring visual prompts. Extensive experiments are conducted to prove the +effectiveness of our proposed method. + +
+
+
+
+
+ + ♻ ☆ GenWarp: Single Image to Novel Views with Semantic-Preserving Generative + Warping NeurIPS 2024 + + +
+ Generating novel views from a single image remains a challenging task due to +the complexity of 3D scenes and the limited diversity in the existing +multi-view datasets to train a model on. Recent research combining large-scale +text-to-image (T2I) models with monocular depth estimation (MDE) has shown +promise in handling in-the-wild images. In these methods, an input view is +geometrically warped to novel views with estimated depth maps, then the warped +image is inpainted by T2I models. However, they struggle with noisy depth maps +and loss of semantic details when warping an input view to novel viewpoints. In +this paper, we propose a novel approach for single-shot novel view synthesis, a +semantic-preserving generative warping framework that enables T2I generative +models to learn where to warp and where to generate, through augmenting +cross-view attention with self-attention. Our approach addresses the +limitations of existing methods by conditioning the generative model on source +view images and incorporating geometric warping signals. Qualitative and +quantitative evaluations demonstrate that our model outperforms existing +methods in both in-domain and out-of-domain scenarios. Project page is +available at https://GenWarp-NVS.github.io/. + +
+
+ comment: Accepted to NeurIPS 2024 / Project page: + https://GenWarp-NVS.github.io +
+
+
+
+
+ + ♻ ☆ EPTQ: Enhanced Post-Training Quantization via Hessian-guided + Network-wise Optimization + + +
+ Quantization is a key method for deploying deep neural networks on edge +devices with limited memory and computation resources. Recent improvements in +Post-Training Quantization (PTQ) methods were achieved by an additional local +optimization process for learning the weight quantization rounding policy. +However, a gap exists when employing network-wise optimization with small +representative datasets. In this paper, we propose a new method for enhanced +PTQ (EPTQ) that employs a network-wise quantization optimization process, which +benefits from considering cross-layer dependencies during optimization. EPTQ +enables network-wise optimization with a small representative dataset using a +novel sample-layer attention score based on a label-free Hessian matrix upper +bound. The label-free approach makes our method suitable for the PTQ scheme. We +give a theoretical analysis for the said bound and use it to construct a +knowledge distillation loss that guides the optimization to focus on the more +sensitive layers and samples. In addition, we leverage the Hessian upper bound +to improve the weight quantization parameters selection by focusing on the more +sensitive elements in the weight tensors. Empirically, by employing EPTQ we +achieve state-of-the-art results on various models, tasks, and datasets, +including ImageNet classification, COCO object detection, and Pascal-VOC for +semantic segmentation. + +
+
+
+
+
+ + ♻ ☆ Masks and Boxes: Combining the Best of Both Worlds for Multi-Object + Tracking + + +
+ Multi-object tracking (MOT) involves identifying and consistently tracking +objects across video sequences. Traditional tracking-by-detection methods, +while effective, often require extensive tuning and lack generalizability. On +the other hand, segmentation mask-based methods are more generic but struggle +with tracking management, making them unsuitable for MOT. We propose a novel +approach, McByte, which incorporates a temporally propagated segmentation mask +as a strong association cue within a tracking-by-detection framework. By +combining bounding box and mask information, McByte enhances robustness and +generalizability without per-sequence tuning. Evaluated on four benchmark +datasets - DanceTrack, MOT17, SoccerNet-tracking 2022, and KITTI-tracking - +McByte demonstrates performance gain in all cases examined. At the same time, +it outperforms existing mask-based methods. Implementation code will be +provided upon acceptance. + +
+
+
+
+
+ + ♻ ☆ HER2 and FISH Status Prediction in Breast Biopsy H&E-Stained Images + Using Deep Learning + + +
+ The current standard for detecting human epidermal growth factor receptor 2 +(HER2) status in breast cancer patients relies on HER2 amplification, +identified through fluorescence in situ hybridization (FISH) or +immunohistochemistry (IHC). However, hematoxylin and eosin (H\&E) tumor stains +are more widely available, and accurately predicting HER2 status using H\&E +could reduce costs and expedite treatment selection. Deep Learning algorithms +for H&E have shown effectiveness in predicting various cancer features and +clinical outcomes, including moderate success in HER2 status prediction. In +this work, we employed a customized weak supervision classification technique +combined with MoCo-v2 contrastive learning to predict HER2 status. We trained +our pipeline on 182 publicly available H&E Whole Slide Images (WSIs) from The +Cancer Genome Atlas (TCGA), for which annotations by the pathology team at Yale +School of Medicine are publicly available. Our pipeline achieved an Area Under +the Curve (AUC) of 0.85 across four different test folds. Additionally, we +tested our model on 44 H&E slides from the TCGA-BRCA dataset, which had an HER2 +score of 2+ and included corresponding HER2 status and FISH test results. These +cases are considered equivocal for IHC, requiring an expensive FISH test on +their IHC slides for disambiguation. Our pipeline demonstrated an AUC of 0.81 +on these challenging H&E slides. Reducing the need for FISH test can have +significant implications in cancer treatment equity for underserved +populations. + +
+
+
+
+
+ + ♻ ☆ FruitNeRF: A Unified Neural Radiance Field based Fruit Counting + Framework + + +
+ We introduce FruitNeRF, a unified novel fruit counting framework that +leverages state-of-the-art view synthesis methods to count any fruit type +directly in 3D. Our framework takes an unordered set of posed images captured +by a monocular camera and segments fruit in each image. To make our system +independent of the fruit type, we employ a foundation model that generates +binary segmentation masks for any fruit. Utilizing both modalities, RGB and +semantic, we train a semantic neural radiance field. Through uniform volume +sampling of the implicit Fruit Field, we obtain fruit-only point clouds. By +applying cascaded clustering on the extracted point cloud, our approach +achieves precise fruit count.The use of neural radiance fields provides +significant advantages over conventional methods such as object tracking or +optical flow, as the counting itself is lifted into 3D. Our method prevents +double counting fruit and avoids counting irrelevant fruit.We evaluate our +methodology using both real-world and synthetic datasets. The real-world +dataset consists of three apple trees with manually counted ground truths, a +benchmark apple dataset with one row and ground truth fruit location, while the +synthetic dataset comprises various fruit types including apple, plum, lemon, +pear, peach, and mango.Additionally, we assess the performance of fruit +counting using the foundation model compared to a U-Net. + +
+
+ comment: Project Page: https://meyerls.github.io/fruit_nerf/ +
+
+
+
+
+ + ♻ ☆ Improving Fast Adversarial Training Paradigm: An Example Taxonomy + Perspective + + +
+ While adversarial training is an effective defense method against adversarial +attacks, it notably increases the training cost. To this end, fast adversarial +training (FAT) is presented for efficient training and has become a hot +research topic. However, FAT suffers from catastrophic overfitting, which leads +to a performance drop compared with multi-step adversarial training. However, +the cause of catastrophic overfitting remains unclear and lacks exploration. In +this paper, we present an example taxonomy in FAT, which identifies that +catastrophic overfitting is caused by the imbalance between the inner and outer +optimization in FAT. Furthermore, we investigated the impact of varying degrees +of training loss, revealing a correlation between training loss and +catastrophic overfitting. Based on these observations, we redesign the loss +function in FAT with the proposed dynamic label relaxation to concentrate the +loss range and reduce the impact of misclassified examples. Meanwhile, we +introduce batch momentum initialization to enhance the diversity to prevent +catastrophic overfitting in an efficient manner. Furthermore, we also propose +Catastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate +training strategy for examples based on their loss degree. Our proposed method, +named example taxonomy aware FAT (ETA), establishes an improved paradigm for +FAT. Experiment results demonstrate our ETA achieves state-of-the-art +performance. Comprehensive experiments on four standard datasets demonstrate +the competitiveness of our proposed method. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Efficient Video Object Segmentation via Modulated Cross-Attention Memory WACV 2025 + + +
+ Recently, transformer-based approaches have shown promising results for +semi-supervised video object segmentation. However, these approaches typically +struggle on long videos due to increased GPU memory demands, as they frequently +expand the memory bank every few frames. We propose a transformer-based +approach, named MAVOS, that introduces an optimized and dynamic long-term +modulated cross-attention (MCA) memory to model temporal smoothness without +requiring frequent memory expansion. The proposed MCA effectively encodes both +local and global features at various levels of granularity while efficiently +maintaining consistent speed regardless of the video length. Extensive +experiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017, +demonstrate the effectiveness of our proposed contributions leading to +real-time inference and markedly reduced memory demands without any degradation +in segmentation accuracy on long videos. Compared to the best existing +transformer-based approach, our MAVOS increases the speed by 7.6x, while +significantly reducing the GPU memory by 87% with comparable segmentation +performance on short and long video datasets. Notably on the LVOS dataset, our +MAVOS achieves a J&F score of 63.3% while operating at 37 frames per second +(FPS) on a single V100 GPU. Our code and models will be publicly available at: +https://github.com/Amshaker/MAVOS. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor + Reconstruction + + +
+ Neural implicit reconstruction via volume rendering has demonstrated its +effectiveness in recovering dense 3D surfaces. However, it is non-trivial to +simultaneously recover meticulous geometry and preserve smoothness across +regions with differing characteristics. To address this issue, previous methods +typically employ geometric priors, which are often constrained by the +performance of the prior models. In this paper, we propose ND-SDF, which learns +a Normal Deflection field to represent the angular deviation between the scene +normal and the prior normal. Unlike previous methods that uniformly apply +geometric priors on all samples, introducing significant bias in accuracy, our +proposed normal deflection field dynamically learns and adapts the utilization +of samples based on their specific characteristics, thereby improving both the +accuracy and effectiveness of the model. Our method not only obtains smooth +weakly textured regions such as walls and floors but also preserves the +geometric details of complex structures. In addition, we introduce a novel ray +sampling strategy based on the deflection angle to facilitate the unbiased +rendering process, which significantly improves the quality and accuracy of +intricate surfaces, especially on thin structures. Consistent improvements on +various challenging datasets demonstrate the superiority of our method. + +
+
+
+
+
+ + ♻ ☆ VideoPatchCore: An Effective Method to Memorize Normality for Video + Anomaly Detection ACCV 2024 + + +
+ Video anomaly detection (VAD) is a crucial task in video analysis and +surveillance within computer vision. Currently, VAD is gaining attention with +memory techniques that store the features of normal frames. The stored features +are utilized for frame reconstruction, identifying an abnormality when a +significant difference exists between the reconstructed and input frames. +However, this approach faces several challenges due to the simultaneous +optimization required for both the memory and encoder-decoder model. These +challenges include increased optimization difficulty, complexity of +implementation, and performance variability depending on the memory size. To +address these challenges,we propose an effective memory method for VAD, called +VideoPatchCore. Inspired by PatchCore, our approach introduces a structure that +prioritizes memory optimization and configures three types of memory tailored +to the characteristics of video data. This method effectively addresses the +limitations of existing memory-based methods, achieving good performance +comparable to state-of-the-art methods. Furthermore, our method requires no +training and is straightforward to implement, making VAD tasks more accessible. +Our code is available online at github.com/SkiddieAhn/Paper-VideoPatchCore. + +
+
+ comment: Accepted to ACCV 2024 +
+
+
+
+
+ + ♻ ☆ AsyncDiff: Parallelizing Diffusion Models by Asynchronous Denoising NeurIPS 2024 + + +
+ Diffusion models have garnered significant interest from the community for +their great generative ability across various applications. However, their +typical multi-step sequential-denoising nature gives rise to high cumulative +latency, thereby precluding the possibilities of parallel computation. To +address this, we introduce AsyncDiff, a universal and plug-and-play +acceleration scheme that enables model parallelism across multiple devices. Our +approach divides the cumbersome noise prediction model into multiple +components, assigning each to a different device. To break the dependency chain +between these components, it transforms the conventional sequential denoising +into an asynchronous process by exploiting the high similarity between hidden +states in consecutive diffusion steps. Consequently, each component is +facilitated to compute in parallel on separate devices. The proposed strategy +significantly reduces inference latency while minimally impacting the +generative quality. Specifically, for the Stable Diffusion v2.1, AsyncDiff +achieves a 2.7x speedup with negligible degradation and a 4.0x speedup with +only a slight reduction of 0.38 in CLIP Score, on four NVIDIA A5000 GPUs. Our +experiments also demonstrate that AsyncDiff can be readily applied to video +diffusion models with encouraging performances. The code is available at +https://github.com/czg1225/AsyncDiff. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SlimSAM: 0.1% Data Makes Segment Anything Slim NeurIPS 2024 + + +
+ Current approaches for compressing the Segment Anything Model (SAM) yield +commendable results, yet necessitate extensive data to train a new network from +scratch. Employing conventional pruning techniques can remarkably reduce data +requirements but would suffer from a degradation in performance. To address +this challenging trade-off, we introduce SlimSAM, a novel data-efficient SAM +compression method that achieves superior performance with extremely less +training data. The essence of SlimSAM is encapsulated in the alternate slimming +framework which effectively enhances knowledge inheritance under severely +limited training data availability and exceptional pruning ratio. Diverging +from prior techniques, our framework progressively compresses the model by +alternately pruning and distilling distinct, decoupled sub-structures. +Disturbed Taylor pruning is also proposed to address the misalignment between +the pruning objective and training target, thereby boosting the +post-distillation after pruning. SlimSAM yields significant performance +improvements while demanding over 10 times less training data than any other +existing compression methods. Even when compared to the original SAM, SlimSAM +achieves approaching performance while reducing parameter counts to merely 1.4% +(9.1M), MACs to 0.8% (23G), and requiring only 0.1% (10k) of the SAM training +data. The code is available at http://github.com/czg1225/SlimSAM. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Text-Guided Single Image Editing for Remote Sensing Images + + +
+ Artificial intelligence generative content (AIGC) has significantly impacted +image generation in the field of remote sensing. However, the equally important +area of remote sensing image (RSI) editing has not received sufficient +attention. Deep learning based editing methods generally involve two sequential +stages: generation and editing. During the generation stage, consistency in +content and details between the original and edited images must be maintained, +while in the editing stage, controllability and accuracy of the edits should be +ensured. For natural images, these challenges can be tackled by training +generative backbones on large-scale benchmark datasets and using text guidance +based on vision-language models (VLMs). However, these previously effective +approaches become less viable for RSIs due to two reasons: First, existing +generative RSI benchmark datasets do not fully capture the diversity of remote +sensing scenarios, particularly in terms of variations in sensors, object +types, and resolutions. Consequently, the generalization capacity of the +trained backbone model is often inadequate for universal editing tasks on RSIs. +Second, the large spatial resolution of RSIs exacerbates the problem in VLMs +where a single text semantic corresponds to multiple image semantics, leading +to the introduction of incorrect semantics when using text to guide RSI +editing. To solve above problems, this paper proposes a text-guided RSI editing +method that is controllable but stable, and can be trained using only a single +image. It adopts a multi-scale training approach to preserve consistency +without the need for training on extensive benchmark datasets, while leveraging +RSI pre-trained VLMs and prompt ensembling (PE) to ensure accuracy and +controllability in the text-guided editing process. + +
+
+ comment: 14 pages, 14 figures, submitted to IEEE Transactions on Geoscience + and Remote Sensing +
+
+
+
+
+ + ♻ ☆ Learning 3D-Aware GANs from Unposed Images with Template Feature Field + + +
+ Collecting accurate camera poses of training images has been shown to well +serve the learning of 3D-aware generative adversarial networks (GANs) yet can +be quite expensive in practice. This work targets learning 3D-aware GANs from +unposed images, for which we propose to perform on-the-fly pose estimation of +training images with a learned template feature field (TeFF). Concretely, in +addition to a generative radiance field as in previous approaches, we ask the +generator to also learn a field from 2D semantic features while sharing the +density from the radiance field. Such a framework allows us to acquire a +canonical 3D feature template leveraging the dataset mean discovered by the +generative model, and further efficiently estimate the pose parameters on real +data. Experimental results on various challenging datasets demonstrate the +superiority of our approach over state-of-the-art alternatives from both the +qualitative and the quantitative perspectives. + +
+
+ comment: https://XDimlab.github.io/TeFF +
+
+
+
+
+ + ♻ ☆ Regional quality estimation for echocardiography using deep learning + + +
+ Automatic estimation of cardiac ultrasound image quality can be beneficial +for guiding operators and ensuring the accuracy of clinical measurements. +Previous work often fails to distinguish the view correctness of the +echocardiogram from the image quality. Additionally, previous studies only +provide a global image quality value, which limits their practical utility. In +this work, we developed and compared three methods to estimate image quality: +1) classic pixel-based metrics like the generalized contrast-to-noise ratio +(gCNR) on myocardial segments as region of interest and left ventricle lumen as +background, obtained using a U-Net segmentation 2) local image coherence +derived from a U-Net model that predicts coherence from B-Mode images 3) a deep +convolutional network that predicts the quality of each region directly in an +end-to-end fashion. We evaluate each method against manual regional image +quality annotations by three experienced cardiologists. The results indicate +poor performance of the gCNR metric, with Spearman correlation to the +annotations of rho = 0.24. The end-to-end learning model obtains the best +result, rho = 0.69, comparable to the inter-observer correlation, rho = 0.63. +Finally, the coherence-based method, with rho = 0.58, outperformed the +classical metrics and is more generic than the end-to-end approach. The image +quality prediction tool is available as an open source Python library at +https://github.com/GillesVanDeVyver/arqee. + +
+
+
+
+
+ + ♻ ☆ High-throughput 3D shape completion of potato tubers on a harvester + + +
+ Potato yield is an important metric for farmers to further optimize their +cultivation practices. Potato yield can be estimated on a harvester using an +RGB-D camera that can estimate the three-dimensional (3D) volume of individual +potato tubers. A challenge, however, is that the 3D shape derived from RGB-D +images is only partially completed, underestimating the actual volume. To +address this issue, we developed a 3D shape completion network, called CoRe++, +which can complete the 3D shape from RGB-D images. CoRe++ is a deep learning +network that consists of a convolutional encoder and a decoder. The encoder +compresses RGB-D images into latent vectors that are used by the decoder to +complete the 3D shape using the deep signed distance field network (DeepSDF). +To evaluate our CoRe++ network, we collected partial and complete 3D point +clouds of 339 potato tubers on an operational harvester in Japan. On the 1425 +RGB-D images in the test set (representing 51 unique potato tubers), our +network achieved a completion accuracy of 2.8 mm on average. For volumetric +estimation, the root mean squared error (RMSE) was 22.6 ml, and this was better +than the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml). +We found that the RMSE can be further reduced to 18.2 ml when performing the 3D +shape completion in the center of the RGB-D image. With an average 3D shape +completion time of 10 milliseconds per tuber, we can conclude that CoRe++ is +both fast and accurate enough to be implemented on an operational harvester for +high-throughput potato yield estimation. Our method can also be applied to +other tuber, fruit and vegetable crops, thereby enabling versatile, accurate +and real-time yield monitoring in precision agriculture. Our code, network +weights and dataset are publicly available at +https://github.com/UTokyo-FieldPhenomics-Lab/corepp.git. + +
+
+ comment: 20 pages, 11 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization + of Diffusion Models + + +
+ Diffusion models have achieved great success in image generation tasks +through iterative noise estimation. However, the heavy denoising process and +complex neural networks hinder their low-latency applications in real-world +scenarios. Quantization can effectively reduce model complexity, and +post-training quantization (PTQ), which does not require fine-tuning, is highly +promising for compressing and accelerating diffusion models. Unfortunately, we +find that due to the highly dynamic distribution of activations in different +denoising steps, existing PTQ methods for diffusion models suffer from +distribution mismatch issues at both calibration sample level and +reconstruction output level, which makes the performance far from satisfactory, +especially in low-bit cases. In this paper, we propose Enhanced Distribution +Alignment for Post-Training Quantization of Diffusion Models (EDA-DM) to +address the above issues. Specifically, at the calibration sample level, we +select calibration samples based on the density and variety in the latent +space, thus facilitating the alignment of their distribution with the overall +samples; and at the reconstruction output level, we modify the loss of block +reconstruction with the losses of layers, aligning the outputs of quantized +model and full-precision model at different network granularity. Extensive +experiments demonstrate that EDA-DM significantly outperforms the existing PTQ +methods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and +different datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO). + +
+
+ comment: Code: http://github.com/BienLuky/EDA-DM +
+
+
+
+
+
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Open-World Evaluation for Retrieving Diverse Perspectives + + +
+ We study retrieving a set of documents that covers various perspectives on a +complex and contentious question (e.g., will ChatGPT do more harm than good?). +We curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS), +where each example consists of a question and diverse perspectives associated +with the question, sourced from survey questions and debate websites. On this +data, retrievers paired with a corpus are evaluated to surface a document set +that contains diverse perspectives. Our framing diverges from most retrieval +tasks in that document relevancy cannot be decided by simple string matches to +references. Instead, we build a language model based automatic evaluator that +decides whether each retrieved document contains a perspective. This allows us +to evaluate the performance of three different types of corpus (Wikipedia, web +snapshot, and corpus constructed on the fly with retrieved pages from the +search engine) paired with retrievers. Retrieving diverse documents remains +challenging, with the outputs from existing retrievers covering all +perspectives on only 33.74% of the examples. We further study the impact of +query expansion and diversity-focused reranking approaches and analyze +retriever sycophancy. Together, we lay the foundation for future studies in +retrieval diversity handling complex queries. + +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ Report on the Workshop on Simulations for Information Access (Sim4IA + 2024) at SIGIR 2024 SIGIR + + +
+ This paper is a report of the Workshop on Simulations for Information Access +(Sim4IA) workshop at SIGIR 2024. The workshop had two keynotes, a panel +discussion, nine lightning talks, and two breakout sessions. Key takeaways were +user simulation's importance in academia and industry, the possible bridging of +online and offline evaluation, and the issues of organizing a companion shared +task around user simulations for information access. We report on how we +organized the workshop, provide a brief overview of what happened at the +workshop, and summarize the main topics and findings of the workshop and future +work. + +
+
+ comment: Preprint of a SIGIR Forum submission for Vol. 58 No. 2 - December + 2024 +
+
+
+
+
+ + ☆ Enhancing Tourism Recommender Systems for Sustainable City Trips Using + Retrieval-Augmented Generation RecSys 2024 + + +
+ Tourism Recommender Systems (TRS) have traditionally focused on providing +personalized travel suggestions, often prioritizing user preferences without +considering broader sustainability goals. Integrating sustainability into TRS +has become essential with the increasing need to balance environmental impact, +local community interests, and visitor satisfaction. This paper proposes a +novel approach to enhancing TRS for sustainable city trips using Large Language +Models (LLMs) and a modified Retrieval-Augmented Generation (RAG) pipeline. We +enhance the traditional RAG system by incorporating a sustainability metric +based on a city's popularity and seasonal demand during the prompt augmentation +phase. This modification, called Sustainability Augmented Reranking (SAR), +ensures the system's recommendations align with sustainability goals. +Evaluations using popular open-source LLMs, such as Llama-3.1-Instruct-8B and +Mistral-Instruct-7B, demonstrate that the SAR-enhanced approach consistently +matches or outperforms the baseline (without SAR) across most metrics, +highlighting the benefits of incorporating sustainability into TRS. + +
+
+ comment: Accepted at the RecSoGood 2024 Workshop co-located with the 18th ACM + Conference on Recommender Systems (RecSys 2024) +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios RecSys '24 + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ Value Identification in Multistakeholder Recommender Systems for + Humanities and Historical Research: The Case of the Digital Archive + Monasterium.net RecSys 2024 + + +
+ Recommender systems remain underutilized in humanities and historical +research, despite their potential to enhance the discovery of cultural records. +This paper offers an initial value identification of the multiple stakeholders +that might be impacted by recommendations in Monasterium.net, a digital archive +for historical legal documents. Specifically, we discuss the diverse values and +objectives of its stakeholders, such as editors, aggregators, platform owners, +researchers, publishers, and funding agencies. These in-depth insights into the +potentially conflicting values of stakeholder groups allow designing and +adapting recommender systems to enhance their usefulness for humanities and +historical research. Additionally, our findings will support deeper engagement +with additional stakeholders to refine value models and evaluation metrics for +recommender systems in the given domains. Our conclusions are embedded in and +applicable to other digital archives and a broader cultural heritage context. + +
+
+ comment: To be presented at: NORMalize 2024: The Second Workshop on the + Normative Design and Evaluation of Recommender Systems, October 18, 2024, + co-located with the ACM Conference on Recommender Systems 2024 (RecSys 2024), + Bari, Italy +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Autoregressive Generation Strategies for Top-K Sequential + Recommendations + + +
+ The goal of modern sequential recommender systems is often formulated in +terms of next-item prediction. In this paper, we explore the applicability of +generative transformer-based models for the Top-K sequential recommendation +task, where the goal is to predict items a user is likely to interact with in +the "near future". + We explore commonly used autoregressive generation strategies, including +greedy decoding, beam search, and temperature sampling, to evaluate their +performance for the Top-K sequential recommendation task. In addition, we +propose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA) +generation strategies based on multi-sequence generation with temperature +sampling and subsequent aggregation. + Experiments on diverse datasets give valuable insights regarding commonly +used strategies' applicability and show that suggested approaches improve +performance on longer time horizons compared to widely-used Top-K prediction +approach and single-sequence autoregressive generation strategies. + +
+
+
+
+
+ + ☆ Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation + + +
+ News recommendation is a challenging task that involves personalization based +on the interaction history and preferences of each user. Recent works have +leveraged the power of pretrained language models (PLMs) to directly rank news +items by using inference approaches that predominately fall into three +categories: pointwise, pairwise, and listwise learning-to-rank. While pointwise +methods offer linear inference complexity, they fail to capture crucial +comparative information between items that is more effective for ranking tasks. +Conversely, pairwise and listwise approaches excel at incorporating these +comparisons but suffer from practical limitations: pairwise approaches are +either computationally expensive or lack theoretical guarantees, and listwise +methods often perform poorly in practice. In this paper, we propose a novel +framework for PLM-based news recommendation that integrates both pointwise +relevance prediction and pairwise comparisons in a scalable manner. We present +a rigorous theoretical analysis of our framework, establishing conditions under +which our approach guarantees improved performance. Extensive experiments show +that our approach outperforms the state-of-the-art methods on the MIND and +Adressa news recommendation datasets. + +
+
+
+
+
+ + ☆ Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case + Study + + +
+ Extracting meaningful insights from large and complex datasets poses +significant challenges, particularly in ensuring the accuracy and relevance of +retrieved information. Traditional data retrieval methods such as sequential +search and index-based retrieval often fail when handling intricate and +interconnected data structures, resulting in incomplete or misleading outputs. +To overcome these limitations, we introduce Structured-GraphRAG, a versatile +framework designed to enhance information retrieval across structured datasets +in natural language queries. Structured-GraphRAG utilizes multiple knowledge +graphs, which represent data in a structured format and capture complex +relationships between entities, enabling a more nuanced and comprehensive +retrieval of information. This graph-based approach reduces the risk of errors +in language model outputs by grounding responses in a structured format, +thereby enhancing the reliability of results. We demonstrate the effectiveness +of Structured-GraphRAG by comparing its performance with that of a recently +published method using traditional retrieval-augmented generation. Our findings +show that Structured-GraphRAG significantly improves query processing +efficiency and reduces response times. While our case study focuses on soccer +data, the framework's design is broadly applicable, offering a powerful tool +for data analysis and enhancing language model applications across various +structured domains. + +
+
+
+
+
+ + ☆ Improving the Shortest Plank: Vulnerability-Aware Adversarial Training + for Robust Recommender System + + +
+ Recommender systems play a pivotal role in mitigating information overload in +various fields. Nonetheless, the inherent openness of these systems introduces +vulnerabilities, allowing attackers to insert fake users into the system's +training data to skew the exposure of certain items, known as poisoning +attacks. Adversarial training has emerged as a notable defense mechanism +against such poisoning attacks within recommender systems. Existing adversarial +training methods apply perturbations of the same magnitude across all users to +enhance system robustness against attacks. Yet, in reality, we find that +attacks often affect only a subset of users who are vulnerable. These +perturbations of indiscriminate magnitude make it difficult to balance +effective protection for vulnerable users without degrading recommendation +quality for those who are not affected. To address this issue, our research +delves into understanding user vulnerability. Considering that poisoning +attacks pollute the training data, we note that the higher degree to which a +recommender system fits users' training data correlates with an increased +likelihood of users incorporating attack information, indicating their +vulnerability. Leveraging these insights, we introduce the Vulnerability-aware +Adversarial Training (VAT), designed to defend against poisoning attacks in +recommender systems. VAT employs a novel vulnerability-aware function to +estimate users' vulnerability based on the degree to which the system fits +them. Guided by this estimation, VAT applies perturbations of adaptive +magnitude to each user, not only reducing the success ratio of attacks but also +preserving, and potentially enhancing, the quality of recommendations. +Comprehensive experiments confirm VAT's superior defensive capabilities across +different recommendation models and against various types of attacks. + +
+
+
+
+
+ + ☆ Towards More Relevant Product Search Ranking Via Large Language Models: + An Empirical Study CIKM 2024 + + +
+ Training Learning-to-Rank models for e-commerce product search ranking can be +challenging due to the lack of a gold standard of ranking relevance. In this +paper, we decompose ranking relevance into content-based and engagement-based +aspects, and we propose to leverage Large Language Models (LLMs) for both label +and feature generation in model training, primarily aiming to improve the +model's predictive capability for content-based relevance. Additionally, we +introduce different sigmoid transformations on the LLM outputs to polarize +relevance scores in labeling, enhancing the model's ability to balance +content-based and engagement-based relevances and thus prioritize highly +relevant items overall. Comprehensive online tests and offline evaluations are +also conducted for the proposed design. Our work sheds light on advanced +strategies for integrating LLMs into e-commerce product search ranking model +training, offering a pathway to more effective and balanced models with +improved ranking relevance. + +
+
+ comment: To be published in CIKM 2024 GenAIECommerce Workshop +
+
+
+
+
+ + ☆ Long or Short or Both? An Exploration on Lookback Time Windows of + Behavioral Features in Product Search Ranking SIGIR + + +
+ Customer shopping behavioral features are core to product search ranking +models in eCommerce. In this paper, we investigate the effect of lookback time +windows when aggregating these features at the (query, product) level over +history. By studying the pros and cons of using long and short time windows, we +propose a novel approach to integrating these historical behavioral features of +different time windows. In particular, we address the criticality of using +query-level vertical signals in ranking models to effectively aggregate all +information from different behavioral features. Anecdotal evidence for the +proposed approach is also provided using live product search traffic on +Walmart.com. + +
+
+ comment: Published in ACM SIGIR Workshop on eCommerce 2024 +
+
+
+
+
+ + ☆ Minimizing Live Experiments in Recommender Systems: User Simulation to + Evaluate Preference Elicitation Policies + + +
+ Evaluation of policies in recommender systems typically involves A/B testing +using live experiments on real users to assess a new policy's impact on +relevant metrics. This ``gold standard'' comes at a high cost, however, in +terms of cycle time, user cost, and potential user retention. In developing +policies for ``onboarding'' new users, these costs can be especially +problematic, since on-boarding occurs only once. In this work, we describe a +simulation methodology used to augment (and reduce) the use of live +experiments. We illustrate its deployment for the evaluation of ``preference +elicitation'' algorithms used to onboard new users of the YouTube Music +platform. By developing counterfactually robust user behavior models, and a +simulation service that couples such models with production infrastructure, we +are able to test new algorithms in a way that reliably predicts their +performance on key metrics when deployed live. We describe our domain, our +simulation models and platform, results of experiments and deployment, and +suggest future steps needed to further realistic simulation as a powerful +complement to live experiments. + +
+
+
+
+
+ + ♻ ☆ Language agents achieve superhuman synthesis of scientific knowledge + + +
+ Language models are known to hallucinate incorrect information, and it is +unclear if they are sufficiently accurate and reliable for use in scientific +research. We developed a rigorous human-AI comparison methodology to evaluate +language model agents on real-world literature search tasks covering +information retrieval, summarization, and contradiction detection tasks. We +show that PaperQA2, a frontier language model agent optimized for improved +factuality, matches or exceeds subject matter expert performance on three +realistic literature research tasks without any restrictions on humans (i.e., +full access to internet, search tools, and time). PaperQA2 writes cited, +Wikipedia-style summaries of scientific topics that are significantly more +accurate than existing, human-written Wikipedia articles. We also introduce a +hard benchmark for scientific literature research called LitQA2 that guided +design of PaperQA2, leading to it exceeding human performance. Finally, we +apply PaperQA2 to identify contradictions within the scientific literature, an +important scientific task that is challenging for humans. PaperQA2 identifies +2.34 +/- 1.99 contradictions per paper in a random subset of biology papers, of +which 70% are validated by human experts. These results demonstrate that +language model agents are now capable of exceeding domain experts across +meaningful tasks on scientific literature. + +
+
+
+
+
+ + ♻ ☆ Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of + Anomalous Behavior in Bio-regenerative Life Support System Telemetry ECML + + +
+ The detection of abnormal or critical system states is essential in condition +monitoring. While much attention is given to promptly identifying anomalies, a +retrospective analysis of these anomalies can significantly enhance our +comprehension of the underlying causes of observed undesired behavior. This +aspect becomes particularly critical when the monitored system is deployed in a +vital environment. In this study, we delve into anomalies within the domain of +Bio-Regenerative Life Support Systems (BLSS) for space exploration and analyze +anomalies found in telemetry data stemming from the EDEN ISS space greenhouse +in Antarctica. We employ time series clustering on anomaly detection results to +categorize various types of anomalies in both uni- and multivariate settings. +We then assess the effectiveness of these methods in identifying systematic +anomalous behavior. Additionally, we illustrate that the anomaly detection +methods MDI and DAMP produce complementary results, as previously indicated by +research. + +
+
+ comment: 12 pages, + Supplemental Materials, Published at Machine Learning and + Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Modeling and Analyzing the Influence of Non-Item Pages on Sequential + Next-Item Prediction + + +
+ Analyzing sequences of interactions between users and items, sequential +recommendation models can learn user intent and make predictions about the next +item. Next to item interactions, most systems also have interactions with what +we call non-item pages: these pages are not related to specific items but still +can provide insights of the user's interests, as, for example, navigation +pages. + We therefore propose a general way to include these non-item pages in +sequential recommendation models to enhance next-item prediction. First, we +demonstrate the influence of non-item pages on following interactions with the +hypotheses testing framework HypTrails and propose methods for representing +non-item pages in sequential recommendation models. Subsequently, we adapt +popular sequential recommender models to integrate non-item pages and +investigate their performance with different item representation strategies as +well as their ability to handle noisy data. To show the general capabilities of +the models to integrate non-item pages, we create a synthetic dataset for a +controlled setting and then evaluate the improvements from including non-item +pages on two real-world datasets. + Our results show that non-item pages are a valuable source of information, +and incorporating them in sequential recommendation models increases the +performance of next-item prediction across all analyzed model architectures. + +
+
+ comment: 37 pages, 19 figures; Submitted to ACM TORS +
+
+
+
+
+ + ♻ ☆ CHIQ: Contextual History Enhancement for Improving Query Rewriting in + Conversational Search EMNLP 2024 + + +
+ In this paper, we study how open-source large language models (LLMs) can be +effectively deployed for improving query rewriting in conversational search, +especially for ambiguous queries. We introduce CHIQ, a two-step method that +leverages the capabilities of LLMs to resolve ambiguities in the conversation +history before query rewriting. This approach contrasts with prior studies that +predominantly use closed-source LLMs to directly generate search queries from +conversation history. We demonstrate on five well-established benchmarks that +CHIQ leads to state-of-the-art results across most settings, showing highly +competitive performances with systems leveraging closed-source LLMs. Our study +provides a first step towards leveraging open-source LLMs in conversational +search, as a competitive alternative to the prevailing reliance on commercial +LLMs. Data, models, and source code will be publicly available upon acceptance +at https://github.com/fengranMark/CHIQ. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ IRSC: A Zero-shot Evaluation Benchmark for Information Retrieval through + Semantic Comprehension in Retrieval-Augmented Generation Scenarios + + +
+ In Retrieval-Augmented Generation (RAG) tasks using Large Language Models +(LLMs), the quality of retrieved information is critical to the final output. +This paper introduces the IRSC benchmark for evaluating the performance of +embedding models in multilingual RAG tasks. The benchmark encompasses five +retrieval tasks: query retrieval, title retrieval, part-of-paragraph retrieval, +keyword retrieval, and summary retrieval. Our research addresses the current +lack of comprehensive testing and effective comparison methods for embedding +models in RAG scenarios. We introduced new metrics: the Similarity of Semantic +Comprehension Index (SSCI) and the Retrieval Capability Contest Index (RCCI), +and evaluated models such as Snowflake-Arctic, BGE, GTE, and M3E. Our +contributions include: 1) the IRSC benchmark, 2) the SSCI and RCCI metrics, and +3) insights into the cross-lingual limitations of embedding models. The IRSC +benchmark aims to enhance the understanding and development of accurate +retrieval systems in RAG tasks. All code and datasets are available at: +https://github.com/Jasaxion/IRSC_Benchmark + +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Multi-Domain CTR Prediction via Large Language + Models + + +
+ Click-Through Rate (CTR) prediction is a crucial task in online +recommendation platforms as it involves estimating the probability of user +engagement with advertisements or items by clicking on them. Given the +availability of various services like online shopping, ride-sharing, food +delivery, and professional services on commercial platforms, recommendation +systems in these platforms are required to make CTR predictions across multiple +domains rather than just a single domain. However, multi-domain click-through +rate (MDCTR) prediction remains a challenging task in online recommendation due +to the complex mutual influence between domains. Traditional MDCTR models +typically encode domains as discrete identifiers, ignoring rich semantic +information underlying. Consequently, they can hardly generalize to new +domains. Besides, existing models can be easily dominated by some specific +domains, which results in significant performance drops in the other domains +(i.e. the "seesaw phenomenon"). In this paper, we propose a novel solution +Uni-CTR to address the above challenges. Uni-CTR leverages a backbone Large +Language Model (LLM) to learn layer-wise semantic representations that capture +commonalities between domains. Uni-CTR also uses several domain-specific +networks to capture the characteristics of each domain. Note that we design a +masked loss strategy so that these domain-specific networks are decoupled from +backbone LLM. This allows domain-specific networks to remain unchanged when +incorporating new or removing domains, thereby enhancing the flexibility and +scalability of the system significantly. Experimental results on three public +datasets show that Uni-CTR outperforms the state-of-the-art (SOTA) MDCTR models +significantly. Furthermore, Uni-CTR demonstrates remarkable effectiveness in +zero-shot prediction. We have applied Uni-CTR in industrial scenarios, +confirming its efficiency. + +
+
+ comment: Accept By ACM TRANSACTIONS ON INFORMATION SYSTEMS(TOIS) +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Multi-View and Multi-Scale Alignment for Contrastive Language-Image + Pre-training in Mammography MICCAI 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) shows promise in medical image +analysis but requires substantial data and computational resources. Due to +these restrictions, existing CLIP applications in medical imaging focus mainly +on modalities like chest X-rays that have abundant image-report data available, +leaving many other important modalities under-explored. Here, we propose the +first adaptation of the full CLIP model to mammography, which presents +significant challenges due to labeled data scarcity, high-resolution images +with small regions of interest, and data imbalance. We first develop a +specialized supervision framework for mammography that leverages its multi-view +nature. Furthermore, we design a symmetric local alignment module to better +focus on detailed features in high-resolution images. Lastly, we incorporate a +parameter-efficient fine-tuning approach for large language models pre-trained +with medical knowledge to address data limitations. Our multi-view and +multi-scale alignment (MaMA) method outperforms state-of-the-art baselines for +three different tasks on two large real-world mammography datasets, EMBED and +RSNA-Mammo, with only 52% model size compared with the largest baseline. + +
+
+ comment: This work is also the basis of the overall best solution for the + MICCAI 2024 CXR-LT Challenge +
+
+
+
+
+ + ☆ Find Rhinos without Finding Rhinos: Active Learning with Multimodal + Imagery of South African Rhino Habitats IJCAI 2023 + + +
+ Much of Earth's charismatic megafauna is endangered by human activities, +particularly the rhino, which is at risk of extinction due to the poaching +crisis in Africa. Monitoring rhinos' movement is crucial to their protection +but has unfortunately proven difficult because rhinos are elusive. Therefore, +instead of tracking rhinos, we propose the novel approach of mapping communal +defecation sites, called middens, which give information about rhinos' spatial +behavior valuable to anti-poaching, management, and reintroduction efforts. +This paper provides the first-ever mapping of rhino midden locations by +building classifiers to detect them using remotely sensed thermal, RGB, and +LiDAR imagery in passive and active learning settings. As existing active +learning methods perform poorly due to the extreme class imbalance in our +dataset, we design MultimodAL, an active learning system employing a ranking +technique and multimodality to achieve competitive performance with passive +learning models with 94% fewer labels. Our methods could therefore save over 76 +hours in labeling time when used on a similarly-sized dataset. Unexpectedly, +our midden map reveals that rhino middens are not randomly distributed +throughout the landscape; rather, they are clustered. Consequently, rangers +should be targeted at areas with high midden densities to strengthen +anti-poaching efforts, in line with UN Target 15.7. + +
+
+ comment: 9 pages, 9 figures, IJCAI 2023 Special Track on AI for Good +
+
+
+
+
+ + ☆ MALPOLON: A Framework for Deep Species Distribution Modeling + + +
+ This paper describes a deep-SDM framework, MALPOLON. Written in Python and +built upon the PyTorch library, this framework aims to facilitate training and +inferences of deep species distribution models (deep-SDM) and sharing for users +with only general Python language skills (e.g., modeling ecologists) who are +interested in testing deep learning approaches to build new SDMs. More advanced +users can also benefit from the framework's modularity to run more specific +experiments by overriding existing classes while taking advantage of +press-button examples to train neural networks on multiple classification tasks +using custom or provided raw and pre-processed datasets. The framework is +open-sourced on GitHub and PyPi along with extensive documentation and examples +of use in various scenarios. MALPOLON offers straightforward installation, +YAML-based configuration, parallel computing, multi-GPU utilization, baseline +and foundational models for benchmarking, and extensive +tutorials/documentation, aiming to enhance accessibility and performance +scalability for ecologists and researchers. + +
+
+
+
+
+ + ☆ Self-supervised Pretraining for Cardiovascular Magnetic Resonance Cine + Segmentation MICCAI 2024 + + +
+ Self-supervised pretraining (SSP) has shown promising results in learning +from large unlabeled datasets and, thus, could be useful for automated +cardiovascular magnetic resonance (CMR) short-axis cine segmentation. However, +inconsistent reports of the benefits of SSP for segmentation have made it +difficult to apply SSP to CMR. Therefore, this study aimed to evaluate SSP +methods for CMR cine segmentation. + To this end, short-axis cine stacks of 296 subjects (90618 2D slices) were +used for unlabeled pretraining with four SSP methods; SimCLR, positional +contrastive learning, DINO, and masked image modeling (MIM). Subsets of varying +numbers of subjects were used for supervised fine-tuning of 2D models for each +SSP method, as well as to train a 2D baseline model from scratch. The +fine-tuned models were compared to the baseline using the 3D Dice similarity +coefficient (DSC) in a test dataset of 140 subjects. + The SSP methods showed no performance gains with the largest supervised +fine-tuning subset compared to the baseline (DSC = 0.89). When only 10 subjects +(231 2D slices) are available for supervised training, SSP using MIM (DSC = +0.86) improves over training from scratch (DSC = 0.82). + This study found that SSP is valuable for CMR cine segmentation when labeled +training data is scarce, but does not aid state-of-the-art deep learning +methods when ample labeled data is available. Moreover, the choice of SSP +method is important. The code is publicly available at: +https://github.com/q-cardIA/ssp-cmr-cine-segmentation + +
+
+ comment: Accepted to Data Engineering in Medical Imaging (DEMI) Workshop at + MICCAI 2024 +
+
+
+
+
+ + ☆ Infer Human's Intentions Before Following Natural Language Instructions + + +
+ For AI agents to be helpful to humans, they should be able to follow natural +language instructions to complete everyday cooperative tasks in human +environments. However, real human instructions inherently possess ambiguity, +because the human speakers assume sufficient prior knowledge about their hidden +goals and intentions. Standard language grounding and planning methods fail to +address such ambiguities because they do not model human internal goals as +additional partially observable factors in the environment. We propose a new +framework, Follow Instructions with Social and Embodied Reasoning (FISER), +aiming for better natural language instruction following in collaborative +embodied tasks. Our framework makes explicit inferences about human goals and +intentions as intermediate reasoning steps. We implement a set of +Transformer-based models and evaluate them over a challenging benchmark, +HandMeThat. We empirically demonstrate that using social reasoning to +explicitly infer human intentions before making action plans surpasses purely +end-to-end approaches. We also compare our implementation with strong +baselines, including Chain of Thought prompting on the largest available +pre-trained language models, and find that FISER provides better performance on +the embodied social reasoning tasks under investigation, reaching the +state-of-the-art on HandMeThat. + +
+
+
+
+
+ + ☆ Optimal Protocols for Continual Learning via Statistical Physics and + Control Theory + + +
+ Artificial neural networks often struggle with catastrophic forgetting when +learning multiple tasks sequentially, as training on new tasks degrades the +performance on previously learned ones. Recent theoretical work has addressed +this issue by analysing learning curves in synthetic frameworks under +predefined training protocols. However, these protocols relied on heuristics +and lacked a solid theoretical foundation assessing their optimality. In this +paper, we fill this gap combining exact equations for training dynamics, +derived using statistical physics techniques, with optimal control methods. We +apply this approach to teacher-student models for continual learning and +multi-task problems, obtaining a theory for task-selection protocols maximising +performance while minimising forgetting. Our theoretical analysis offers +non-trivial yet interpretable strategies for mitigating catastrophic +forgetting, shedding light on how optimal learning protocols can modulate +established effects, such as the influence of task similarity on forgetting. +Finally, we validate our theoretical findings on real-world data. + +
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ☆ Inverse Reinforcement Learning with Multiple Planning Horizons + + +
+ In this work, we study an inverse reinforcement learning (IRL) problem where +the experts are planning under a shared reward function but with different, +unknown planning horizons. Without the knowledge of discount factors, the +reward function has a larger feasible solution set, which makes it harder for +existing IRL approaches to identify a reward function. To overcome this +challenge, we develop algorithms that can learn a global multi-agent reward +function with agent-specific discount factors that reconstruct the expert +policies. We characterize the feasible solution space of the reward function +and discount factors for both algorithms and demonstrate the generalizability +of the learned reward function across multiple domains. + +
+
+ comment: Accepted at RLC 2024 +
+
+
+
+
+ + ☆ Revisit Anything: Visual Place Recognition via Image Segment Retrieval ECCV 2024 + + +
+ Accurately recognizing a revisited place is crucial for embodied agents to +localize and navigate. This requires visual representations to be distinct, +despite strong variations in camera viewpoint and scene appearance. Existing +visual place recognition pipelines encode the "whole" image and search for +matches. This poses a fundamental challenge in matching two images of the same +place captured from different camera viewpoints: "the similarity of what +overlaps can be dominated by the dissimilarity of what does not overlap". We +address this by encoding and searching for "image segments" instead of the +whole images. We propose to use open-set image segmentation to decompose an +image into `meaningful' entities (i.e., things and stuff). This enables us to +create a novel image representation as a collection of multiple overlapping +subgraphs connecting a segment with its neighboring segments, dubbed +SuperSegment. Furthermore, to efficiently encode these SuperSegments into +compact vector representations, we propose a novel factorized representation of +feature aggregation. We show that retrieving these partial representations +leads to significantly higher recognition recall than the typical whole image +based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new +state-of-the-art in place recognition on a diverse selection of benchmark +datasets, while being applicable to both generic and task-specialized image +encoders. Finally, we demonstrate the potential of our method to ``revisit +anything'' by evaluating our method on an object instance retrieval task, which +bridges the two disparate areas of research: visual place recognition and +object-goal navigation, through their common aim of recognizing goal objects +specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. + +
+
+ comment: Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures +
+
+
+
+
+ + ☆ IFCap: Image-like Retrieval and Frequency-based Entity Filtering for + Zero-shot Captioning EMNLP 2024 + + +
+ Recent advancements in image captioning have explored text-only training +methods to overcome the limitations of paired image-text data. However, +existing text-only training methods often overlook the modality gap between +using text data during training and employing images during inference. To +address this issue, we propose a novel approach called Image-like Retrieval, +which aligns text features with visually relevant features to mitigate the +modality gap. Our method further enhances the accuracy of generated captions by +designing a Fusion Module that integrates retrieved captions with input +features. Additionally, we introduce a Frequency-based Entity Filtering +technique that significantly improves caption quality. We integrate these +methods into a unified framework, which we refer to as IFCap +($\textbf{I}$mage-like Retrieval and $\textbf{F}$requency-based Entity +Filtering for Zero-shot $\textbf{Cap}$tioning). Through extensive +experimentation, our straightforward yet powerful approach has demonstrated its +efficacy, outperforming the state-of-the-art methods by a significant margin in +both image captioning and video captioning compared to zero-shot captioning +based on text-only training. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ FlowBench: A Large Scale Benchmark for Flow Simulation over Complex + Geometries + + +
+ Simulating fluid flow around arbitrary shapes is key to solving various +engineering problems. However, simulating flow physics across complex +geometries remains numerically challenging and computationally +resource-intensive, particularly when using conventional PDE solvers. Machine +learning methods offer attractive opportunities to create fast and adaptable +PDE solvers. However, benchmark datasets to measure the performance of such +methods are scarce, especially for flow physics across complex geometries. We +introduce FlowBench, a dataset for neural simulators with over 10K samples, +which is currently larger than any publicly available flow physics dataset. +FlowBench contains flow simulation data across complex geometries +(\textit{parametric vs. non-parametric}), spanning a range of flow conditions +(\textit{Reynolds number and Grashoff number}), capturing a diverse array of +flow phenomena (\textit{steady vs. transient; forced vs. free convection}), and +for both 2D and 3D. FlowBench contains over 10K data samples, with each sample +the outcome of a fully resolved, direct numerical simulation using a +well-validated simulator framework designed for modeling transport phenomena in +complex geometries. For each sample, we include velocity, pressure, and +temperature field data at 3 different resolutions and several summary +statistics features of engineering relevance (such as coefficients of lift and +drag, and Nusselt numbers). %Additionally, we include masks and signed distance +fields for each shape. We envision that FlowBench will enable evaluating the +interplay between complex geometry, coupled flow phenomena, and data +sufficiency on the performance of current, and future, neural PDE solvers. We +enumerate several evaluation metrics to help rank order the performance of +neural PDE solvers. We benchmark the performance of several baseline methods +including FNO, CNO, WNO, and DeepONet. + +
+
+
+
+
+ + ☆ An Adversarial Perspective on Machine Unlearning for AI Safety + + +
+ Large language models are finetuned to refuse questions about hazardous +knowledge, but these protections can often be bypassed. Unlearning methods aim +at completely removing hazardous capabilities from models and make them +inaccessible to adversaries. This work challenges the fundamental differences +between unlearning and traditional safety post-training from an adversarial +perspective. We demonstrate that existing jailbreak methods, previously +reported as ineffective against unlearning, can be successful when applied +carefully. Furthermore, we develop a variety of adaptive methods that recover +most supposedly unlearned capabilities. For instance, we show that finetuning +on 10 unrelated examples or removing specific directions in the activation +space can recover most hazardous capabilities for models edited with RMU, a +state-of-the-art unlearning method. Our findings challenge the robustness of +current unlearning approaches and question their advantages over safety +training. + +
+
+
+
+
+ + ☆ Spatiotemporal Learning on Cell-embedded Graphs + + +
+ Data-driven simulation of physical systems has recently kindled significant +attention, where many neural models have been developed. In particular, +mesh-based graph neural networks (GNNs) have demonstrated significant potential +in predicting spatiotemporal dynamics across arbitrary geometric domains. +However, the existing node-edge message passing mechanism in GNNs limits the +model's representation learning ability. In this paper, we proposed a +cell-embedded GNN model (aka CeGNN) to learn spatiotemporal dynamics with +lifted performance. Specifically, we introduce a learnable cell attribution to +the node-edge message passing process, which better captures the spatial +dependency of regional features. Such a strategy essentially upgrades the local +aggregation scheme from the first order (e.g., from edge to node) to a higher +order (e.g., from volume to edge and then to node), which takes advantage of +volumetric information in message passing. Meanwhile, a novel feature-enhanced +block is designed to further improve the performance of CeGNN and relieve the +over-smoothness problem, via treating the latent features as basis functions. +The extensive experiments on various PDE systems and one real-world dataset +demonstrate that CeGNN achieves superior performance compared with other +baseline models, particularly reducing the prediction error with up to 1 orders +of magnitude on several PDE systems. + +
+
+
+
+
+ + ☆ Safe Time-Varying Optimization based on Gaussian Processes with + Spatio-Temporal Kernel NeurIPS 2024 + + +
+ Ensuring safety is a key aspect in sequential decision making problems, such +as robotics or process control. The complexity of the underlying systems often +makes finding the optimal decision challenging, especially when the +safety-critical system is time-varying. Overcoming the problem of optimizing an +unknown time-varying reward subject to unknown time-varying safety constraints, +we propose TVSafeOpt, a new algorithm built on Bayesian optimization with a +spatio-temporal kernel. The algorithm is capable of safely tracking a +time-varying safe region without the need for explicit change detection. +Optimality guarantees are also provided for the algorithm when the optimization +problem becomes stationary. We show that TVSafeOpt compares favorably against +SafeOpt on synthetic data, both regarding safety and optimality. Evaluation on +a realistic case study with gas compressors confirms that TVSafeOpt ensures +safety when solving time-varying optimization problems with unknown reward and +safety functions. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless + Imaging NeurIPS 2024 + + +
+ Lensless cameras offer significant advantages in size, weight, and cost +compared to traditional lens-based systems. Without a focusing lens, lensless +cameras rely on computational algorithms to recover the scenes from multiplexed +measurements. However, current algorithms struggle with inaccurate forward +imaging models and insufficient priors to reconstruct high-quality images. To +overcome these limitations, we introduce a novel two-stage approach for +consistent and photorealistic lensless image reconstruction. The first stage of +our approach ensures data consistency by focusing on accurately reconstructing +the low-frequency content with a spatially varying deconvolution method that +adjusts to changes in the Point Spread Function (PSF) across the camera's field +of view. The second stage enhances photorealism by incorporating a generative +prior from pre-trained diffusion models. By conditioning on the low-frequency +content retrieved in the first stage, the diffusion model effectively +reconstructs the high-frequency details that are typically lost in the lensless +imaging process, while also maintaining image fidelity. Our method achieves a +superior balance between data fidelity and visual quality compared to existing +methods, as demonstrated with two popular lensless systems, PhlatCam and +DiffuserCam. Project website: https://phocolens.github.io/. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Joint Localization and Planning using Diffusion ICRA 2025 + + +
+ Diffusion models have been successfully applied to robotics problems such as +manipulation and vehicle path planning. In this work, we explore their +application to end-to-end navigation -- including both perception and planning +-- by considering the problem of jointly performing global localization and +path planning in known but arbitrary 2D environments. In particular, we +introduce a diffusion model which produces collision-free paths in a global +reference frame given an egocentric LIDAR scan, an arbitrary map, and a desired +goal position. To this end, we implement diffusion in the space of paths in +SE(2), and describe how to condition the denoising process on both obstacles +and sensor observations. In our evaluation, we show that the proposed +conditioning techniques enable generalization to realistic maps of considerably +different appearance than the training environment, demonstrate our model's +ability to accurately describe ambiguous solutions, and run extensive +simulation experiments showcasing our model's use as a real-time, end-to-end +localization and planning stack. + +
+
+ comment: 7 pages, 9 figures. Submitted to ICRA 2025, under review +
+
+
+
+
+ + ☆ LoopSR: Looping Sim-and-Real for Lifelong Policy Adaptation of Legged + Robots + + +
+ Reinforcement Learning (RL) has shown its remarkable and generalizable +capability in legged locomotion through sim-to-real transfer. However, while +adaptive methods like domain randomization are expected to make policy more +robust to diverse environments, such comprehensiveness potentially detracts +from the policy's performance in any specific environment according to the No +Free Lunch theorem, leading to a suboptimal solution once deployed in the real +world. To address this issue, we propose a lifelong policy adaptation framework +named LoopSR, which utilizes a transformer-based encoder to project real-world +trajectories into a latent space, and accordingly reconstruct the real-world +environments back in simulation for further improvement. Autoencoder +architecture and contrastive learning methods are adopted to better extract the +characteristics of real-world dynamics. The simulation parameters for continual +training are derived by combining predicted parameters from the decoder with +retrieved parameters from the simulation trajectory dataset. By leveraging the +continual training, LoopSR achieves superior data efficiency compared with +strong baselines, with only a limited amount of data to yield eminent +performance in both sim-to-sim and sim-to-real experiments. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Dimension-independent learning rates for high-dimensional classification + problems + + +
+ We study the problem of approximating and estimating classification functions +that have their decision boundary in the $RBV^2$ space. Functions of $RBV^2$ +type arise naturally as solutions of regularized neural network learning +problems and neural networks can approximate these functions without the curse +of dimensionality. We modify existing results to show that every $RBV^2$ +function can be approximated by a neural network with bounded weights. +Thereafter, we prove the existence of a neural network with bounded weights +approximating a classification function. And we leverage these bounds to +quantify the estimation rates. Finally, we present a numerical study that +analyzes the effect of different regularity conditions on the decision +boundaries. + +
+
+
+
+
+ + ☆ Supra-Laplacian Encoding for Transformer on Dynamic Graphs + + +
+ Fully connected Graph Transformers (GT) have rapidly become prominent in the +static graph community as an alternative to Message-Passing models, which +suffer from a lack of expressivity, oversquashing, and under-reaching. However, +in a dynamic context, by interconnecting all nodes at multiple snapshots with +self-attention, GT loose both structural and temporal information. In this +work, we introduce Supra-LAplacian encoding for spatio-temporal TransformErs +(SLATE), a new spatio-temporal encoding to leverage the GT architecture while +keeping spatio-temporal information. Specifically, we transform Discrete Time +Dynamic Graphs into multi-layer graphs and take advantage of the spectral +properties of their associated supra-Laplacian matrix. Our second contribution +explicitly model nodes' pairwise relationships with a cross-attention +mechanism, providing an accurate edge representation for dynamic link +prediction. SLATE outperforms numerous state-of-the-art methods based on +Message-Passing Graph Neural Networks combined with recurrent models (e.g +LSTM), and Dynamic Graph Transformers, on 9 datasets. Code and instructions to +reproduce our results will be open-sourced. + +
+
+
+
+
+ + ☆ Hypergame Theory for Decentralized Resource Allocation in Multi-user + Semantic Communications + + +
+ Semantic communications (SC) is an emerging communication paradigm in which +wireless devices can send only relevant information from a source of data while +relying on computing resources to regenerate missing data points. However, the +design of a multi-user SC system becomes more challenging because of the +computing and communication overhead required for coordination. Existing +solutions for learning the semantic language and performing resource allocation +often fail to capture the computing and communication tradeoffs involved in +multiuser SC. To address this gap, a novel framework for decentralized +computing and communication resource allocation in multiuser SC systems is +proposed. The challenge of efficiently allocating communication and computing +resources (for reasoning) in a decentralized manner to maximize the quality of +task experience for the end users is addressed through the application of +Stackelberg hyper game theory. Leveraging the concept of second-level hyper +games, novel analytical formulations are developed to model misperceptions of +the users about each other's communication and control strategies. Further, +equilibrium analysis of the learned resource allocation protocols examines the +convergence of the computing and communication strategies to a local +Stackelberg equilibria, considering misperceptions. Simulation results show +that the proposed Stackelberg hyper game results in efficient usage of +communication and computing resources while maintaining a high quality of +experience for the users compared to state-of-the-art that does not account for +the misperceptions. + +
+
+
+
+
+ + ☆ HydraViT: Stacking Heads for a Scalable ViT + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+
+
+
+ + ☆ BEATS: Optimizing LLM Mathematical Capabilities with BackVerify and + Adaptive Disambiguate based Efficient Tree Search + + +
+ Large Language Models (LLMs) have exhibited exceptional performance across a +broad range of tasks and domains. However, they still encounter difficulties in +solving mathematical problems due to the rigorous and logical nature of +mathematics. Previous studies have employed techniques such as supervised +fine-tuning (SFT), prompt engineering, and search-based methods to improve the +mathematical problem-solving abilities of LLMs. Despite these efforts, their +performance remains suboptimal and demands substantial computational resources. +To address this issue, we propose a novel approach, BEATS, to enhance +mathematical problem-solving abilities. Our method leverages newly designed +prompts that guide the model to iteratively rewrite, advance by one step, and +generate answers based on previous steps. Additionally, we introduce a new +back-verification technique that uses LLMs to validate the correctness of the +generated answers. Furthermore, we employ a pruning tree search to optimize +search time while achieving strong performance. Notably, our method improves +Qwen2-7b-Instruct's score from 36.94 to 61.52, outperforming GPT4's 42.5 on the +MATH benchmark. + +
+
+
+
+
+ + ☆ On Translating Technical Terminology: A Translation Workflow for + Machine-Translated Acronyms + + +
+ The typical workflow for a professional translator to translate a document +from its source language (SL) to a target language (TL) is not always focused +on what many language models in natural language processing (NLP) do - predict +the next word in a series of words. While high-resource languages like English +and French are reported to achieve near human parity using common metrics for +measurement such as BLEU and COMET, we find that an important step is being +missed: the translation of technical terms, specifically acronyms. Some +state-of-the art machine translation systems like Google Translate which are +publicly available can be erroneous when dealing with acronyms - as much as 50% +in our findings. This article addresses acronym disambiguation for MT systems +by proposing an additional step to the SL-TL (FR-EN) translation workflow where +we first offer a new acronym corpus for public consumption and then experiment +with a search-based thresholding algorithm that achieves nearly 10% increase +when compared to Google Translate and OpusMT. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Predicting Anchored Text from Translation Memories for Machine + Translation Using Deep Learning Methods + + +
+ Translation memories (TMs) are the backbone for professional translation +tools called computer-aided translation (CAT) tools. In order to perform a +translation using a CAT tool, a translator uses the TM to gather translations +similar to the desired segment to translate (s'). Many CAT tools offer a +fuzzy-match algorithm to locate segments (s) in the TM that are close in +distance to s'. After locating two similar segments, the CAT tool will present +parallel segments (s, t) that contain one segment in the source language along +with its translation in the target language. Additionally, CAT tools contain +fuzzy-match repair (FMR) techniques that will automatically use the parallel +segments from the TM to create new TM entries containing a modified version of +the original with the idea in mind that it will be the translation of s'. Most +FMR techniques use machine translation as a way of "repairing" those words that +have to be modified. In this article, we show that for a large part of those +words which are anchored, we can use other techniques that are based on machine +learning approaches such as Word2Vec. BERT, and even ChatGPT. Specifically, we +show that for anchored words that follow the continuous bag-of-words (CBOW) +paradigm, Word2Vec, BERT, and GPT-4 can be used to achieve similar and, for +some cases, better results than neural machine translation for translating +anchored words from French to English. + +
+
+ comment: AMTA 2024 - The Association for Machine Translation in the Americas + organizes biennial conferences devoted to researchers, commercial users, + governmental and NGO users +
+
+
+
+
+ + ☆ Adaptive Stream Processing on Edge Devices through Active Inference + + +
+ The current scenario of IoT is witnessing a constant increase on the volume +of data, which is generated in constant stream, calling for novel architectural +and logical solutions for processing it. Moving the data handling towards the +edge of the computing spectrum guarantees better distribution of load and, in +principle, lower latency and better privacy. However, managing such a structure +is complex, especially when requirements, also referred to Service Level +Objectives (SLOs), specified by applications' owners and infrastructure +managers need to be ensured. Despite the rich number of proposals of Machine +Learning (ML) based management solutions, researchers and practitioners yet +struggle to guarantee long-term prediction and control, and accurate +troubleshooting. Therefore, we present a novel ML paradigm based on Active +Inference (AIF) -- a concept from neuroscience that describes how the brain +constantly predicts and evaluates sensory information to decrease long-term +surprise. We implement it and evaluate it in a heterogeneous real stream +processing use case, where an AIF-based agent continuously optimizes the +fulfillment of three SLOs for three autonomous driving services running on +multiple devices. The agent used causal knowledge to gradually develop an +understanding of how its actions are related to requirements fulfillment, and +which configurations to favor. Through this approach, our agent requires up to +thirty iterations to converge to the optimal solution, showing the capability +of offering accurate results in a short amount of time. Furthermore, thanks to +AIF and its causal structures, our method guarantees full transparency on the +decision making, making the interpretation of the results and the +troubleshooting effortless. + +
+
+
+
+
+ + ☆ Sample compression unleashed : New generalization bounds for real valued + losses + + +
+ The sample compression theory provides generalization guarantees for +predictors that can be fully defined using a subset of the training dataset and +a (short) message string, generally defined as a binary sequence. Previous +works provided generalization bounds for the zero-one loss, which is +restrictive, notably when applied to deep learning approaches. In this paper, +we present a general framework for deriving new sample compression bounds that +hold for real-valued losses. We empirically demonstrate the tightness of the +bounds and their versatility by evaluating them on different types of models, +e.g., neural networks and decision forests, trained with the Pick-To-Learn +(P2L) meta-algorithm, which transforms the training method of any +machine-learning predictor to yield sample-compressed predictors. In contrast +to existing P2L bounds, ours are valid in the non-consistent case. + +
+
+
+
+
+ + ☆ Intelligent Energy Management: Remaining Useful Life Prediction and + Charging Automation System Comprised of Deep Learning and the Internet of + Things + + +
+ Remaining Useful Life (RUL) of battery is an important parameter to know the +battery's remaining life and need for recharge. The goal of this research +project is to develop machine learning-based models for the battery RUL +dataset. Different ML models are developed to classify the RUL of the vehicle, +and the IoT (Internet of Things) concept is simulated for automating the +charging system and managing any faults aligning. The graphs plotted depict the +relationship between various vehicle parameters using the Blynk IoT platform. +Results show that the catboost, Multi-Layer Perceptron (MLP), Gated Recurrent +Unit (GRU), and hybrid model developed could classify RUL into three classes +with 99% more accuracy. The data is fed using the tkinter GUI for simulating +artificial intelligence (AI)-based charging, and with a pyserial backend, data +can be entered into the Esp-32 microcontroller for making charge discharge +possible with the model's predictions. Also, with an IoT system, the charging +can be disconnected, monitored, and analyzed for automation. The results show +that an accuracy of 99% can be obtained on models MLP, catboost model and +similar accuracy on GRU model can be obtained, and finally relay-based +triggering can be made by prediction through the model used for automating the +charging and energy-saving mechanism. By showcasing an exemplary Blynk +platform-based monitoring and automation phenomenon, we further present +innovative ways of monitoring parameters and automating the system. + +
+
+
+
+
+ + ☆ Graph Reasoning with Large Language Models via Pseudo-code Prompting + + +
+ Large language models (LLMs) have recently achieved remarkable success in +various reasoning tasks in the field of natural language processing. This +success of LLMs has also motivated their use in graph-related tasks. Among +others, recent work has explored whether LLMs can solve graph problems such as +counting the number of connected components of a graph or computing the +shortest path distance between two nodes. Although LLMs possess preliminary +graph reasoning abilities, they might still struggle to solve some seemingly +simple problems. In this paper, we investigate whether prompting via +pseudo-code instructions can improve the performance of LLMs in solving graph +problems. Our experiments demonstrate that using pseudo-code instructions +generally improves the performance of all considered LLMs. The graphs, +pseudo-code prompts, and evaluation code are publicly available. + +
+
+
+
+
+ + ☆ Designing Short-Stage CDC-XPUFs: Balancing Reliability, Cost, and + Security in IoT Devices + + +
+ The rapid expansion of Internet of Things (IoT) devices demands robust and +resource-efficient security solutions. Physically Unclonable Functions (PUFs), +which generate unique cryptographic keys from inherent hardware variations, +offer a promising approach. However, traditional PUFs like Arbiter PUFs (APUFs) +and XOR Arbiter PUFs (XOR-PUFs) are susceptible to machine learning (ML) and +reliability-based attacks. In this study, we investigate +Component-Differentially Challenged XOR-PUFs (CDC-XPUFs), a less explored +variant, to address these vulnerabilities. We propose an optimized CDC-XPUF +design that incorporates a pre-selection strategy to enhance reliability and +introduces a novel lightweight architecture to reduce hardware overhead. +Rigorous testing demonstrates that our design significantly lowers resource +consumption, maintains strong resistance to ML attacks, and improves +reliability, effectively mitigating reliability-based attacks. These results +highlight the potential of CDC-XPUFs as a secure and efficient candidate for +widespread deployment in resource-constrained IoT systems. + +
+
+
+
+
+ + ☆ Model-Free versus Model-Based Reinforcement Learning for Fixed-Wing UAV + Attitude Control Under Varying Wind Conditions + + +
+ This paper evaluates and compares the performance of model-free and +model-based reinforcement learning for the attitude control of fixed-wing +unmanned aerial vehicles using PID as a reference point. The comparison focuses +on their ability to handle varying flight dynamics and wind disturbances in a +simulated environment. Our results show that the Temporal Difference Model +Predictive Control agent outperforms both the PID controller and other +model-free reinforcement learning methods in terms of tracking accuracy and +robustness over different reference difficulties, particularly in nonlinear +flight regimes. Furthermore, we introduce actuation fluctuation as a key metric +to assess energy efficiency and actuator wear, and we test two different +approaches from the literature: action variation penalty and conditioning for +action policy smoothness. We also evaluate all control methods when subject to +stochastic turbulence and gusts separately, so as to measure their effects on +tracking performance, observe their limitations and outline their implications +on the Markov decision process formalism. + +
+
+ comment: Published at ICINCO 2024 +
+
+
+
+
+ + ☆ A multi-source data power load forecasting method using attention + mechanism-based parallel cnn-gru + + +
+ Accurate power load forecasting is crucial for improving energy efficiency +and ensuring power supply quality. Considering the power load forecasting +problem involves not only dynamic factors like historical load variations but +also static factors such as climate conditions that remain constant over +specific periods. From the model-agnostic perspective, this paper proposes a +parallel structure network to extract important information from both dynamic +and static data. Firstly, based on complexity learning theory, it is +demonstrated that models integrated through parallel structures exhibit +superior generalization abilities compared to individual base learners. +Additionally, the higher the independence between base learners, the stronger +the generalization ability of the parallel structure model. This suggests that +the structure of machine learning models inherently contains significant +information. Building on this theoretical foundation, a parallel convolutional +neural network (CNN)-gate recurrent unit (GRU) attention model (PCGA) is +employed to address the power load forecasting issue, aiming to effectively +integrate the influences of dynamic and static features. The CNN module is +responsible for capturing spatial characteristics from static data, while the +GRU module captures long-term dependencies in dynamic time series data. The +attention layer is designed to focus on key information from the +spatial-temporal features extracted by the parallel CNN-GRU. To substantiate +the advantages of the parallel structure model in extracting and integrating +multi-source information, a series of experiments are conducted. + +
+
+
+
+
+ + ☆ A method for identifying causality in the response of nonlinear + dynamical systems + + +
+ Predicting the response of nonlinear dynamical systems subject to random, +broadband excitation is important across a range of scientific disciplines, +such as structural dynamics and neuroscience. Building data-driven models +requires experimental measurements of the system input and output, but it can +be difficult to determine whether inaccuracies in the model stem from modelling +errors or noise. This paper presents a novel method to identify the causal +component of the input-output data from measurements of a system in the +presence of output noise, as a function of frequency, without needing a high +fidelity model. An output prediction, calculated using an available model, is +optimally combined with noisy measurements of the output to predict the input +to the system. The parameters of the algorithm balance the two output signals +and are utilised to calculate a nonlinear coherence metric as a measure of +causality. This method is applicable to a broad class of nonlinear dynamical +systems. There are currently no solutions to this problem in the absence of a +complete benchmark model. + +
+
+
+
+
+ + ☆ Efficient Arbitrary Precision Acceleration for Large Language Models on + GPU Tensor Cores + + +
+ Large language models (LLMs) have been widely applied but face challenges in +efficient inference. While quantization methods reduce computational demands, +ultra-low bit quantization with arbitrary precision is hindered by limited GPU +Tensor Core support and inefficient memory management, leading to suboptimal +acceleration. To address these challenges, we propose a comprehensive +acceleration scheme for arbitrary precision LLMs. At its core, we introduce a +novel bipolar-INT data format that facilitates parallel computing and supports +symmetric quantization, effectively reducing data redundancy. Building on this, +we implement an arbitrary precision matrix multiplication scheme that +decomposes and recovers matrices at the bit level, enabling flexible precision +while maximizing GPU Tensor Core utilization. Furthermore, we develop an +efficient matrix preprocessing method that optimizes data layout for subsequent +computations. Finally, we design a data recovery-oriented memory management +system that strategically utilizes fast shared memory, significantly enhancing +kernel execution speed and minimizing memory access latency. Experimental +results demonstrate our approach's effectiveness, with up to 13\times speedup +in matrix multiplication compared to NVIDIA's CUTLASS. When integrated into +LLMs, we achieve up to 6.7\times inference acceleration. These improvements +significantly enhance LLM inference efficiency, enabling broader and more +responsive applications of LLMs. + +
+
+
+
+
+ + ☆ Implementing a Nordic-Baltic Federated Health Data Network: a case + report + + +
+ Background: Centralized collection and processing of healthcare data across +national borders pose significant challenges, including privacy concerns, data +heterogeneity and legal barriers. To address some of these challenges, we +formed an interdisciplinary consortium to develop a feder-ated health data +network, comprised of six institutions across five countries, to facilitate +Nordic-Baltic cooperation on secondary use of health data. The objective of +this report is to offer early insights into our experiences developing this +network. Methods: We used a mixed-method ap-proach, combining both experimental +design and implementation science to evaluate the factors affecting the +implementation of our network. Results: Technically, our experiments indicate +that the network functions without significant performance degradation compared +to centralized simu-lation. Conclusion: While use of interdisciplinary +approaches holds a potential to solve challeng-es associated with establishing +such collaborative networks, our findings turn the spotlight on the uncertain +regulatory landscape playing catch up and the significant operational costs. + +
+
+ comment: 24 pages (including appendices), 1 figure +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios RecSys '24 + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ How Feature Learning Can Improve Neural Scaling Laws + + +
+ We develop a solvable model of neural scaling laws beyond the kernel limit. +Theoretical analysis of this model shows how performance scales with model +size, training time, and the total amount of available data. We identify three +scaling regimes corresponding to varying task difficulties: hard, easy, and +super easy tasks. For easy and super-easy target functions, which lie in the +reproducing kernel Hilbert space (RKHS) defined by the initial infinite-width +Neural Tangent Kernel (NTK), the scaling exponents remain unchanged between +feature learning and kernel regime models. For hard tasks, defined as those +outside the RKHS of the initial NTK, we demonstrate both analytically and +empirically that feature learning can improve scaling with training time and +compute, nearly doubling the exponent for hard tasks. This leads to a different +compute optimal strategy to scale parameters and training time in the feature +learning regime. We support our finding that feature learning improves the +scaling law for hard tasks but not for easy and super-easy tasks with +experiments of nonlinear MLPs fitting functions with power-law Fourier spectra +on the circle and CNNs learning vision tasks. + +
+
+
+
+
+ + ☆ AMARO: All Heavy-Atom Transferable Neural Network Potentials of Protein + Thermodynamics + + +
+ All-atom molecular simulations offer detailed insights into macromolecular +phenomena, but their substantial computational cost hinders the exploration of +complex biological processes. We introduce Advanced Machine-learning Atomic +Representation Omni-force-field (AMARO), a new neural network potential (NNP) +that combines an O(3)-equivariant message-passing neural network architecture, +TensorNet, with a coarse-graining map that excludes hydrogen atoms. AMARO +demonstrates the feasibility of training coarser NNP, without prior energy +terms, to run stable protein dynamics with scalability and generalization +capabilities. + +
+
+
+
+
+ + ☆ Machine Learning-based vs Deep Learning-based Anomaly Detection in + Multivariate Time Series for Spacecraft Attitude Sensors SP + + +
+ In the framework of Failure Detection, Isolation and Recovery (FDIR) on +spacecraft, new AI-based approaches are emerging in the state of the art to +overcome the limitations commonly imposed by traditional threshold checking. + The present research aims at characterizing two different approaches to the +problem of stuck values detection in multivariate time series coming from +spacecraft attitude sensors. The analysis reveals the performance differences +in the two approaches, while commenting on their interpretability and +generalization to different scenarios. + +
+
+ comment: Accepted for the ESA SPAICE Conference 2024 +
+
+
+
+
+ + ☆ Language Models as Zero-shot Lossless Gradient Compressors: Towards + General Neural Parameter Prior Models NeurIPS 2024 + + +
+ Despite the widespread use of statistical prior models in various fields, +such models for neural network gradients have long been overlooked. The +inherent challenge stems from their high-dimensional structures and complex +interdependencies, which complicate effective modeling. In this work, we +demonstrate the potential of large language models (LLMs) to act as gradient +priors in a zero-shot setting. We examine the property by considering lossless +gradient compression -- a critical application in distributed learning -- that +depends heavily on precise probability modeling. To achieve this, we introduce +LM-GC, a novel method that integrates LLMs with arithmetic coding. Our +technique converts plain gradients into text-like formats, enhancing token +efficiency by up to 38 times compared to their plain representations. We ensure +that this data conversion maintains a close alignment with the structure of +plain gradients and the symbols commonly recognized by LLMs. Our experiments +indicate that LM-GC surpasses existing state-of-the-art lossless compression +methods, improving compression rates by 10\% up to 17.2\% across various +datasets and architectures. Additionally, our approach shows promising +compatibility with lossy compression techniques such as quantization and +sparsification. These findings highlight the significant potential of LLMs as a +model for effectively handling gradients. We will release the source code upon +publication. + +
+
+ comment: To appear in NeurIPS 2024 +
+
+
+
+
+ + ☆ Ordinary Differential Equations for Enhanced 12-Lead ECG Generation + + +
+ In the realm of artificial intelligence, the generation of realistic training +data for supervised learning tasks presents a significant challenge. This is +particularly true in the synthesis of electrocardiograms (ECGs), where the +objective is to develop a synthetic 12-lead ECG model. The primary complexity +of this task stems from accurately modeling the intricate biological and +physiological interactions among different ECG leads. Although mathematical +process simulators have shed light on these dynamics, effectively incorporating +this understanding into generative models is not straightforward. In this work, +we introduce an innovative method that employs ordinary differential equations +(ODEs) to enhance the fidelity of generating 12-lead ECG data. This approach +integrates a system of ODEs that represent cardiac dynamics directly into the +generative model's optimization process, allowing for the production of +biologically plausible ECG training data that authentically reflects real-world +variability and inter-lead dependencies. We conducted an empirical analysis of +thousands of ECGs and found that incorporating cardiac simulation insights into +the data generation process significantly improves the accuracy of heart +abnormality classifiers trained on this synthetic 12-lead ECG data. + +
+
+
+
+
+ + ☆ Physics-aligned Schrödinger bridge + + +
+ The reconstruction of physical fields from sparse measurements is pivotal in +both scientific research and engineering applications. Traditional methods are +increasingly supplemented by deep learning models due to their efficacy in +extracting features from data. However, except for the low accuracy on complex +physical systems, these models often fail to comply with essential physical +constraints, such as governing equations and boundary conditions. To overcome +this limitation, we introduce a novel data-driven field reconstruction +framework, termed the Physics-aligned Schr\"{o}dinger Bridge (PalSB). This +framework leverages a diffusion Schr\"{o}dinger bridge mechanism that is +specifically tailored to align with physical constraints. The PalSB approach +incorporates a dual-stage training process designed to address both local +reconstruction mapping and global physical principles. Additionally, a +boundary-aware sampling technique is implemented to ensure adherence to +physical boundary conditions. We demonstrate the effectiveness of PalSB through +its application to three complex nonlinear systems: cylinder flow from Particle +Image Velocimetry experiments, two-dimensional turbulence, and a +reaction-diffusion system. The results reveal that PalSB not only achieves +higher accuracy but also exhibits enhanced compliance with physical constraints +compared to existing methods. This highlights PalSB's capability to generate +high-quality representations of intricate physical interactions, showcasing its +potential for advancing field reconstruction techniques. + +
+
+
+
+
+ + ☆ Generative Modeling of Molecular Dynamics Trajectories NeurIPS 2024 + + +
+ Molecular dynamics (MD) is a powerful technique for studying microscopic +phenomena, but its computational cost has driven significant interest in the +development of deep learning-based surrogate models. We introduce generative +modeling of molecular trajectories as a paradigm for learning flexible +multi-task surrogate models of MD from data. By conditioning on appropriately +chosen frames of the trajectory, we show such generative models can be adapted +to diverse tasks such as forward simulation, transition path sampling, and +trajectory upsampling. By alternatively conditioning on part of the molecular +system and inpainting the rest, we also demonstrate the first steps towards +dynamics-conditioned molecular design. We validate the full set of these +capabilities on tetrapeptide simulations and show that our model can produce +reasonable ensembles of protein monomers. Altogether, our work illustrates how +generative modeling can unlock value from MD data towards diverse downstream +tasks that are not straightforward to address with existing methods or even MD +itself. Code is available at https://github.com/bjing2016/mdgen. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Continual learning with task specialist + + +
+ Continual learning (CL) adapt the deep learning scenarios with timely updated +datasets. However, existing CL models suffer from the catastrophic forgetting +issue, where new knowledge replaces past learning. In this paper, we propose +Continual Learning with Task Specialists (CLTS) to address the issues of +catastrophic forgetting and limited labelled data in real-world datasets by +performing class incremental learning of the incoming stream of data. The model +consists of Task Specialists (T S) and Task Predictor (T P ) with pre-trained +Stable Diffusion (SD) module. Here, we introduce a new specialist to handle a +new task sequence and each T S has three blocks; i) a variational autoencoder +(V AE) to learn the task distribution in a low dimensional latent space, ii) a +K-Means block to perform data clustering and iii) Bootstrapping Language-Image +Pre-training (BLIP ) model to generate a small batch of captions from the input +data. These captions are fed as input to the pre-trained stable diffusion model +(SD) for the generation of task samples. The proposed model does not store any +task samples for replay, instead uses generated samples from SD to train the T +P module. A comparison study with four SOTA models conducted on three +real-world datasets shows that the proposed model outperforms all the selected +baselines + +
+
+
+
+
+ + ☆ Enriched Functional Tree-Based Classifiers: A Novel Approach Leveraging + Derivatives and Geometric Features + + +
+ The positioning of this research falls within the scalar-on-function +classification literature, a field of significant interest across various +domains, particularly in statistics, mathematics, and computer science. This +study introduces an advanced methodology for supervised classification by +integrating Functional Data Analysis (FDA) with tree-based ensemble techniques +for classifying high-dimensional time series. The proposed framework, Enriched +Functional Tree-Based Classifiers (EFTCs), leverages derivative and geometric +features, benefiting from the diversity inherent in ensemble methods to further +enhance predictive performance and reduce variance. While our approach has been +tested on the enrichment of Functional Classification Trees (FCTs), Functional +K-NN (FKNN), Functional Random Forest (FRF), Functional XGBoost (FXGB), and +Functional LightGBM (FLGBM), it could be extended to other tree-based and +non-tree-based classifiers, with appropriate considerations emerging from this +investigation. Through extensive experimental evaluations on seven real-world +datasets and six simulated scenarios, this proposal demonstrates fascinating +improvements over traditional approaches, providing new insights into the +application of FDA in complex, high-dimensional learning problems. + +
+
+
+
+
+ + ☆ CASPFormer: Trajectory Prediction from BEV Images with Deformable + Attention ICPR 2024 + + +
+ Motion prediction is an important aspect for Autonomous Driving (AD) and +Advance Driver Assistance Systems (ADAS). Current state-of-the-art motion +prediction methods rely on High Definition (HD) maps for capturing the +surrounding context of the ego vehicle. Such systems lack scalability in +real-world deployment as HD maps are expensive to produce and update in +real-time. To overcome this issue, we propose Context Aware Scene Prediction +Transformer (CASPFormer), which can perform multi-modal motion prediction from +rasterized Bird-Eye-View (BEV) images. Our system can be integrated with any +upstream perception module that is capable of generating BEV images. Moreover, +CASPFormer directly decodes vectorized trajectories without any postprocessing. +Trajectories are decoded recurrently using deformable attention, as it is +computationally efficient and provides the network with the ability to focus +its attention on the important spatial locations of the BEV images. In +addition, we also address the issue of mode collapse for generating multiple +scene-consistent trajectories by incorporating learnable mode queries. We +evaluate our model on the nuScenes dataset and show that it reaches +state-of-the-art across multiple metrics + +
+
+ comment: Under Review at ICPR 2024, Kolkata +
+
+
+
+
+ + ☆ Predicting the Stay Length of Patients in Hospitals using Convolutional + Gated Recurrent Deep Learning Model + + +
+ Predicting hospital length of stay (LoS) stands as a critical factor in +shaping public health strategies. This data serves as a cornerstone for +governments to discern trends, patterns, and avenues for enhancing healthcare +delivery. In this study, we introduce a robust hybrid deep learning model, a +combination of Multi-layer Convolutional (CNNs) deep learning, Gated Recurrent +Units (GRU), and Dense neural networks, that outperforms 11 conventional and +state-of-the-art Machine Learning (ML) and Deep Learning (DL) methodologies in +accurately forecasting inpatient hospital stay duration. Our investigation +delves into the implementation of this hybrid model, scrutinising variables +like geographic indicators tied to caregiving institutions, demographic markers +encompassing patient ethnicity, race, and age, as well as medical attributes +such as the CCS diagnosis code, APR DRG code, illness severity metrics, and +hospital stay duration. Statistical evaluations reveal the pinnacle LoS +accuracy achieved by our proposed model (CNN-GRU-DNN), which averages at 89% +across a 10-fold cross-validation test, surpassing LSTM, BiLSTM, GRU, and +Convolutional Neural Networks (CNNs) by 19%, 18.2%, 18.6%, and 7%, +respectively. Accurate LoS predictions not only empower hospitals to optimise +resource allocation and curb expenses associated with prolonged stays but also +pave the way for novel strategies in hospital stay management. This avenue +holds promise for catalysing advancements in healthcare research and +innovation, inspiring a new era of precision-driven healthcare practices. + +
+
+
+
+
+ + ☆ Confidence intervals uncovered: Are we ready for real-world medical + imaging AI? MICCAI 2024 + + +
+ Medical imaging is spearheading the AI transformation of healthcare. +Performance reporting is key to determine which methods should be translated +into clinical practice. Frequently, broad conclusions are simply derived from +mean performance values. In this paper, we argue that this common practice is +often a misleading simplification as it ignores performance variability. Our +contribution is threefold. (1) Analyzing all MICCAI segmentation papers (n = +221) published in 2023, we first observe that more than 50\% of papers do not +assess performance variability at all. Moreover, only one (0.5\%) paper +reported confidence intervals (CIs) for model performance. (2) To address the +reporting bottleneck, we show that the unreported standard deviation (SD) in +segmentation papers can be approximated by a second-order polynomial function +of the mean Dice similarity coefficient (DSC). Based on external validation +data from 56 previous MICCAI challenges, we demonstrate that this approximation +can accurately reconstruct the CI of a method using information provided in +publications. (3) Finally, we reconstructed 95\% CIs around the mean DSC of +MICCAI 2023 segmentation papers. The median CI width was 0.03 which is three +times larger than the median performance gap between the first and second +ranked method. For more than 60\% of papers, the mean performance of the +second-ranked method was within the CI of the first-ranked method. We conclude +that current publications typically do not provide sufficient evidence to +support which models could potentially be translated into clinical practice. + +
+
+ comment: Paper accepted at MICCAI 2024 conference +
+
+
+
+
+ + ☆ Byzantine-Robust Aggregation for Securing Decentralized Federated + Learning + + +
+ Federated Learning (FL) emerges as a distributed machine learning approach +that addresses privacy concerns by training AI models locally on devices. +Decentralized Federated Learning (DFL) extends the FL paradigm by eliminating +the central server, thereby enhancing scalability and robustness through the +avoidance of a single point of failure. However, DFL faces significant +challenges in optimizing security, as most Byzantine-robust algorithms proposed +in the literature are designed for centralized scenarios. In this paper, we +present a novel Byzantine-robust aggregation algorithm to enhance the security +of Decentralized Federated Learning environments, coined WFAgg. This proposal +handles the adverse conditions and strength robustness of dynamic decentralized +topologies at the same time by employing multiple filters to identify and +mitigate Byzantine attacks. Experimental results demonstrate the effectiveness +of the proposed algorithm in maintaining model accuracy and convergence in the +presence of various Byzantine attack scenarios, outperforming state-of-the-art +centralized Byzantine-robust aggregation schemes (such as Multi-Krum or +Clustering). These algorithms are evaluated on an IID image classification +problem in both centralized and decentralized scenarios. + +
+
+ comment: 18 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ Few-shot Pairwise Rank Prompting: An Effective Non-Parametric Retrieval + Model EMNLP 2024 + + +
+ A supervised ranking model, despite its advantage of being effective, usually +involves complex processing - typically multiple stages of task-specific +pre-training and fine-tuning. This has motivated researchers to explore simpler +pipelines leveraging large language models (LLMs) that are capable of working +in a zero-shot manner. However, since zero-shot inference does not make use of +a training set of pairs of queries and their relevant documents, its +performance is mostly worse than that of supervised models, which are trained +on such example pairs. Motivated by the existing findings that training +examples generally improve zero-shot performance, in our work, we explore if +this also applies to ranking models. More specifically, given a query and a +pair of documents, the preference prediction task is improved by augmenting +examples of preferences for similar queries from a training set. Our proposed +pairwise few-shot ranker demonstrates consistent improvements over the +zero-shot baseline on both in-domain (TREC DL) and out-domain (BEIR subset) +retrieval benchmarks. Our method also achieves a close performance to that of a +supervised model without requiring any complex training pipeline. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ☆ Autoregressive Generation Strategies for Top-K Sequential + Recommendations + + +
+ The goal of modern sequential recommender systems is often formulated in +terms of next-item prediction. In this paper, we explore the applicability of +generative transformer-based models for the Top-K sequential recommendation +task, where the goal is to predict items a user is likely to interact with in +the "near future". + We explore commonly used autoregressive generation strategies, including +greedy decoding, beam search, and temperature sampling, to evaluate their +performance for the Top-K sequential recommendation task. In addition, we +propose novel Reciprocal Rank Aggregation (RRA) and Relevance Aggregation (RA) +generation strategies based on multi-sequence generation with temperature +sampling and subsequent aggregation. + Experiments on diverse datasets give valuable insights regarding commonly +used strategies' applicability and show that suggested approaches improve +performance on longer time horizons compared to widely-used Top-K prediction +approach and single-sequence autoregressive generation strategies. + +
+
+
+
+
+ + ☆ Recent advances in interpretable machine learning using structure-based + protein representations + + +
+ Recent advancements in machine learning (ML) are transforming the field of +structural biology. For example, AlphaFold, a groundbreaking neural network for +protein structure prediction, has been widely adopted by researchers. The +availability of easy-to-use interfaces and interpretable outcomes from the +neural network architecture, such as the confidence scores used to color the +predicted structures, have made AlphaFold accessible even to non-ML experts. In +this paper, we present various methods for representing protein 3D structures +from low- to high-resolution, and show how interpretable ML methods can support +tasks such as predicting protein structures, protein function, and +protein-protein interactions. This survey also emphasizes the significance of +interpreting and visualizing ML-based inference for structure-based protein +representations that enhance interpretability and knowledge discovery. +Developing such interpretable approaches promises to further accelerate fields +including drug development and protein design. + +
+
+
+
+
+ + ☆ QuForge: A Library for Qudits Simulation + + +
+ Quantum computing with qudits, an extension of qubits to multiple levels, is +a research field less mature than qubit-based quantum computing. However, +qudits can offer some advantages over qubits, by representing information with +fewer separated components. In this article, we present QuForge, a Python-based +library designed to simulate quantum circuits with qudits. This library +provides the necessary quantum gates for implementing quantum algorithms, +tailored to any chosen qudit dimension. Built on top of differentiable +frameworks, QuForge supports execution on accelerating devices such as GPUs and +TPUs, significantly speeding up simulations. It also supports sparse +operations, leading to a reduction in memory consumption compared to other +libraries. Additionally, by constructing quantum circuits as differentiable +graphs, QuForge facilitates the implementation of quantum machine learning +algorithms, enhancing the capabilities and flexibility of quantum computing +research. + +
+
+ comment: 18 pages, 7 figures +
+
+
+
+
+ + ☆ Efficient Pointwise-Pairwise Learning-to-Rank for News Recommendation + + +
+ News recommendation is a challenging task that involves personalization based +on the interaction history and preferences of each user. Recent works have +leveraged the power of pretrained language models (PLMs) to directly rank news +items by using inference approaches that predominately fall into three +categories: pointwise, pairwise, and listwise learning-to-rank. While pointwise +methods offer linear inference complexity, they fail to capture crucial +comparative information between items that is more effective for ranking tasks. +Conversely, pairwise and listwise approaches excel at incorporating these +comparisons but suffer from practical limitations: pairwise approaches are +either computationally expensive or lack theoretical guarantees, and listwise +methods often perform poorly in practice. In this paper, we propose a novel +framework for PLM-based news recommendation that integrates both pointwise +relevance prediction and pairwise comparisons in a scalable manner. We present +a rigorous theoretical analysis of our framework, establishing conditions under +which our approach guarantees improved performance. Extensive experiments show +that our approach outperforms the state-of-the-art methods on the MIND and +Adressa news recommendation datasets. + +
+
+
+
+
+ + ☆ Transfer Learning in $\ell_1$ Regularized Regression: Hyperparameter + Selection Strategy based on Sharp Asymptotic Analysis + + +
+ Transfer learning techniques aim to leverage information from multiple +related datasets to enhance prediction quality against a target dataset. Such +methods have been adopted in the context of high-dimensional sparse regression, +and some Lasso-based algorithms have been invented: Trans-Lasso and Pretraining +Lasso are such examples. These algorithms require the statistician to select +hyperparameters that control the extent and type of information transfer from +related datasets. However, selection strategies for these hyperparameters, as +well as the impact of these choices on the algorithm's performance, have been +largely unexplored. To address this, we conduct a thorough, precise study of +the algorithm in a high-dimensional setting via an asymptotic analysis using +the replica method. Our approach reveals a surprisingly simple behavior of the +algorithm: Ignoring one of the two types of information transferred to the +fine-tuning stage has little effect on generalization performance, implying +that efforts for hyperparameter selection can be significantly reduced. Our +theoretical findings are also empirically supported by real-world applications +on the IMDb dataset. + +
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ☆ PGN: The RNN's New Successor is Effective for Long-Range Time Series + Forecasting + + +
+ Due to the recurrent structure of RNN, the long information propagation path +poses limitations in capturing long-term dependencies, gradient +explosion/vanishing issues, and inefficient sequential execution. Based on +this, we propose a novel paradigm called Parallel Gated Network (PGN) as the +new successor to RNN. PGN directly captures information from previous time +steps through the designed Historical Information Extraction (HIE) layer and +leverages gated mechanisms to select and fuse it with the current time step +information. This reduces the information propagation path to $\mathcal{O}(1)$, +effectively addressing the limitations of RNN. To enhance PGN's performance in +long-range time series forecasting tasks, we propose a novel temporal modeling +framework called Temporal PGN (TPGN). TPGN incorporates two branches to +comprehensively capture the semantic information of time series. One branch +utilizes PGN to capture long-term periodic patterns while preserving their +local characteristics. The other branch employs patches to capture short-term +information and aggregate the global representation of the series. TPGN +achieves a theoretical complexity of $\mathcal{O}(\sqrt{L})$, ensuring +efficiency in its operations. Experimental results on five benchmark datasets +demonstrate the state-of-the-art (SOTA) performance and high efficiency of +TPGN, further confirming the effectiveness of PGN as the new successor to RNN +in long-range time series forecasting. The code is available in this +repository: \url{https://github.com/Water2sea/TPGN}. + +
+
+
+
+
+ + ☆ MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard + for Prompt Attacks + + +
+ The proliferation of Large Language Models (LLMs) in diverse applications +underscores the pressing need for robust security measures to thwart potential +jailbreak attacks. These attacks exploit vulnerabilities within LLMs, endanger +data integrity and user privacy. Guardrails serve as crucial protective +mechanisms against such threats, but existing models often fall short in terms +of both detection accuracy, and computational efficiency. This paper advocates +for the significance of jailbreak attack prevention on LLMs, and emphasises the +role of input guardrails in safeguarding these models. We introduce MoJE +(Mixture of Jailbreak Expert), a novel guardrail architecture designed to +surpass current limitations in existing state-of-the-art guardrails. By +employing simple linguistic statistical techniques, MoJE excels in detecting +jailbreak attacks while maintaining minimal computational overhead during model +inference. Through rigorous experimentation, MoJE demonstrates superior +performance capable of detecting 90% of the attacks without compromising benign +prompts, enhancing LLMs security against jailbreak attacks. + +
+
+
+
+
+ + ☆ MIO: A Foundation Model on Multimodal Tokens + + +
+ In this paper, we introduce MIO, a novel foundation model built on multimodal +tokens, capable of understanding and generating speech, text, images, and +videos in an end-to-end, autoregressive manner. While the emergence of large +language models (LLMs) and multimodal large language models (MM-LLMs) propels +advancements in artificial general intelligence through their versatile +capabilities, they still lack true any-to-any understanding and generation. +Recently, the release of GPT-4o has showcased the remarkable potential of +any-to-any LLMs for complex real-world tasks, enabling omnidirectional input +and output across images, speech, and text. However, it is closed-source and +does not support the generation of multimodal interleaved sequences. To address +this gap, we present MIO, which is trained on a mixture of discrete tokens +across four modalities using causal multimodal modeling. MIO undergoes a +four-stage training process: (1) alignment pre-training, (2) interleaved +pre-training, (3) speech-enhanced pre-training, and (4) comprehensive +supervised fine-tuning on diverse textual, visual, and speech tasks. Our +experimental results indicate that MIO exhibits competitive, and in some cases +superior, performance compared to previous dual-modal baselines, any-to-any +model baselines, and even modality-specific baselines. Moreover, MIO +demonstrates advanced capabilities inherent to its any-to-any feature, such as +interleaved video-text generation, chain-of-visual-thought reasoning, visual +guideline generation, instructional image editing, etc. + +
+
+ comment: Technical Report. Codes and models will be available soon +
+
+
+
+
+ + ☆ Efficient Bias Mitigation Without Privileged Information ECCV + 2024 + + +
+ Deep neural networks trained via empirical risk minimisation often exhibit +significant performance disparities across groups, particularly when group and +task labels are spuriously correlated (e.g., "grassy background" and "cows"). +Existing bias mitigation methods that aim to address this issue often either +rely on group labels for training or validation, or require an extensive +hyperparameter search. Such data and computational requirements hinder the +practical deployment of these methods, especially when datasets are too large +to be group-annotated, computational resources are limited, and models are +trained through already complex pipelines. In this paper, we propose Targeted +Augmentations for Bias Mitigation (TAB), a simple hyperparameter-free framework +that leverages the entire training history of a helper model to identify +spurious samples, and generate a group-balanced training set from which a +robust model can be trained. We show that TAB improves worst-group performance +without any group information or model selection, outperforming existing +methods while maintaining overall accuracy. + +
+
+ comment: Accepted at the 18th European Conference on Computer Vision (ECCV + 2024) as an Oral presentation +
+
+
+
+
+ + ☆ Graph Edit Distance with General Costs Using Neural Set Divergence NeurIPS 2024 + + +
+ Graph Edit Distance (GED) measures the (dis-)similarity between two given +graphs, in terms of the minimum-cost edit sequence that transforms one graph to +the other. However, the exact computation of GED is NP-Hard, which has recently +motivated the design of neural methods for GED estimation. However, they do not +explicitly account for edit operations with different costs. In response, we +propose GRAPHEDX, a neural GED estimator that can work with general costs +specified for the four edit operations, viz., edge deletion, edge addition, +node deletion and node addition. We first present GED as a quadratic assignment +problem (QAP) that incorporates these four costs. Then, we represent each graph +as a set of node and edge embeddings and use them to design a family of neural +set divergence surrogates. We replace the QAP terms corresponding to each +operation with their surrogates. Computing such neural set divergence require +aligning nodes and edges of the two graphs. We learn these alignments using a +Gumbel-Sinkhorn permutation generator, additionally ensuring that the node and +edge alignments are consistent with each other. Moreover, these alignments are +cognizant of both the presence and absence of edges between node-pairs. +Experiments on several datasets, under a variety of edit cost settings, show +that GRAPHEDX consistently outperforms state-of-the-art methods and heuristics +in terms of prediction error. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ☆ Artificial Data Point Generation in Clustered Latent Space for Small + Medical Datasets + + +
+ One of the growing trends in machine learning is the use of data generation +techniques, since the performance of machine learning models is dependent on +the quantity of the training dataset. However, in many medical applications, +collecting large datasets is challenging due to resource constraints, which +leads to overfitting and poor generalization. This paper introduces a novel +method, Artificial Data Point Generation in Clustered Latent Space (AGCL), +designed to enhance classification performance on small medical datasets +through synthetic data generation. The AGCL framework involves feature +extraction, K-means clustering, cluster evaluation based on a class separation +metric, and the generation of synthetic data points from clusters with distinct +class representations. This method was applied to Parkinson's disease +screening, utilizing facial expression data, and evaluated across multiple +machine learning classifiers. Experimental results demonstrate that AGCL +significantly improves classification accuracy compared to baseline, GN and +kNNMTD. AGCL achieved the highest overall test accuracy of 83.33% and +cross-validation accuracy of 90.90% in majority voting over different emotions, +confirming its effectiveness in augmenting small datasets. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Preserving logical and functional dependencies in synthetic tabular data + + +
+ Dependencies among attributes are a common aspect of tabular data. However, +whether existing tabular data generation algorithms preserve these dependencies +while generating synthetic data is yet to be explored. In addition to the +existing notion of functional dependencies, we introduce the notion of logical +dependencies among the attributes in this article. Moreover, we provide a +measure to quantify logical dependencies among attributes in tabular data. +Utilizing this measure, we compare several state-of-the-art synthetic data +generation algorithms and test their capability to preserve logical and +functional dependencies on several publicly available datasets. We demonstrate +that currently available synthetic tabular data generation algorithms do not +fully preserve functional dependencies when they generate synthetic datasets. +In addition, we also showed that some tabular synthetic data generation models +can preserve inter-attribute logical dependencies. Our review and comparison of +the state-of-the-art reveal research needs and opportunities to develop +task-specific synthetic tabular data generation models. + +
+
+ comment: Submitted to Pattern Recognition Journal +
+
+
+
+
+ + ☆ Optimal Memorization Capacity of Transformers + + +
+ Recent research in the field of machine learning has increasingly focused on +the memorization capacity of Transformers, but how efficient they are is not +yet well understood. We demonstrate that Transformers can memorize labels with +$\tilde{O}(\sqrt{N})$ parameters in a next-token prediction setting for $N$ +input sequences of length $n$, which is proved to be optimal up to logarithmic +factors. This indicates that Transformers can efficiently perform memorization +with little influence from the input length $n$ owing to the benefit of +parameter sharing. We also analyze the memorization capacity in the +sequence-to-sequence setting, and find that $\tilde{O}(\sqrt{nN})$ parameters +are not only sufficient, but also necessary at least for Transformers with +hardmax. These results suggest that while self-attention mechanisms can +efficiently identify input sequences, the feed-forward network becomes a +bottleneck when associating a label to each token. + +
+
+
+
+
+ + ☆ Explanation Bottleneck Models + + +
+ Recent concept-based interpretable models have succeeded in providing +meaningful explanations by pre-defined concept sets. However, the dependency on +the pre-defined concepts restricts the application because of the limited +number of concepts for explanations. This paper proposes a novel interpretable +deep neural network called explanation bottleneck models (XBMs). XBMs generate +a text explanation from the input without pre-defined concepts and then predict +a final task prediction based on the generated explanation by leveraging +pre-trained vision-language encoder-decoder models. To achieve both the target +task performance and the explanation quality, we train XBMs through the target +task loss with the regularization penalizing the explanation decoder via the +distillation from the frozen pre-trained decoder. Our experiments, including a +comparison to state-of-the-art concept bottleneck models, confirm that XBMs +provide accurate and fluent natural language explanations without pre-defined +concept sets. Code will be available at https://github.com/yshinya6/xbm/. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Efficient Fairness-Performance Pareto Front Computation + + +
+ There is a well known intrinsic trade-off between the fairness of a +representation and the performance of classifiers derived from the +representation. Due to the complexity of optimisation algorithms in most modern +representation learning approaches, for a given method it may be non-trivial to +decide whether the obtained fairness-performance curve of the method is +optimal, i.e., whether it is close to the true Pareto front for these +quantities for the underlying data distribution. + In this paper we propose a new method to compute the optimal Pareto front, +which does not require the training of complex representation models. We show +that optimal fair representations possess several useful structural properties, +and that these properties enable a reduction of the computation of the Pareto +Front to a compact discrete problem. We then also show that these compact +approximating problems can be efficiently solved via off-the shelf +concave-convex programming methods. + Since our approach is independent of the specific model of representations, +it may be used as the benchmark to which representation learning algorithms may +be compared. We experimentally evaluate the approach on a number of real world +benchmark datasets. + +
+
+
+
+
+ + ☆ FlowMAC: Conditional Flow Matching for Audio Coding at Low Bit Rates ICASSP 2025 + + +
+ This paper introduces FlowMAC, a novel neural audio codec for high-quality +general audio compression at low bit rates based on conditional flow matching +(CFM). FlowMAC jointly learns a mel spectrogram encoder, quantizer and decoder. +At inference time the decoder integrates a continuous normalizing flow via an +ODE solver to generate a high-quality mel spectrogram. This is the first time +that a CFM-based approach is applied to general audio coding, enabling a +scalable, simple and memory efficient training. Our subjective evaluations show +that FlowMAC at 3 kbps achieves similar quality as state-of-the-art GAN-based +and DDPM-based neural audio codecs at double the bit rate. Moreover, FlowMAC +offers a tunable inference pipeline, which permits to trade off complexity and +quality. This enables real-time coding on CPU, while maintaining high +perceptual quality. + +
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Model-Free Stochastic Process Modeling and Optimization using + Normalizing Flows + + +
+ Real-world chemical processes often exhibit stochastic dynamics with +non-trivial correlations and state-dependent fluctuations. However, most +process models simply add stationary noise terms to a deterministic prediction, +which can lead to inaccurate predictions. This work proposes using conditional +normalizing flows as discrete-time models (DTMs) to learn the stochastic +dynamics of chemical processes. Normalizing flows learn an explicit expression +of the system states' probability density function (PDF) given prior states and +control inputs. The resulting model naturally allows for formulating stochastic +and probabilistic setpoint-tracking objectives and chance constraints. In +applications to a continuous reactor and a reactor cascade, the normalizing +flow yields stable simulations over long time horizons and high-quality results +in stochastic and probabilistic MPC formulation for open-loop control. +Furthermore, a chance-constrained optimization finds reliable startup controls +for the reactor cascade with stochastic reactions. In conclusion, the +conditional normalizing flow presents an excellent choice for modeling +nonlinear stochastic dynamics. + +
+
+ comment: 13 pages, 7 Figures, 5 Tables +
+
+
+
+
+ + ☆ Convolutional Signal Propagation: A Simple Scalable Algorithm for + Hypergraphs + + +
+ Last decade has seen the emergence of numerous methods for learning on +graphs, particularly Graph Neural Networks (GNNs). These methods, however, are +often not directly applicable to more complex structures like bipartite graphs +(equivalent to hypergraphs), which represent interactions among two entity +types (e.g. a user liking a movie). This paper proposes Convolutional Signal +Propagation (CSP), a non-parametric simple and scalable method that natively +operates on bipartite graphs (hypergraphs) and can be implemented with just a +few lines of code. After defining CSP, we demonstrate its relationship with +well-established methods like label propagation, Naive Bayes, and Hypergraph +Convolutional Networks. We evaluate CSP against several reference methods on +real-world datasets from multiple domains, focusing on retrieval and +classification tasks. Our results show that CSP offers competitive performance +while maintaining low computational complexity, making it an ideal first choice +as a baseline for hypergraph node classification and retrieval. Moreover, +despite operating on hypergraphs, CSP achieves good results in tasks typically +not associated with hypergraphs, such as natural language processing. + +
+
+
+
+
+ + ☆ Benign or Not-Benign Overfitting in Token Selection of Attention + Mechanism + + +
+ Modern over-parameterized neural networks can be trained to fit the training +data perfectly while still maintaining a high generalization performance. This +"benign overfitting" phenomenon has been studied in a surge of recent +theoretical work; however, most of these studies have been limited to linear +models or two-layer neural networks. In this work, we analyze benign +overfitting in the token selection mechanism of the attention architecture, +which characterizes the success of transformer models. We first show the +existence of a benign overfitting solution and explain its mechanism in the +attention architecture. Next, we discuss whether the model converges to such a +solution, raising the difficulties specific to the attention architecture. We +then present benign overfitting cases and not-benign overfitting cases by +conditioning different scenarios based on the behavior of attention +probabilities during training. To the best of our knowledge, this is the first +study to characterize benign overfitting for the attention mechanism. + +
+
+
+
+
+ + ☆ Neural P$^3$M: A Long-Range Interaction Modeling Enhancer for Geometric + GNNs NeurIPS 2024 + + +
+ Geometric graph neural networks (GNNs) have emerged as powerful tools for +modeling molecular geometry. However, they encounter limitations in effectively +capturing long-range interactions in large molecular systems. To address this +challenge, we introduce Neural P$^3$M, a versatile enhancer of geometric GNNs +to expand the scope of their capabilities by incorporating mesh points +alongside atoms and reimaging traditional mathematical operations in a +trainable manner. Neural P$^3$M exhibits flexibility across a wide range of +molecular systems and demonstrates remarkable accuracy in predicting energies +and forces, outperforming on benchmarks such as the MD22 dataset. It also +achieves an average improvement of 22% on the OE62 dataset while integrating +with various architectures. + +
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic dataset that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. + +
+
+
+
+
+ + ☆ Good Data Is All Imitation Learning Needs + + +
+ In this paper, we address the limitations of traditional teacher-student +models, imitation learning, and behaviour cloning in the context of +Autonomous/Automated Driving Systems (ADS), where these methods often struggle +with incomplete coverage of real-world scenarios. To enhance the robustness of +such models, we introduce the use of Counterfactual Explanations (CFEs) as a +novel data augmentation technique for end-to-end ADS. CFEs, by generating +training samples near decision boundaries through minimal input modifications, +lead to a more comprehensive representation of expert driver strategies, +particularly in safety-critical scenarios. This approach can therefore help +improve the model's ability to handle rare and challenging driving events, such +as anticipating darting out pedestrians, ultimately leading to safer and more +trustworthy decision-making for ADS. Our experiments in the CARLA simulator +demonstrate that CF-Driver outperforms the current state-of-the-art method, +achieving a higher driving score and lower infraction rates. Specifically, +CF-Driver attains a driving score of 84.2, surpassing the previous best model +by 15.02 percentage points. These results highlight the effectiveness of +incorporating CFEs in training end-to-end ADS. To foster further research, the +CF-Driver code is made publicly available. + +
+
+
+
+
+ + ☆ RmGPT: Rotating Machinery Generative Pretrained Model + + +
+ In industry, the reliability of rotating machinery is critical for production +efficiency and safety. Current methods of Prognostics and Health Management +(PHM) often rely on task-specific models, which face significant challenges in +handling diverse datasets with varying signal characteristics, fault modes and +operating conditions. Inspired by advancements in generative pretrained models, +we propose RmGPT, a unified model for diagnosis and prognosis tasks. RmGPT +introduces a novel token-based framework, incorporating Signal Tokens, Prompt +Tokens, Time-Frequency Task Tokens and Fault Tokens to handle heterogeneous +data within a unified model architecture. We leverage self-supervised learning +for robust feature extraction and introduce a next signal token prediction +pretraining strategy, alongside efficient prompt learning for task-specific +adaptation. Extensive experiments demonstrate that RmGPT significantly +outperforms state-of-the-art algorithms, achieving near-perfect accuracy in +diagnosis tasks and exceptionally low errors in prognosis tasks. Notably, RmGPT +excels in few-shot learning scenarios, achieving 92% accuracy in 16-class +one-shot experiments, highlighting its adaptability and robustness. This work +establishes RmGPT as a powerful PHM foundation model for rotating machinery, +advancing the scalability and generalizability of PHM solutions. + +
+
+
+
+
+ + ☆ Deep Manifold Part 1: Anatomy of Neural Network Manifold + + +
+ Based on the numerical manifold method principle, we developed a mathematical +framework of a neural network manifold: Deep Manifold and discovered that +neural networks: 1) is numerical computation combining forward and inverse; 2) +have near infinite degrees of freedom; 3) exponential learning capacity with +depth; 4) have self-progressing boundary conditions; 5) has training hidden +bottleneck. We also define two concepts: neural network learning space and deep +manifold space and introduce two concepts: neural network intrinsic pathway and +fixed point. We raise three fundamental questions: 1). What is the training +completion definition; 2). where is the deep learning convergence point (neural +network fixed point); 3). How important is token timestamp in training data +given negative time is critical in inverse problem. + +
+
+
+
+
+ + ☆ Conjugate Bayesian Two-step Change Point Detection for Hawkes Process NeurIPS 2024 + + +
+ The Bayesian two-step change point detection method is popular for the Hawkes +process due to its simplicity and intuitiveness. However, the non-conjugacy +between the point process likelihood and the prior requires most existing +Bayesian two-step change point detection methods to rely on non-conjugate +inference methods. These methods lack analytical expressions, leading to low +computational efficiency and impeding timely change point detection. To address +this issue, this work employs data augmentation to propose a conjugate Bayesian +two-step change point detection method for the Hawkes process, which proves to +be more accurate and efficient. Extensive experiments on both synthetic and +real data demonstrate the superior effectiveness and efficiency of our method +compared to baseline methods. Additionally, we conduct ablation studies to +explore the robustness of our method concerning various hyperparameters. Our +code is publicly available at https://github.com/Aurora2050/CoBay-CPD. + +
+
+ comment: 10 pages, accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Multimodal Banking Dataset: Understanding Client Needs through Event + Sequences + + +
+ Financial organizations collect a huge amount of data about clients that +typically has a temporal (sequential) structure and is collected from various +sources (modalities). Due to privacy issues, there are no large-scale +open-source multimodal datasets of event sequences, which significantly limits +the research in this area. In this paper, we present the industrial-scale +publicly available multimodal banking dataset, MBD, that contains more than +1.5M corporate clients with several modalities: 950M bank transactions, 1B geo +position events, 5M embeddings of dialogues with technical support and monthly +aggregated purchases of four bank's products. All entries are properly +anonymized from real proprietary bank data. Using this dataset, we introduce a +novel benchmark with two business tasks: campaigning (purchase prediction in +the next month) and matching of clients. We provide numerical results that +demonstrate the superiority of our multi-modal baselines over single-modal +techniques for each task. As a result, the proposed dataset can open new +perspectives and facilitate the future development of practically important +large-scale multimodal algorithms for event sequences. + HuggingFace Link: https://huggingface.co/datasets/ai-lab/MBD + Github Link: https://github.com/Dzhambo/MBD + +
+
+
+
+
+ + ☆ Let the Quantum Creep In: Designing Quantum Neural Network Models by + Gradually Swapping Out Classical Components + + +
+ Artificial Intelligence (AI), with its multiplier effect and wide +applications in multiple areas, could potentially be an important application +of quantum computing. Since modern AI systems are often built on neural +networks, the design of quantum neural networks becomes a key challenge in +integrating quantum computing into AI. To provide a more fine-grained +characterisation of the impact of quantum components on the performance of +neural networks, we propose a framework where classical neural network layers +are gradually replaced by quantum layers that have the same type of input and +output while keeping the flow of information between layers unchanged, +different from most current research in quantum neural network, which favours +an end-to-end quantum model. We start with a simple three-layer classical +neural network without any normalisation layers or activation functions, and +gradually change the classical layers to the corresponding quantum versions. We +conduct numerical experiments on image classification datasets such as the +MNIST, FashionMNIST and CIFAR-10 datasets to demonstrate the change of +performance brought by the systematic introduction of quantum components. +Through this framework, our research sheds new light on the design of future +quantum neural network models where it could be more favourable to search for +methods and frameworks that harness the advantages from both the classical and +quantum worlds. + +
+
+ comment: 50 pages (including Appendix), many figures, accepted as a poster on + QTML2024. Code available at + https://github.com/peiyong-addwater/Let-The-Quantum-Creep-In +
+
+
+
+
+ + ☆ Multiplicative Logit Adjustment Approximates Neural-Collapse-Aware + Decision Boundary Adjustment + + +
+ Real-world data distributions are often highly skewed. This has spurred a +growing body of research on long-tailed recognition to address this imbalance +in training classification models. Among the methods studied, multiplicative +logit adjustment (MLA) stands out as a simple and effective method. However, it +lacks theoretical guarantees, which raises concerns about the optimality of its +adjustment method. We provide a theoretical justification for the effectiveness +of MLA with the following two-step theory. First, we develop a theory that +adjusts optimal decision boundaries by estimating feature spread on the basis +of neural collapse. Then, we demonstrate that MLA approximates this optimal +method. Additionally, through experiments on long-tailed datasets, we +illustrate the practical usefulness of MLA under more realistic conditions. We +also offer experimental insights to guide the tuning of MLA's hyperparameters. + +
+
+
+
+
+ + ☆ Derandomizing Multi-Distribution Learning + + +
+ Multi-distribution or collaborative learning involves learning a single +predictor that works well across multiple data distributions, using samples +from each during training. Recent research on multi-distribution learning, +focusing on binary loss and finite VC dimension classes, has shown near-optimal +sample complexity that is achieved with oracle efficient algorithms. That is, +these algorithms are computationally efficient given an efficient ERM for the +class. Unlike in classical PAC learning, where the optimal sample complexity is +achieved with deterministic predictors, current multi-distribution learning +algorithms output randomized predictors. This raises the question: can these +algorithms be derandomized to produce a deterministic predictor for multiple +distributions? Through a reduction to discrepancy minimization, we show that +derandomizing multi-distribution learning is computationally hard, even when +ERM is computationally efficient. On the positive side, we identify a +structural condition enabling an efficient black-box reduction, converting +existing randomized multi-distribution predictors into deterministic ones. + +
+
+
+
+
+ + ☆ Pixel-Space Post-Training of Latent Diffusion Models + + +
+ Latent diffusion models (LDMs) have made significant advancements in the +field of image generation in recent years. One major advantage of LDMs is their +ability to operate in a compressed latent space, allowing for more efficient +training and deployment. However, despite these advantages, challenges with +LDMs still remain. For example, it has been observed that LDMs often generate +high-frequency details and complex compositions imperfectly. We hypothesize +that one reason for these flaws is due to the fact that all pre- and +post-training of LDMs are done in latent space, which is typically $8 \times 8$ +lower spatial-resolution than the output images. To address this issue, we +propose adding pixel-space supervision in the post-training process to better +preserve high-frequency details. Experimentally, we show that adding a +pixel-space objective significantly improves both supervised quality +fine-tuning and preference-based post-training by a large margin on a +state-of-the-art DiT transformer and U-Net diffusion models in both visual +quality and visual flaw metrics, while maintaining the same text alignment +quality. + +
+
+
+
+
+ + ☆ Joint Source-Channel Coding: Fundamentals and Recent Progress in + Practical Designs + + +
+ Semantic- and task-oriented communication has emerged as a promising approach +to reducing the latency and bandwidth requirements of next-generation mobile +networks by transmitting only the most relevant information needed to complete +a specific task at the receiver. This is particularly advantageous for +machine-oriented communication of high data rate content, such as images and +videos, where the goal is rapid and accurate inference, rather than perfect +signal reconstruction. While semantic- and task-oriented compression can be +implemented in conventional communication systems, joint source-channel coding +(JSCC) offers an alternative end-to-end approach by optimizing compression and +channel coding together, or even directly mapping the source signal to the +modulated waveform. Although all digital communication systems today rely on +separation, thanks to its modularity, JSCC is known to achieve higher +performance in finite blocklength scenarios, and to avoid cliff and the +levelling-off effects in time-varying channel scenarios. This article provides +an overview of the information theoretic foundations of JSCC, surveys practical +JSCC designs over the decades, and discusses the reasons for their limited +adoption in practical systems. We then examine the recent resurgence of JSCC, +driven by the integration of deep learning techniques, particularly through +DeepJSCC, highlighting its many surprising advantages in various scenarios. +Finally, we discuss why it may be time to reconsider today's strictly separate +architectures, and reintroduce JSCC to enable high-fidelity, low-latency +communications in critical applications such as autonomous driving, drone +surveillance, or wearable systems. + +
+
+ comment: Under review for possible publication +
+
+
+
+
+ + ☆ Advancing Open-Set Domain Generalization Using Evidential Bi-Level + Hardest Domain Scheduler NeurIPS 2024 + + +
+ In Open-Set Domain Generalization (OSDG), the model is exposed to both new +variations of data appearance (domains) and open-set conditions, where both +known and novel categories are present at test time. The challenges of this +task arise from the dual need to generalize across diverse domains and +accurately quantify category novelty, which is critical for applications in +dynamic environments. Recently, meta-learning techniques have demonstrated +superior results in OSDG, effectively orchestrating the meta-train and -test +tasks by employing varied random categories and predefined domain partition +strategies. These approaches prioritize a well-designed training schedule over +traditional methods that focus primarily on data augmentation and the +enhancement of discriminative feature learning. The prevailing meta-learning +models in OSDG typically utilize a predefined sequential domain scheduler to +structure data partitions. However, a crucial aspect that remains inadequately +explored is the influence brought by strategies of domain schedulers during +training. In this paper, we observe that an adaptive domain scheduler benefits +more in OSDG compared with prefixed sequential and random domain schedulers. We +propose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve +an adaptive domain scheduler. This method strategically sequences domains by +assessing their reliabilities in utilizing a follower network, trained with +confidence scores learned in an evidential manner, regularized by max rebiasing +discrepancy, and optimized in a bi-level manner. The results show that our +method substantially improves OSDG performance and achieves more discriminative +embeddings for both the seen and unseen categories. The source code will be +available at https://github.com/KPeng9510/EBiL-HaDS. + +
+
+ comment: Accepted to NeurIPS 2024. The source code will be available at + https://github.com/KPeng9510/EBiL-HaDS +
+
+
+
+
+ + ☆ A Simple but Strong Baseline for Sounding Video Generation: Effective + Adaptation of Audio and Video Diffusion Models for Joint Generation + + +
+ In this work, we build a simple but strong baseline for sounding video +generation. Given base diffusion models for audio and video, we integrate them +with additional modules into a single model and train it to make the model +jointly generate audio and video. To enhance alignment between audio-video +pairs, we introduce two novel mechanisms in our model. The first one is +timestep adjustment, which provides different timestep information to each base +model. It is designed to align how samples are generated along with timesteps +across modalities. The second one is a new design of the additional modules, +termed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE, +cross-modal information is embedded as if it represents temporal position +information, and the embeddings are fed into the model like positional +encoding. Compared with the popular cross-attention mechanism, CMC-PE provides +a better inductive bias for temporal alignment in the generated data. +Experimental results validate the effectiveness of the two newly introduced +mechanisms and also demonstrate that our method outperforms existing methods. + +
+
+ comment: The source code will be released soon +
+
+
+
+
+ + ☆ MASSFormer: Mobility-Aware Spectrum Sensing using Transformer-Driven + Tiered Structure + + +
+ In this paper, we develop a novel mobility-aware transformer-driven tiered +structure (MASSFormer) based cooperative spectrum sensing method that +effectively models the spatio-temporal dynamics of user movements. Unlike +existing methods, our method considers a dynamic scenario involving mobile +primary users (PUs) and secondary users (SUs)and addresses the complexities +introduced by user mobility. The transformer architecture utilizes an attention +mechanism, enabling the proposed method to adeptly model the temporal dynamics +of user mobility by effectively capturing long-range dependencies within the +input data. The proposed method first computes tokens from the sequence of +covariance matrices (CMs) for each SU and processes them in parallel using the +SUtransformer network to learn the spatio-temporal features at SUlevel. +Subsequently, the collaborative transformer network learns the group-level PU +state from all SU-level feature representations. The attention-based sequence +pooling method followed by the transformer encoder adjusts the contributions of +all tokens. The main goal of predicting the PU states at each SU-level and +group-level is to improve detection performance even more. We conducted a +sufficient amount of simulations and compared the detection performance of +different SS methods. The proposed method is tested under imperfect reporting +channel scenarios to show robustness. The efficacy of our method is validated +with the simulation results demonstrating its higher performance compared with +existing methods in terms of detection probability, sensing error, and +classification accuracy. + +
+
+
+
+
+ + ☆ Modulated Intervention Preference Optimization (MIPO): Keey the Easy, + Refine the Difficult AAAI 2025 + + +
+ Preference optimization methods typically begin training with a well-trained +SFT model as a reference model. In RLHF and DPO, a regularization term is used +during the preference optimization process to prevent the policy model from +deviating too far from the reference model's distribution, thereby avoiding the +generation of anomalous responses. When the reference model is already +well-aligned with the given data or only requires slight adjustments, this +approach can produce a well-aligned model. However, if the reference model is +not aligned with the given data and requires significant deviation from its +current state, a regularization term may actually hinder the model alignment. +In this study, we propose \textbf{Modulated Intervention Preference +Optimization (MIPO)} to address this issue. MIPO modulates the degree of +intervention from the reference model based on how well the given data is +aligned with it. If the data is well-aligned, the intervention is increased to +prevent the policy model from diverging significantly from reference model. +Conversely, if the alignment is poor, the interference is reduced to facilitate +more extensive training. We compare the performance of MIPO and DPO using +Mistral-7B and Llama3-8B in Alpaca Eval 2.0 and MT-Bench. The experimental +results demonstrate that MIPO consistently outperforms DPO across various +evaluation scenarios. + +
+
+ comment: 8pages, submitted to AAAI 2025 +
+
+
+
+
+ + ☆ Optimizing the Induced Correlation in Omnibus Joint Graph Embeddings + + +
+ Theoretical and empirical evidence suggests that joint graph embedding +algorithms induce correlation across the networks in the embedding space. In +the Omnibus joint graph embedding framework, previous results explicitly +delineated the dual effects of the algorithm-induced and model-inherent +correlations on the correlation across the embedded networks. Accounting for +and mitigating the algorithm-induced correlation is key to subsequent +inference, as sub-optimal Omnibus matrix constructions have been demonstrated +to lead to loss in inference fidelity. This work presents the first efforts to +automate the Omnibus construction in order to address two key questions in this +joint embedding framework: the correlation-to-OMNI problem and the flat +correlation problem. In the flat correlation problem, we seek to understand the +minimum algorithm-induced flat correlation (i.e., the same across all graph +pairs) produced by a generalized Omnibus embedding. Working in a subspace of +the fully general Omnibus matrices, we prove both a lower bound for this flat +correlation and that the classical Omnibus construction induces the maximal +flat correlation. In the correlation-to-OMNI problem, we present an algorithm +-- named corr2Omni -- that, from a given matrix of estimated pairwise graph +correlations, estimates the matrix of generalized Omnibus weights that induces +optimal correlation in the embedding space. Moreover, in both simulated and +real data settings, we demonstrate the increased effectiveness of our corr2Omni +algorithm versus the classical Omnibus construction. + +
+
+ comment: 34 pages, 8 figures +
+
+
+
+
+ + ☆ On the Implicit Relation Between Low-Rank Adaptation and Differential + Privacy + + +
+ A significant approach in natural language processing involves large-scale +pre-training on general domain data followed by adaptation to specific tasks or +domains. As models grow in size, full fine-tuning all parameters becomes +increasingly impractical. To address this, some methods for low-rank task +adaptation of language models have been proposed, e.g. LoRA and FLoRA. These +methods keep the pre-trained model weights fixed and incorporate trainable +low-rank decomposition matrices into some layers of the transformer +architecture, called adapters. This approach significantly reduces the number +of trainable parameters required for downstream tasks compared to full +fine-tuning all parameters. In this work, we look at low-rank adaptation from +the lens of data privacy. We show theoretically that the low-rank adaptation +used in LoRA and FLoRA is equivalent to injecting some random noise into the +batch gradients w.r.t the adapter parameters coming from their full +fine-tuning, and we quantify the variance of the injected noise. By +establishing a Berry-Esseen type bound on the total variation distance between +the noise distribution and a Gaussian distribution with the same variance, we +show that the dynamics of LoRA and FLoRA are very close to differentially +private full fine-tuning the adapters, which suggests that low-rank adaptation +implicitly provides privacy w.r.t the fine-tuning data. Finally, using +Johnson-Lindenstrauss lemma, we show that when augmented with gradient +clipping, low-rank adaptation is almost equivalent to differentially private +full fine-tuning adapters with a fixed noise scale. + +
+
+
+
+
+ + ☆ Dataset Distillation-based Hybrid Federated Learning on Non-IID Data + + +
+ In federated learning, the heterogeneity of client data has a great impact on +the performance of model training. Many heterogeneity issues in this process +are raised by non-independently and identically distributed (Non-IID) data. +This study focuses on the issue of label distribution skew. To address it, we +propose a hybrid federated learning framework called HFLDD, which integrates +dataset distillation to generate approximately independent and equally +distributed (IID) data, thereby improving the performance of model training. +Particularly, we partition the clients into heterogeneous clusters, where the +data labels among different clients within a cluster are unbalanced while the +data labels among different clusters are balanced. The cluster headers collect +distilled data from the corresponding cluster members, and conduct model +training in collaboration with the server. This training process is like +traditional federated learning on IID data, and hence effectively alleviates +the impact of Non-IID data on model training. Furthermore, we compare our +proposed method with typical baseline methods on public datasets. Experimental +results demonstrate that when the data labels are severely imbalanced, the +proposed HFLDD outperforms the baseline methods in terms of both test accuracy +and communication cost. + +
+
+
+
+
+ + ☆ Functional Classification of Spiking Signal Data Using Artificial + Intelligence Techniques: A Review + + +
+ Human brain neuron activities are incredibly significant nowadays. Neuronal +behavior is assessed by analyzing signal data such as electroencephalography +(EEG), which can offer scientists valuable information about diseases and +human-computer interaction. One of the difficulties researchers confront while +evaluating these signals is the existence of large volumes of spike data. +Spikes are some considerable parts of signal data that can happen as a +consequence of vital biomarkers or physical issues such as electrode movements. +Hence, distinguishing types of spikes is important. From this spot, the spike +classification concept commences. Previously, researchers classified spikes +manually. The manual classification was not precise enough as it involves +extensive analysis. Consequently, Artificial Intelligence (AI) was introduced +into neuroscience to assist clinicians in classifying spikes correctly. This +review discusses the importance and use of AI in spike classification, focusing +on the recognition of neural activity noises. The task is divided into three +main components: preprocessing, classification, and evaluation. Existing +methods are introduced and their importance is determined. The review also +highlights the need for more efficient algorithms. The primary goal is to +provide a perspective on spike classification for future research and provide a +comprehensive understanding of the methodologies and issues involved. The +review organizes materials in the spike classification field for future +studies. In this work, numerous studies were extracted from different +databases. The PRISMA-related research guidelines were then used to choose +papers. Then, research studies based on spike classification using machine +learning and deep learning approaches with effective preprocessing were +selected. + +
+
+ comment: 8 figures, 32 pages +
+
+
+
+
+ + ☆ Comparing Unidirectional, Bidirectional, and Word2vec Models for + Discovering Vulnerabilities in Compiled Lifted Code + + +
+ Ransomware and other forms of malware cause significant financial and +operational damage to organizations by exploiting long-standing and often +difficult-to-detect software vulnerabilities. To detect vulnerabilities such as +buffer overflows in compiled code, this research investigates the application +of unidirectional transformer-based embeddings, specifically GPT-2. Using a +dataset of LLVM functions, we trained a GPT-2 model to generate embeddings, +which were subsequently used to build LSTM neural networks to differentiate +between vulnerable and non-vulnerable code. Our study reveals that embeddings +from the GPT-2 model significantly outperform those from bidirectional models +of BERT and RoBERTa, achieving an accuracy of 92.5% and an F1-score of 89.7%. +LSTM neural networks were developed with both frozen and unfrozen embedding +model layers. The model with the highest performance was achieved when the +embedding layers were unfrozen. Further, the research finds that, in exploring +the impact of different optimizers within this domain, the SGD optimizer +demonstrates superior performance over Adam. Overall, these findings reveal +important insights into the potential of unidirectional transformer-based +approaches in enhancing cybersecurity defenses. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ NeuroPath: A Neural Pathway Transformer for Joining the Dots of Human + Connectomes NeurIPS 2024 + + +
+ Although modern imaging technologies allow us to study connectivity between +two distinct brain regions in-vivo, an in-depth understanding of how anatomical +structure supports brain function and how spontaneous functional fluctuations +emerge remarkable cognition is still elusive. Meanwhile, tremendous efforts +have been made in the realm of machine learning to establish the nonlinear +mapping between neuroimaging data and phenotypic traits. However, the absence +of neuroscience insight in the current approaches poses significant challenges +in understanding cognitive behavior from transient neural activities. To +address this challenge, we put the spotlight on the coupling mechanism of +structural connectivity (SC) and functional connectivity (FC) by formulating +such network neuroscience question into an expressive graph representation +learning problem for high-order topology. Specifically, we introduce the +concept of topological detour to characterize how a ubiquitous instance of FC +(direct link) is supported by neural pathways (detour) physically wired by SC, +which forms a cyclic loop interacted by brain structure and function. In the +clich\'e of machine learning, the multi-hop detour pathway underlying SC-FC +coupling allows us to devise a novel multi-head self-attention mechanism within +Transformer to capture multi-modal feature representation from paired graphs of +SC and FC. Taken together, we propose a biological-inspired deep model, coined +as NeuroPath, to find putative connectomic feature representations from the +unprecedented amount of neuroimages, which can be plugged into various +downstream applications such as task recognition and disease diagnosis. We have +evaluated NeuroPath on large-scale public datasets including HCP and UK Biobank +under supervised and zero-shot learning, where the state-of-the-art performance +by our NeuroPath indicates great potential in network neuroscience. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization, recent advances primarily focus on improving the LLM +components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE that leverages a well-designed router with a mixture of +projection experts at the connector, Uni-Med achieves efficient solution to the +tug-of-war problem and can perform six different medical tasks including +question answering, visual question answering, report generation, referring +expression comprehension, referring expression generation and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector. Extensive ablation experiments +validate the effectiveness of introducing CMoE under any configuration, with up +to an average 8% performance gains. We further provide interpretation analysis +of the tug-of-war problem from the perspective of gradient optimization and +parameter statistics. Compared to previous state-of-the-art medical MLLMs, +Uni-Med achieves competitive or superior evaluation metrics on diverse tasks. +Code, data and model will be soon available at GitHub. + +
+
+
+
+
+ + ☆ Sequential Kernelized Stein Discrepancy + + +
+ We present a sequential version of the kernelized Stein discrepancy, which +allows for conducting goodness-of-fit tests for unnormalized densities that are +continuously monitored and adaptively stopped. That is, the sample size need +not be fixed prior to data collection; the practitioner can choose whether to +stop the test or continue to gather evidence at any time while controlling the +false discovery rate. In stark contrast to related literature, we do not impose +uniform boundedness on the Stein kernel. Instead, we exploit the potential +boundedness of the Stein kernel at arbitrary point evaluations to define test +martingales, that give way to the subsequent novel sequential tests. We prove +the validity of the test, as well as an asymptotic lower bound for the +logarithmic growth of the wealth process under the alternative. We further +illustrate the empirical performance of the test with a variety of +distributions, including restricted Boltzmann machines. + +
+
+
+
+
+ + ☆ HaloScope: Harnessing Unlabeled LLM Generations for Hallucination + Detection NeurIPS 2024 + + +
+ The surge in applications of large language models (LLMs) has prompted +concerns about the generation of misleading or fabricated information, known as +hallucinations. Therefore, detecting hallucinations has become critical to +maintaining trust in LLM-generated content. A primary challenge in learning a +truthfulness classifier is the lack of a large amount of labeled truthful and +hallucinated data. To address the challenge, we introduce HaloScope, a novel +learning framework that leverages the unlabeled LLM generations in the wild for +hallucination detection. Such unlabeled data arises freely upon deploying LLMs +in the open world, and consists of both truthful and hallucinated information. +To harness the unlabeled data, we present an automated membership estimation +score for distinguishing between truthful and untruthful generations within +unlabeled mixture data, thereby enabling the training of a binary truthfulness +classifier on top. Importantly, our framework does not require extra data +collection and human annotations, offering strong flexibility and practicality +for real-world applications. Extensive experiments show that HaloScope can +achieve superior hallucination detection performance, outperforming the +competitive rivals by a significant margin. Code is available at +https://github.com/deeplearningwisc/haloscope. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ☆ Broadcast Product: Shape-aligned Element-wise Multiplication and Beyond + + +
+ We propose a new operator defined between two tensors, the broadcast product. +The broadcast product calculates the Hadamard product after duplicating +elements to align the shapes of the two tensors. Complex tensor operations in +libraries like \texttt{numpy} can be succinctly represented as mathematical +expressions using the broadcast product. Finally, we propose a novel tensor +decomposition using the broadcast product, highlighting its potential +applications in dimensionality reduction. + +
+
+
+
+
+ + ☆ Does Worst-Performing Agent Lead the Pack? Analyzing Agent Dynamics in + Unified Distributed SGD NeurIPS 2024 + + +
+ Distributed learning is essential to train machine learning algorithms across +heterogeneous agents while maintaining data privacy. We conduct an asymptotic +analysis of Unified Distributed SGD (UD-SGD), exploring a variety of +communication patterns, including decentralized SGD and local SGD within +Federated Learning (FL), as well as the increasing communication interval in +the FL setting. In this study, we assess how different sampling strategies, +such as i.i.d. sampling, shuffling, and Markovian sampling, affect the +convergence speed of UD-SGD by considering the impact of agent dynamics on the +limiting covariance matrix as described in the Central Limit Theorem (CLT). Our +findings not only support existing theories on linear speedup and asymptotic +network independence, but also theoretically and empirically show how efficient +sampling strategies employed by individual agents contribute to overall +convergence in UD-SGD. Simulations reveal that a few agents using highly +efficient sampling can achieve or surpass the performance of the majority +employing moderately improved strategies, providing new insights beyond +traditional analyses focusing on the worst-performing agent. + +
+
+ comment: To appear in NeurIPS 2024 +
+
+
+
+
+ + ☆ MathDSL: A Domain-Specific Language for Concise Mathematical Solutions + Via Program Synthesis + + +
+ We present MathDSL, a Domain-Specific Language (DSL) for mathematical +equation solving, which, when deployed in program synthesis models, outperforms +state-of-the-art reinforcement-learning-based methods. We also introduce a +quantitative metric for measuring the conciseness of a mathematical solution +and demonstrate the improvement in the quality of generated solutions compared +to other methods. Our system demonstrates that a program synthesis system +(DreamCoder) using MathDSL can generate programs that solve linear equations +with greater accuracy and conciseness than using reinforcement learning +systems. Additionally, we demonstrate that if we use the action spaces of +previous reinforcement learning systems as DSLs, MathDSL outperforms the +action-space-DSLs. We use DreamCoder to store equation-solving strategies as +learned abstractions in its program library and demonstrate that by using +MathDSL, these can be converted into human-interpretable solution strategies +that could have applications in mathematical education. + +
+
+
+
+
+ + ♻ ☆ Assumption violations in causal discovery and the robustness of score + matching NeurIPS + 2023 + + +
+ When domain knowledge is limited and experimentation is restricted by +ethical, financial, or time constraints, practitioners turn to observational +causal discovery methods to recover the causal structure, exploiting the +statistical properties of their data. Because causal discovery without further +assumptions is an ill-posed problem, each algorithm comes with its own set of +usually untestable assumptions, some of which are hard to meet in real +datasets. Motivated by these considerations, this paper extensively benchmarks +the empirical performance of recent causal discovery methods on observational +i.i.d. data generated under different background conditions, allowing for +violations of the critical assumptions required by each selected approach. Our +experimental findings show that score matching-based methods demonstrate +surprising performance in the false positive and false negative rate of the +inferred graph in these challenging scenarios, and we provide theoretical +insights into their performance. This work is also the first effort to +benchmark the stability of causal discovery algorithms with respect to the +values of their hyperparameters. Finally, we hope this paper will set a new +standard for the evaluation of causal discovery methods and can serve as an +accessible entry point for practitioners interested in the field, highlighting +the empirical implications of different algorithm choices. + +
+
+ comment: 37th Conference on Neural Information Processing Systems (NeurIPS + 2023) +
+
+
+
+
+ + ♻ ☆ Quantum Kernel Methods under Scrutiny: A Benchmarking Study + + +
+ Since the entry of kernel theory in the field of quantum machine learning, +quantum kernel methods (QKMs) have gained increasing attention with regard to +both probing promising applications and delivering intriguing research +insights. Two common approaches for computing the underlying Gram matrix have +emerged: fidelity quantum kernels (FQKs) and projected quantum kernels (PQKs). +Benchmarking these methods is crucial to gain robust insights and to understand +their practical utility. In this work, we present a comprehensive large-scale +study examining QKMs based on FQKs and PQKs across a manifold of design +choices. Our investigation encompasses both classification and regression tasks +for five dataset families and 64 datasets, systematically comparing the use of +FQKs and PQKs quantum support vector machines and kernel ridge regression. This +resulted in over 20,000 models that were trained and optimized using a +state-of-the-art hyperparameter search to ensure robust and comprehensive +insights. We delve into the importance of hyperparameters on model performance +scores and support our findings through rigorous correlation analyses. In this, +we also closely inspect two data encoding strategies. Moreover, we provide an +in-depth analysis addressing the design freedom of PQKs and explore the +underlying principles responsible for learning. Our goal is not to identify the +best-performing model for a specific task but to uncover the mechanisms that +lead to effective QKMs and reveal universal patterns. + +
+
+ comment: 18 pages main text including 12 figures and 1 table, appendix 14 + pages with 19 figures and 1 table; restructure result section and prune + appendix +
+
+
+
+
+ + ♻ ☆ Two-Timescale Gradient Descent Ascent Algorithms for Nonconvex Minimax + Optimization ICML 2020 + + +
+ We provide a unified analysis of two-timescale gradient descent ascent +(TTGDA) for solving structured nonconvex minimax optimization problems in the +form of $\min_\textbf{x} \max_{\textbf{y} \in Y} f(\textbf{x}, \textbf{y})$, +where the objective function $f(\textbf{x}, \textbf{y})$ is nonconvex in +$\textbf{x}$ and concave in $\textbf{y}$, and the constraint set $Y \subseteq +\mathbb{R}^n$ is convex and bounded. In the convex-concave setting, the +single-timescale gradient descent ascent (GDA) algorithm is widely used in +applications and has been shown to have strong convergence guarantees. In more +general settings, however, it can fail to converge. Our contribution is to +design TTGDA algorithms that are effective beyond the convex-concave setting, +efficiently finding a stationary point of the function $\Phi(\cdot) := +\max_{\textbf{y} \in Y} f(\cdot, \textbf{y})$. We also establish theoretical +bounds on the complexity of solving both smooth and nonsmooth nonconvex-concave +minimax optimization problems. To the best of our knowledge, this is the first +systematic analysis of TTGDA for nonconvex minimax optimization, shedding light +on its superior performance in training generative adversarial networks (GANs) +and in other real-world application problems. + +
+
+ comment: A preliminary version [arXiv:1906.00331] of this paper, with a subset + of the results that are presented here, was presented at ICML 2020; 44 Pages, + 10 Figures +
+
+
+
+
+ + ♻ ☆ Ascend HiFloat8 Format for Deep Learning + + +
+ This preliminary white paper proposes a novel 8-bit floating-point data +format HiFloat8 (abbreviated as HiF8) for deep learning. HiF8 features tapered +precision. For normal value encoding, it provides 7 exponent values with 3-bit +mantissa, 8 exponent values with 2-bit mantissa, and 16 exponent values with +1-bit mantissa. For denormal value encoding, it extends the dynamic range by 7 +extra powers of 2, from 31 to 38 binades (notice that FP16 covers 40 binades). +Meanwhile, HiF8 encodes all the special values except that positive zero and +negative zero are represented by only one bit-pattern. Thanks to the better +balance between precision and dynamic range, HiF8 can be simultaneously used in +both forward and backward passes of AI training. In this paper, we will +describe the definition and rounding methods of HiF8, as well as the tentative +training and inference solutions. To demonstrate the efficacy of HiF8, massive +simulation results on various neural networks, including traditional neural +networks and large language models (LLMs), will also be presented. + +
+
+ comment: 13 Pages, 4 Figures, 9 Tables +
+
+
+
+
+ + ♻ ☆ Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of + Peptides + + +
+ Molecular Dynamics (MD) is crucial in various fields such as materials +science, chemistry, and pharmacology to name a few. Conventional MD software +struggles with the balance between time cost and prediction accuracy, which +restricts its wider application. Recently, data-driven approaches based on deep +generative models have been devised for time-coarsened dynamics, which aim at +learning dynamics of diverse molecular systems over a long timestep, enjoying +both universality and efficiency. Nevertheless, most current methods are +designed solely to learn from the data distribution regardless of the +underlying Boltzmann distribution, and the physics priors such as energies and +forces are constantly overlooked. In this work, we propose a conditional +generative model called Force-guided Bridge Matching (FBM), which learns +full-atom time-coarsened dynamics and targets the Boltzmann-constrained +distribution. With the guidance of our delicately-designed intermediate force +field, FBM leverages favourable physics priors into the generation process, +giving rise to enhanced simulations. Experiments on two datasets consisting of +peptides verify our superiority in terms of comprehensive metrics and +demonstrate transferability to unseen systems. + +
+
+
+
+
+ + ♻ ☆ MLPs Learn In-Context on Regression and Classification Tasks + + +
+ In-context learning (ICL), the remarkable ability to solve a task from only +input exemplars, is often assumed to be a unique hallmark of Transformer +models. By examining commonly employed synthetic ICL tasks, we demonstrate that +multi-layer perceptrons (MLPs) can also learn in-context. Moreover, MLPs, and +the closely related MLP-Mixer models, learn in-context competitively with +Transformers given the same compute budget in this setting. We further show +that MLPs outperform Transformers on a series of classical tasks from +psychology designed to test relational reasoning, which are closely related to +in-context classification. These results underscore a need for studying +in-context learning beyond attention-based architectures, while also +challenging strong prior arguments about MLPs' limited ability to solve +relational tasks. Altogether, our results highlight the unexpected competence +of MLPs, and support the growing interest in all-MLP alternatives to +task-specific architectures. + +
+
+ comment: 30 pages, 10 figures, code available at + https://github.com/wtong98/mlp-icl +
+
+
+
+
+ + ♻ ☆ A Stochastic Quasi-Newton Method for Non-convex Optimization with + Non-uniform Smoothness + + +
+ Classical convergence analyses for optimization algorithms rely on the +widely-adopted uniform smoothness assumption. However, recent experimental +studies have demonstrated that many machine learning problems exhibit +non-uniform smoothness, meaning the smoothness factor is a function of the +model parameter instead of a universal constant. In particular, it has been +observed that the smoothness grows with respect to the gradient norm along the +training trajectory. Motivated by this phenomenon, the recently introduced +$(L_0, L_1)$-smoothness is a more general notion, compared to traditional +$L$-smoothness, that captures such positive relationship between smoothness and +gradient norm. Under this type of non-uniform smoothness, existing literature +has designed stochastic first-order algorithms by utilizing gradient clipping +techniques to obtain the optimal $\mathcal{O}(\epsilon^{-3})$ sample complexity +for finding an $\epsilon$-approximate first-order stationary solution. +Nevertheless, the studies of quasi-Newton methods are still lacking. +Considering higher accuracy and more robustness for quasi-Newton methods, in +this paper we propose a fast stochastic quasi-Newton method when there exists +non-uniformity in smoothness. Leveraging gradient clipping and variance +reduction, our algorithm can achieve the best-known +$\mathcal{O}(\epsilon^{-3})$ sample complexity and enjoys convergence speedup +with simple hyperparameter tuning. Our numerical experiments show that our +proposed algorithm outperforms the state-of-the-art approaches. + +
+
+ comment: Paper accepted by CDC 2024 +
+
+
+
+
+ + ♻ ☆ Message-Passing Monte Carlo: Generating low-discrepancy point sets via + Graph Neural Networks + + +
+ Discrepancy is a well-known measure for the irregularity of the distribution +of a point set. Point sets with small discrepancy are called low-discrepancy +and are known to efficiently fill the space in a uniform manner. +Low-discrepancy points play a central role in many problems in science and +engineering, including numerical integration, computer vision, machine +perception, computer graphics, machine learning, and simulation. In this work, +we present the first machine learning approach to generate a new class of +low-discrepancy point sets named Message-Passing Monte Carlo (MPMC) points. +Motivated by the geometric nature of generating low-discrepancy point sets, we +leverage tools from Geometric Deep Learning and base our model on Graph Neural +Networks. We further provide an extension of our framework to higher +dimensions, which flexibly allows the generation of custom-made points that +emphasize the uniformity in specific dimensions that are primarily important +for the particular problem at hand. Finally, we demonstrate that our proposed +model achieves state-of-the-art performance superior to previous methods by a +significant margin. In fact, MPMC points are empirically shown to be either +optimal or near-optimal with respect to the discrepancy for low dimension and +small number of points, i.e., for which the optimal discrepancy can be +determined. Code for generating MPMC points can be found at +https://github.com/tk-rusch/MPMC. + +
+
+ comment: Published in Proceedings of the National Academy of Sciences (PNAS): + https://www.pnas.org/doi/10.1073/pnas.2409913121 +
+
+
+
+
+ + ♻ ☆ TabGraphs: A Benchmark and Strong Baselines for Learning on Graphs with + Tabular Node Features + + +
+ Tabular machine learning is an important field for industry and science. In +this field, table rows are usually treated as independent data samples, but +additional information about relations between them is sometimes available and +can be used to improve predictive performance. Such information can be +naturally modeled with a graph, thus tabular machine learning may benefit from +graph machine learning methods. However, graph machine learning models are +typically evaluated on datasets with homogeneous node features, which have +little in common with heterogeneous mixtures of numerical and categorical +features present in tabular datasets. Thus, there is a critical difference +between the data used in tabular and graph machine learning studies, which does +not allow one to understand how successfully graph models can be transferred to +tabular data. To bridge this gap, we propose a new benchmark of diverse graphs +with heterogeneous tabular node features and realistic prediction tasks. We use +this benchmark to evaluate a vast set of models, including simple methods +previously overlooked in the literature. Our experiments show that graph neural +networks (GNNs) can indeed often bring gains in predictive performance for +tabular data, but standard tabular models also can be adapted to work with +graph data by using simple feature preprocessing, which sometimes enables them +to compete with and even outperform GNNs. Based on our empirical study, we +provide insights for researchers and practitioners in both tabular and graph +machine learning fields. + +
+
+
+
+
+ + ♻ ☆ Unraveling Anomalies in Time: Unsupervised Discovery and Isolation of + Anomalous Behavior in Bio-regenerative Life Support System Telemetry ECML + + +
+ The detection of abnormal or critical system states is essential in condition +monitoring. While much attention is given to promptly identifying anomalies, a +retrospective analysis of these anomalies can significantly enhance our +comprehension of the underlying causes of observed undesired behavior. This +aspect becomes particularly critical when the monitored system is deployed in a +vital environment. In this study, we delve into anomalies within the domain of +Bio-Regenerative Life Support Systems (BLSS) for space exploration and analyze +anomalies found in telemetry data stemming from the EDEN ISS space greenhouse +in Antarctica. We employ time series clustering on anomaly detection results to +categorize various types of anomalies in both uni- and multivariate settings. +We then assess the effectiveness of these methods in identifying systematic +anomalous behavior. Additionally, we illustrate that the anomaly detection +methods MDI and DAMP produce complementary results, as previously indicated by +research. + +
+
+ comment: 12 pages, + Supplemental Materials, Published at Machine Learning and + Knowledge Discovery in Databases. Applied Data Science Track. ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ A Comprehensive Framework for Evaluating API-oriented Code Generation in + Large Language Models + + +
+ Large language models (LLMs) like GitHub Copilot and ChatGPT have emerged as +powerful tools for code generation, significantly enhancing productivity and +accelerating software development. However, existing benchmarks primarily focus +on general code generation without considering API-oriented code generation, +i.e., generating code that invokes APIs from specific libraries. Given the +growing demand for API-oriented code generation, there is a pressing need for a +systematic and automated approach to evaluate LLM on API-oriented code +generation. To address this gap, we propose AutoAPIEval, a lightweight and +automated framework designed to evaluate the capabilities of LLMs in +API-oriented code generation. Our framework works with any library that +provides API documentation and focuses on two unit tasks: API recommendation +and code example generation, along with four metrics to evaluate the generated +APIs and code examples, such as the proportion of incorrect API recommendations +for Task 1, and the proportion of code examples where no specific API is +invoked and uncompilable/unexecutable code examples for Task 2. In addition, we +conducted a case study on three LLMs (ChatGPT, MagiCoder, and DeepSeek Coder) +and Java Runtime Environment 8 to demonstrate the framework's effectiveness. +Our findings reveal substantial variability in LLM performance across tasks, +with ChatGPT adhering better to instructions, while sharing similar +effectiveness in code example generation with its counterparts (i.e., MagiCoder +and DeekSeek Coder). We also identify key factors associated with code quality, +such as API popularity and model confidence, and build classifiers that achieve +high accuracy in detecting incorrect API recommendations and erroneous code +examples. Retrieval-augmented generation enhances the quality of code generated +by LLMs, though its effectiveness varies across different LLMs. + +
+
+
+
+
+ + ♻ ☆ Machine Learning for Two-Sample Testing under Right-Censored Data: A + Simulation Study + + +
+ The focus of this study is to evaluate the effectiveness of Machine Learning +(ML) methods for two-sample testing with right-censored observations. To +achieve this, we develop several ML-based methods with varying architectures +and implement them as two-sample tests. Each method is an ensemble (stacking) +that combines predictions from classical two-sample tests. This paper presents +the results of training the proposed ML methods, examines their statistical +power compared to classical two-sample tests, analyzes the null distribution of +the proposed methods when the null hypothesis is true, and evaluates the +significance of the features incorporated into the proposed methods. In total, +this work covers 18 methods for two-sample testing under right-censored +observations, including the proposed methods and classical well-studied +two-sample tests. All results from numerical experiments were obtained from a +synthetic dataset generated using the inverse transform sampling method and +replicated multiple times through Monte Carlo simulation. To test the +two-sample problem with right-censored observations, one can use the proposed +two-sample methods (scripts, dataset, and models are available on GitHub and +Hugging Face). + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Faster Randomized Methods for Orthogonality Constrained Problems + + +
+ Recent literature has advocated the use of randomized methods for +accelerating the solution of various matrix problems arising throughout data +science and computational science. One popular strategy for leveraging +randomization is to use it as a way to reduce problem size. However, methods +based on this strategy lack sufficient accuracy for some applications. +Randomized preconditioning is another approach for leveraging randomization, +which provides higher accuracy. The main challenge in using randomized +preconditioning is the need for an underlying iterative method, thus randomized +preconditioning so far have been applied almost exclusively to solving +regression problems and linear systems. In this article, we show how to expand +the application of randomized preconditioning to another important set of +problems prevalent across data science: optimization problems with +(generalized) orthogonality constraints. We demonstrate our approach, which is +based on the framework of Riemannian optimization and Riemannian +preconditioning, on the problem of computing the dominant canonical +correlations and on the Fisher linear discriminant analysis problem. For both +problems, we evaluate the effect of preconditioning on the computational costs +and asymptotic convergence, and demonstrate empirically the utility of our +approach. + +
+
+
+
+
+ + ♻ Discrete, compositional, and symbolic representations through attractor + dynamics + + +
+ Symbolic systems are powerful frameworks for modeling cognitive processes as +they encapsulate the rules and relationships fundamental to many aspects of +human reasoning and behavior. Central to these models are systematicity, +compositionality, and productivity, making them invaluable in both cognitive +science and artificial intelligence. However, certain limitations remain. For +instance, the integration of structured symbolic processes and latent +sub-symbolic processes has been implemented at the computational level through +fiat methods such as quantization or softmax sampling, which assume, rather +than derive, the operations underpinning discretization and symbolicization. In +this work, we introduce a novel neural stochastic dynamical systems model that +integrates attractor dynamics with symbolic representations to model cognitive +processes akin to the probabilistic language of thought (PLoT). Our model +segments the continuous representational space into discrete basins, with +attractor states corresponding to symbolic sequences, that reflect the +semanticity and compositionality characteristic of symbolic systems through +unsupervised learning, rather than relying on pre-defined primitives. Moreover, +like PLoT, our model learns to sample a diverse distribution of attractor +states that reflect the mutual information between the input data and the +symbolic encodings. This approach establishes a unified framework that +integrates both symbolic and sub-symbolic processing through neural dynamics, a +neuro-plausible substrate with proven expressivity in AI, offering a more +comprehensive model that mirrors the complex duality of cognitive operations. + +
+
+
+
+
+ + ♻ ☆ ZSC-Eval: An Evaluation Toolkit and Benchmark for Multi-agent Zero-shot + Coordination NeurIPS 2024 + + +
+ Zero-shot coordination (ZSC) is a new cooperative multi-agent reinforcement +learning (MARL) challenge that aims to train an ego agent to work with diverse, +unseen partners during deployment. The significant difference between the +deployment-time partners' distribution and the training partners' distribution +determined by the training algorithm makes ZSC a unique out-of-distribution +(OOD) generalization challenge. The potential distribution gap between +evaluation and deployment-time partners leads to inadequate evaluation, which +is exacerbated by the lack of appropriate evaluation metrics. In this paper, we +present ZSC-Eval, the first evaluation toolkit and benchmark for ZSC +algorithms. ZSC-Eval consists of: 1) Generation of evaluation partner +candidates through behavior-preferring rewards to approximate deployment-time +partners' distribution; 2) Selection of evaluation partners by Best-Response +Diversity (BR-Div); 3) Measurement of generalization performance with various +evaluation partners via the Best-Response Proximity (BR-Prox) metric. We use +ZSC-Eval to benchmark ZSC algorithms in Overcooked and Google Research Football +environments and get novel empirical findings. We also conduct a human +experiment of current ZSC algorithms to verify the ZSC-Eval's consistency with +human evaluation. ZSC-Eval is now available at +https://github.com/sjtu-marl/ZSC-Eval. + +
+
+ comment: Accepted in NeurIPS 2024 Dataset and Benchmark Track +
+
+
+
+
+ + ♻ ☆ Strategic Linear Contextual Bandits NeurIPS 2024 + + +
+ Motivated by the phenomenon of strategic agents gaming a recommender system +to maximize the number of times they are recommended to users, we study a +strategic variant of the linear contextual bandit problem, where the arms can +strategically misreport privately observed contexts to the learner. We treat +the algorithm design problem as one of mechanism design under uncertainty and +propose the Optimistic Grim Trigger Mechanism (OptGTM) that incentivizes the +agents (i.e., arms) to report their contexts truthfully while simultaneously +minimizing regret. We also show that failing to account for the strategic +nature of the agents results in linear regret. However, a trade-off between +mechanism design and regret minimization appears to be unavoidable. More +broadly, this work aims to provide insight into the intersection of online +learning and mechanism design. + +
+
+ comment: To appear at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion-based Generative Image Outpainting for Recovery of + FOV-Truncated CT Images + + +
+ Field-of-view (FOV) recovery of truncated chest CT scans is crucial for +accurate body composition analysis, which involves quantifying skeletal muscle +and subcutaneous adipose tissue (SAT) on CT slices. This, in turn, enables +disease prognostication. Here, we present a method for recovering truncated CT +slices using generative image outpainting. We train a diffusion model and apply +it to truncated CT slices generated by simulating a small FOV. Our model +reliably recovers the truncated anatomy and outperforms the previous +state-of-the-art despite being trained on 87% less data. + +
+
+ comment: Shared last authorship: Florian J. Fintelmann and Philip M\"uller +
+
+
+
+
+ + ♻ ☆ Characterizing stable regions in the residual stream of LLMs + + +
+ We identify "stable regions" in the residual stream of Transformers, where +the model's output remains insensitive to small activation changes, but +exhibits high sensitivity at region boundaries. These regions emerge during +training and become more defined as training progresses or model size +increases. The regions appear to be much larger than previously studied +polytopes. Our analysis suggests that these stable regions align with semantic +distinctions, where similar prompts cluster within regions, and activations +from the same region lead to similar next token predictions. This work provides +a promising research direction for understanding the complexity of neural +networks, shedding light on training dynamics, and advancing interpretability. + +
+
+
+
+
+ + ♻ ☆ Learning Constrained Markov Decision Processes With Non-stationary + Rewards and Constraints + + +
+ In constrained Markov decision processes (CMDPs) with adversarial rewards and +constraints, a well-known impossibility result prevents any algorithm from +attaining both sublinear regret and sublinear constraint violation, when +competing against a best-in-hindsight policy that satisfies constraints on +average. In this paper, we show that this negative result can be eased in CMDPs +with non-stationary rewards and constraints, by providing algorithms whose +performances smoothly degrade as non-stationarity increases. Specifically, we +propose algorithms attaining $\tilde{\mathcal{O}} (\sqrt{T} + C)$ regret and +positive constraint violation under bandit feedback, where $C$ is a corruption +value measuring the environment non-stationarity. This can be $\Theta(T)$ in +the worst case, coherently with the impossibility result for adversarial CMDPs. +First, we design an algorithm with the desired guarantees when $C$ is known. +Then, in the case $C$ is unknown, we show how to obtain the same results by +embedding such an algorithm in a general meta-procedure. This is of independent +interest, as it can be applied to any non-stationary constrained online +learning setting. + +
+
+
+
+
+ + ♻ ☆ Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation + + +
+ Given the high cost of collecting robotic data in the real world, sample +efficiency is a consistently compelling pursuit in robotics. In this paper, we +introduce SGRv2, an imitation learning framework that enhances sample +efficiency through improved visual and action representations. Central to the +design of SGRv2 is the incorporation of a critical inductive bias-action +locality, which posits that robot's actions are predominantly influenced by the +target object and its interactions with the local environment. Extensive +experiments in both simulated and real-world settings demonstrate that action +locality is essential for boosting sample efficiency. SGRv2 excels in RLBench +tasks with keyframe control using merely 5 demonstrations and surpasses the RVT +baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and +MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. +In real-world environments, with only eight demonstrations, SGRv2 can perform a +variety of tasks at a markedly higher success rate compared to baseline models. +Project website: http://sgrv2-robot.github.io + +
+
+ comment: CoRL 2024. Project website: http://sgrv2-robot.github.io +
+
+
+
+
+ + ♻ ☆ What happens to diffusion model likelihood when your model is + conditional? + + +
+ Diffusion Models (DMs) iteratively denoise random samples to produce +high-quality data. The iterative sampling process is derived from Stochastic +Differential Equations (SDEs), allowing a speed-quality trade-off chosen at +inference. Another advantage of sampling with differential equations is exact +likelihood computation. These likelihoods have been used to rank unconditional +DMs and for out-of-domain classification. Despite the many existing and +possible uses of DM likelihoods, the distinct properties captured are unknown, +especially in conditional contexts such as Text-To-Image (TTI) or +Text-To-Speech synthesis (TTS). Surprisingly, we find that TTS DM likelihoods +are agnostic to the text input. TTI likelihood is more expressive but cannot +discern confounding prompts. Our results show that applying DMs to conditional +tasks reveals inconsistencies and strengthens claims that the properties of DM +likelihood are unknown. This impact sheds light on the previously unknown +nature of DM likelihoods. Although conditional DMs maximise likelihood, the +likelihood in question is not as sensitive to the conditioning input as one +expects. This investigation provides a new point-of-view on diffusion +likelihoods. + +
+
+
+
+
+ + ♻ ☆ Explainable AI needs formal notions of explanation correctness + + +
+ The use of machine learning (ML) in critical domains such as medicine poses +risks and requires regulation. One requirement is that decisions of ML systems +in high-risk applications should be human-understandable. The field of +"explainable artificial intelligence" (XAI) seemingly addresses this need. +However, in its current form, XAI is unfit to provide quality control for ML; +it itself needs scrutiny. Popular XAI methods cannot reliably answer important +questions about ML models, their training data, or a given test input. We +recapitulate results demonstrating that popular XAI methods systematically +attribute importance to input features that are independent of the prediction +target. This limits their utility for purposes such as model and data +(in)validation, model improvement, and scientific discovery. We argue that the +fundamental reason for this limitation is that current XAI methods do not +address well-defined problems and are not evaluated against objective criteria +of explanation correctness. Researchers should formally define the problems +they intend to solve first and then design methods accordingly. This will lead +to notions of explanation correctness that can be theoretically verified and +objective metrics of explanation performance that can be assessed using +ground-truth data. + +
+
+
+
+
+ + ♻ ☆ Efficient Combinatorial Optimization via Heat Diffusion NeurIPS 2024 + + +
+ Combinatorial optimization problems are widespread but inherently challenging +due to their discrete nature. The primary limitation of existing methods is +that they can only access a small fraction of the solution space at each +iteration, resulting in limited efficiency for searching the global optimal. To +overcome this challenge, diverging from conventional efforts of expanding the +solver's search scope, we focus on enabling information to actively propagate +to the solver through heat diffusion. By transforming the target function while +preserving its optima, heat diffusion facilitates information flow from distant +regions to the solver, providing more efficient navigation. Utilizing heat +diffusion, we propose a framework for solving general combinatorial +optimization problems. The proposed methodology demonstrates superior +performance across a range of the most challenging and widely encountered +combinatorial optimizations. Echoing recent advancements in harnessing +thermodynamics for generative artificial intelligence, our study further +reveals its significant potential in advancing combinatorial optimization. + +
+
+ comment: After the rebuttal version for NeurIPS 2024 (poster). Code is + available in https://github.com/AwakerMhy/HeO +
+
+
+
+
+ + ♻ ☆ Learning to Receive Help: Intervention-Aware Concept Embedding Models NeurIPS 2023 + + +
+ Concept Bottleneck Models (CBMs) tackle the opacity of neural architectures +by constructing and explaining their predictions using a set of high-level +concepts. A special property of these models is that they permit concept +interventions, wherein users can correct mispredicted concepts and thus improve +the model's performance. Recent work, however, has shown that intervention +efficacy can be highly dependent on the order in which concepts are intervened +on and on the model's architecture and training hyperparameters. We argue that +this is rooted in a CBM's lack of train-time incentives for the model to be +appropriately receptive to concept interventions. To address this, we propose +Intervention-aware Concept Embedding models (IntCEMs), a novel CBM-based +architecture and training paradigm that improves a model's receptiveness to +test-time interventions. Our model learns a concept intervention policy in an +end-to-end fashion from where it can sample meaningful intervention +trajectories at train-time. This conditions IntCEMs to effectively select and +receive concept interventions when deployed at test-time. Our experiments show +that IntCEMs significantly outperform state-of-the-art concept-interpretable +models when provided with test-time concept interventions, demonstrating the +effectiveness of our approach. + +
+
+ comment: Accepted as a spotlight at the Thirty-seventh Conference on Neural + Information Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ♻ ☆ Neural Exploratory Landscape Analysis + + +
+ Recent research in Meta-Black-Box Optimization (MetaBBO) have shown that +meta-trained neural networks can effectively guide the design of black-box +optimizers, significantly reducing the need for expert tuning and delivering +robust performance across complex problem distributions. Despite their success, +a paradox remains: MetaBBO still rely on human-crafted Exploratory Landscape +Analysis features to inform the meta-level agent about the low-level +optimization progress. To address the gap, this paper proposes Neural +Exploratory Landscape Analysis (NeurELA), a novel framework that dynamically +profiles landscape features through a two-stage, attention-based neural +network, executed in an entirely end-to-end fashion. NeurELA is pre-trained +over a variety of MetaBBO algorithms using a multi-task neuroevolution +strategy. Extensive experiments show that NeurELA achieves consistently +superior performance when integrated into different and even unseen MetaBBO +tasks and can be efficiently fine-tuned for further performance boost. This +advancement marks a pivotal step in making MetaBBO algorithms more autonomous +and broadly applicable.The source code of NeurELA can be accessed at +https://anonymous.4open.science/r/Neur-ELA-303C. + +
+
+
+
+
+ + ♻ ☆ Unsupervisedly Learned Representations: Should the Quest be Over? + + +
+ After four decades of research there still exists a Classification accuracy +gap of about 20% between our best Unsupervisedly Learned Representations +methods and the accuracy rates achieved by intelligent animals. It thus may +well be that we are looking in the wrong direction. A possible solution to this +puzzle is presented. We demonstrate that Reinforcement Learning can learn +representations which achieve the same accuracy as that of animals. Our main +modest contribution lies in the observations that: a. when applied to a real +world environment Reinforcement Learning does not require labels, and thus may +be legitimately considered as Unsupervised Learning, and b. in contrast, when +Reinforcement Learning is applied in a simulated environment it does inherently +require labels and should thus be generally be considered as Supervised +Learning. The corollary of these observations is that further search for +Unsupervised Learning competitive paradigms which may be trained in simulated +environments may be futile. + +
+
+ comment: To be published at The 6th International Conference on Machine + Learning, Optimization and Data Science - LOD 2020 +
+
+
+
+
+ + ♻ ☆ Exploring Selective Layer Fine-Tuning in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for fine-tuning +foundation models using distributed data in a privacy-preserving manner. Under +limited computational resources, clients often find it more practical to +fine-tune a selected subset of layers, rather than the entire model, based on +their task-specific data. In this study, we provide a thorough theoretical +exploration of selective layer fine-tuning in FL, emphasizing a flexible +approach that allows the clients to adjust their selected layers according to +their local data and resources. We theoretically demonstrate that the layer +selection strategy has a significant impact on model convergence in two +critical aspects: the importance of selected layers and the heterogeneous +choices across clients. Drawing from these insights, we further propose a +strategic layer selection method that utilizes local gradients and regulates +layer selections across clients. The extensive experiments on both image and +text datasets demonstrate the effectiveness of the proposed strategy compared +with several baselines, highlighting its advances in identifying critical +layers that adapt to the client heterogeneity and training dynamics in FL. + +
+
+
+
+
+ + ♻ ☆ Modeling and Analyzing the Influence of Non-Item Pages on Sequential + Next-Item Prediction + + +
+ Analyzing sequences of interactions between users and items, sequential +recommendation models can learn user intent and make predictions about the next +item. Next to item interactions, most systems also have interactions with what +we call non-item pages: these pages are not related to specific items but still +can provide insights of the user's interests, as, for example, navigation +pages. + We therefore propose a general way to include these non-item pages in +sequential recommendation models to enhance next-item prediction. First, we +demonstrate the influence of non-item pages on following interactions with the +hypotheses testing framework HypTrails and propose methods for representing +non-item pages in sequential recommendation models. Subsequently, we adapt +popular sequential recommender models to integrate non-item pages and +investigate their performance with different item representation strategies as +well as their ability to handle noisy data. To show the general capabilities of +the models to integrate non-item pages, we create a synthetic dataset for a +controlled setting and then evaluate the improvements from including non-item +pages on two real-world datasets. + Our results show that non-item pages are a valuable source of information, +and incorporating them in sequential recommendation models increases the +performance of next-item prediction across all analyzed model architectures. + +
+
+ comment: 37 pages, 19 figures; Submitted to ACM TORS +
+
+
+
+
+ + ♻ ☆ On the Design and Analysis of LLM-Based Algorithms + + +
+ We initiate a formal investigation into the design and analysis of LLM-based +algorithms, i.e. algorithms that contain one or multiple calls of large +language models (LLMs) as sub-routines and critically rely on the capabilities +of LLMs. While LLM-based algorithms, ranging from basic LLM calls with prompt +engineering to complicated LLM-powered agent systems and compound AI systems, +have achieved remarkable empirical success, the design and optimization of them +have mostly relied on heuristics and trial-and-errors, which is largely due to +a lack of formal and analytical study for these algorithms. To fill this gap, +we start by identifying the computational-graph representation of LLM-based +algorithms, the design principle of task decomposition, and some key +abstractions, which then facilitate our formal analysis for the accuracy and +efficiency of LLM-based algorithms, despite the black-box nature of LLMs. +Through extensive analytical and empirical investigation in a series of case +studies, we demonstrate that the proposed framework is broadly applicable to a +wide range of scenarios and diverse patterns of LLM-based algorithms, such as +parallel, hierarchical and recursive task decomposition. Our proposed framework +holds promise for advancing LLM-based algorithms, by revealing the reasons +behind curious empirical phenomena, guiding the choices of hyperparameters, +predicting the empirical performance of algorithms, and inspiring new algorithm +design. To promote further study of LLM-based algorithms, we release our source +code at +https://github.com/modelscope/agentscope/tree/main/examples/paper_llm_based_algorithm. + +
+
+
+
+
+ + ♻ ☆ dlordinal: a Python package for deep ordinal classification + + +
+ dlordinal is a new Python library that unifies many recent deep ordinal +classification methodologies available in the literature. Developed using +PyTorch as underlying framework, it implements the top performing +state-of-the-art deep learning techniques for ordinal classification problems. +Ordinal approaches are designed to leverage the ordering information present in +the target variable. Specifically, it includes loss functions, various output +layers, dropout techniques, soft labelling methodologies, and other +classification strategies, all of which are appropriately designed to +incorporate the ordinal information. Furthermore, as the performance metrics to +assess novel proposals in ordinal classification depend on the distance between +target and predicted classes in the ordinal scale, suitable ordinal evaluation +metrics are also included. dlordinal is distributed under the BSD-3-Clause +license and is available at https://github.com/ayrna/dlordinal. + +
+
+
+
+
+ + ♻ ☆ Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable + Tensor Collections SOSP24 + + +
+ Deep learning (DL) jobs use multi-dimensional parallelism, i.e. combining +data, model, and pipeline parallelism, to use large GPU clusters efficiently. +Long-running jobs may experience changes to their GPU allocation: (i) resource +elasticity during training adds or removes GPUs; (ii) hardware maintenance may +require redeployment on different GPUs; and (iii) GPU failures force jobs to +run with fewer devices. Current DL frameworks tie jobs to a set of GPUs and +thus lack support for these scenarios. In particular, they cannot change the +multi-dimensional parallelism of an already-running job in an efficient and +model-independent way. + We describe Scalai, a state management library for DL systems that enables +jobs to change their parallelism dynamically after the GPU allocation is +updated at runtime. Scalai achieves this through a new abstraction, a +parallelizable tensor collection (PTC), that externalizes the job state during +training. After a GPU change, Scalai uses the PTC to transform the job state: +the PTC repartitions the dataset state under data parallelism and exposes it to +DL workers through a virtual file system; and the PTC obtains the model state +as partitioned checkpoints and transforms them to reflect the new +parallelization configuration. For efficiency, Scalai executes PTC +transformations in parallel with minimum data movement between workers. Our +experiments show that Scalai enables DL jobs to support dynamic parallelization +with low overhead. + +
+
+ comment: The 30th Symposium on Operating Systems Principles (SOSP24) +
+
+
+
+
+ + ♻ ☆ IDP-PGFE: An Interpretable Disruption Predictor based on Physics-Guided + Feature Extraction + + +
+ Disruption prediction has made rapid progress in recent years, especially in +machine learning (ML)-based methods. Understanding why a predictor makes a +certain prediction can be as crucial as the prediction's accuracy for future +tokamak disruption predictors. The purpose of most disruption predictors is +accuracy or cross-machine capability. However, if a disruption prediction model +can be interpreted, it can tell why certain samples are classified as +disruption precursors. This allows us to tell the types of incoming disruption +and gives us insight into the mechanism of disruption. This paper designs a +disruption predictor called Interpretable Disruption Predictor based On +Physics-guided feature extraction (IDP-PGFE) on J-TEXT. The prediction +performance of the model is effectively improved by extracting physics-guided +features. A high-performance model is required to ensure the validity of the +interpretation results. The interpretability study of IDP-PGFE provides an +understanding of J-TEXT disruption and is generally consistent with existing +comprehension of disruption. IDP-PGFE has been applied to the disruption due to +continuously increasing density towards density limit experiments on J-TEXT. +The time evolution of the PGFE features contribution demonstrates that the +application of ECRH triggers radiation-caused disruption, which lowers the +density at disruption. While the application of RMP indeed raises the density +limit in J-TEXT. The interpretability study guides intuition on the physical +mechanisms of density limit disruption that RMPs affect not only the MHD +instabilities but also the radiation profile, which delays density limit +disruption. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous + Federated Learning Framework + + +
+ Traditional federated learning (FL) frameworks rely heavily on terrestrial +networks, where coverage limitations and increasing bandwidth congestion +significantly hinder model convergence. Fortunately, the advancement of +low-Earth orbit (LEO) satellite networks offers promising new communication +avenues to augment traditional terrestrial FL. Despite this potential, the +limited satellite-ground communication bandwidth and the heterogeneous +operating environments of ground devices-including variations in data, +bandwidth, and computing power-pose substantial challenges for effective and +robust satellite-assisted FL. To address these challenges, we propose SatFed, a +resource-efficient satellite-assisted heterogeneous FL framework. SatFed +implements freshness-based model prioritization queues to optimize the use of +highly constrained satellite-ground bandwidth, ensuring the transmission of the +most critical models. Additionally, a multigraph is constructed to capture +real-time heterogeneous relationships between devices, including data +distribution, terrestrial bandwidth, and computing capability. This multigraph +enables SatFed to aggregate satellite-transmitted models into peer guidance, +enhancing local training in heterogeneous environments. Extensive experiments +with real-world LEO satellite networks demonstrate that SatFed achieves +superior performance and robustness compared to state-of-the-art benchmarks. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ VARADE: a Variational-based AutoRegressive model for Anomaly Detection + on the Edge + + +
+ Detecting complex anomalies on massive amounts of data is a crucial task in +Industry 4.0, best addressed by deep learning. However, available solutions are +computationally demanding, requiring cloud architectures prone to latency and +bandwidth issues. This work presents VARADE, a novel solution implementing a +light autoregressive framework based on variational inference, which is best +suited for real-time execution on the edge. The proposed approach was validated +on a robotic arm, part of a pilot production line, and compared with several +state-of-the-art algorithms, obtaining the best trade-off between anomaly +detection accuracy, power consumption and inference frequency on two different +edge platforms. + +
+
+
+
+
+ + ♻ ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems play a crucial role in achieving Sustainable +Development Goals by collecting and analyzing vital global data through +satellite networks. These systems are essential for tasks like mapping, +disaster monitoring, and resource management, but they face challenges in +processing and transmitting large volumes of EO data, especially in specialized +fields such as agriculture and real-time disaster response. Domain-adapted +Large Language Models (LLMs) provide a promising solution by facilitating data +fusion between extensive EO data and semantic EO data. By improving integration +and interpretation of diverse datasets, LLMs address the challenges of +processing specialized information in agriculture and disaster response +applications. This fusion enhances the accuracy and relevance of transmitted +data. This paper presents a framework for semantic communication in EO +satellite networks, aimed at improving data transmission efficiency and overall +system performance through cognitive processing techniques. The proposed system +employs Discrete-Task-Oriented Source-Channel Coding (DT-JSCC) and Semantic +Data Augmentation (SA) to focus on relevant information while minimizing +communication overhead. By integrating cognitive semantic processing and +inter-satellite links, the framework enhances the analysis and transmission of +multispectral satellite imagery, improving object detection, pattern +recognition, and real-time decision-making. The introduction of Cognitive +Semantic Augmentation (CSA) allows satellites to process and transmit semantic +information, boosting adaptability to changing environments and application +needs. This end-to-end architecture is tailored for next-generation satellite +networks, such as those supporting 6G, and demonstrates significant +improvements in efficiency and accuracy. + +
+
+ comment: 18 pages, 10 figures, magazine +
+
+
+
+
+ + ♻ ☆ Realising Synthetic Active Inference Agents, Part II: Variational + Message Updates + + +
+ The Free Energy Principle (FEP) describes (biological) agents as minimising a +variational Free Energy (FE) with respect to a generative model of their +environment. Active Inference (AIF) is a corollary of the FEP that describes +how agents explore and exploit their environment by minimising an expected FE +objective. In two related papers, we describe a scalable, epistemic approach to +synthetic AIF, by message passing on free-form Forney-style Factor Graphs +(FFGs). A companion paper (part I) introduces a Constrained FFG (CFFG) notation +that visually represents (generalised) FE objectives for AIF. The current paper +(part II) derives message passing algorithms that minimise (generalised) FE +objectives on a CFFG by variational calculus. A comparison between simulated +Bethe and generalised FE agents illustrates how the message passing approach to +synthetic AIF induces epistemic behaviour on a T-maze navigation task. +Extension of the T-maze simulation to 1) learning goal statistics, and 2) a +multi-agent bargaining setting, illustrate how this approach encourages reuse +of nodes and updates in alternative settings. With a full message passing +account of synthetic AIF agents, it becomes possible to derive and reuse +message updates across models and move closer to industrial applications of +synthetic AIF. + +
+
+
+
+
+ + ♻ ☆ EPTQ: Enhanced Post-Training Quantization via Hessian-guided + Network-wise Optimization + + +
+ Quantization is a key method for deploying deep neural networks on edge +devices with limited memory and computation resources. Recent improvements in +Post-Training Quantization (PTQ) methods were achieved by an additional local +optimization process for learning the weight quantization rounding policy. +However, a gap exists when employing network-wise optimization with small +representative datasets. In this paper, we propose a new method for enhanced +PTQ (EPTQ) that employs a network-wise quantization optimization process, which +benefits from considering cross-layer dependencies during optimization. EPTQ +enables network-wise optimization with a small representative dataset using a +novel sample-layer attention score based on a label-free Hessian matrix upper +bound. The label-free approach makes our method suitable for the PTQ scheme. We +give a theoretical analysis for the said bound and use it to construct a +knowledge distillation loss that guides the optimization to focus on the more +sensitive layers and samples. In addition, we leverage the Hessian upper bound +to improve the weight quantization parameters selection by focusing on the more +sensitive elements in the weight tensors. Empirically, by employing EPTQ we +achieve state-of-the-art results on various models, tasks, and datasets, +including ImageNet classification, COCO object detection, and Pascal-VOC for +semantic segmentation. + +
+
+
+
+
+ + ♻ ☆ Recurrent Stochastic Configuration Networks for Temporal Data Analytics + + +
+ Temporal data modelling techniques with neural networks are useful in many +domain applications, including time-series forecasting and control engineering. +This paper aims at developing a recurrent version of stochastic configuration +networks (RSCNs) for problem solving, where we have no underlying assumption on +the dynamic orders of the input variables. Given a collection of historical +data, we first build an initial RSCN model in the light of a supervisory +mechanism, followed by an online update of the output weights by using a +projection algorithm. Some theoretical results are established, including the +echo state property, the universal approximation property of RSCNs for both the +offline and online learnings, and the convergence of the output weights. The +proposed RSCN model is remarkably distinguished from the well-known echo state +networks (ESNs) in terms of the way of assigning the input random weight matrix +and a special structure of the random feedback matrix. A comprehensive +comparison study among the long short-term memory (LSTM) network, the original +ESN, and several state-of-the-art ESN methods such as the simple cycle +reservoir (SCR), the polynomial ESN (PESN), the leaky-integrator ESN (LIESN) +and RSCN is carried out. Numerical results clearly indicate that the proposed +RSCN performs favourably over all of the datasets. + +
+
+
+
+
+ + ♻ ☆ Archon: An Architecture Search Framework for Inference-Time Techniques + + +
+ Inference-time techniques are emerging as highly effective tools to increase +large language model (LLM) capabilities. However, there is still limited +understanding of the best practices for developing systems that combine +inference-time techniques with one or more LLMs, with challenges including: (1) +effectively allocating inference compute budget, (2) understanding the +interactions between different combinations of inference-time techniques and +their impact on downstream performance, and 3) efficiently searching over the +large space of model choices, inference-time techniques, and their +compositions. To address these challenges, we introduce Archon, an automated +framework for designing inference-time architectures. Archon defines an +extensible design space, encompassing methods such as generation ensembling, +multi-sampling, ranking, fusion, critiquing, verification, and unit testing. It +then transforms the problem of selecting and combining LLMs and inference-time +techniques into a hyperparameter optimization objective. To optimize this +objective, we introduce automated Inference-Time Architecture Search (ITAS) +algorithms. Given target benchmark(s), an inference compute budget, and +available LLMs, ITAS outputs optimized architectures. We evaluate Archon +architectures across a wide range of instruction-following and reasoning +benchmarks, including MT-Bench, Arena-Hard-Auto, AlpacaEval 2.0, MixEval, +MixEval Hard, MATH, and CodeContests. We show that automatically designed +inference-time architectures by Archon outperform strong models such as GPT-4o +and Claude 3.5 Sonnet on these benchmarks, achieving an average increase of +15.1 and 11.2 percentage points with all-source models and open-source models, +respectively. We make our code and datasets available publicly on Github: +https://github.com/ScalingIntelligence/Archon. + +
+
+
+
+
+ + ♻ ☆ Quality Matters: Evaluating Synthetic Data for Tool-Using LLMs + + +
+ Training large language models (LLMs) for external tool usage is a rapidly +expanding field, with recent research focusing on generating synthetic data to +address the shortage of available data. However, the absence of systematic data +quality checks poses complications for properly training and testing models. To +that end, we propose two approaches for assessing the reliability of data for +training LLMs to use external tools. The first approach uses intuitive, +human-defined correctness criteria. The second approach uses a model-driven +assessment with in-context evaluation. We conduct a thorough evaluation of data +quality on two popular benchmarks, followed by an extrinsic evaluation that +showcases the impact of data quality on model performance. Our results +demonstrate that models trained on high-quality data outperform those trained +on unvalidated data, even when trained with a smaller quantity of data. These +findings empirically support the significance of assessing and ensuring the +reliability of training data for tool-using LLMs. + +
+
+
+
+
+ + ♻ ☆ Hybrid Spiking Neural Networks for Low-Power Intra-Cortical + Brain-Machine Interfaces + + +
+ Intra-cortical brain-machine interfaces (iBMIs) have the potential to +dramatically improve the lives of people with paraplegia by restoring their +ability to perform daily activities. However, current iBMIs suffer from +scalability and mobility limitations due to bulky hardware and wiring. Wireless +iBMIs offer a solution but are constrained by a limited data rate. To overcome +this challenge, we are investigating hybrid spiking neural networks for +embedded neural decoding in wireless iBMIs. The networks consist of a temporal +convolution-based compression followed by recurrent processing and a final +interpolation back to the original sequence length. As recurrent units, we +explore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons, +and a combination of both - spiking GRUs (sGRUs) and analyze their differences +in terms of accuracy, footprint, and activation sparsity. To that end, we train +decoders on the "Nonhuman Primate Reaching with Multichannel Sensorimotor +Cortex Electrophysiology" dataset and evaluate it using the NeuroBench +framework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural +Decoding. Our approach achieves high accuracy in predicting velocities of +primate reaching movements from multichannel primary motor cortex recordings +while maintaining a low number of synaptic operations, surpassing the current +baseline models in the NeuroBench framework. This work highlights the potential +of hybrid neural networks to facilitate wireless iBMIs with high decoding +precision and a substantial increase in the number of monitored neurons, paving +the way toward more advanced neuroprosthetic technologies. + +
+
+ comment: This work has been accepted at the 2024 IEEE Biomedical Circuits and + Systems Conference +
+
+
+
+
+ + ♻ ☆ Improving Fast Adversarial Training Paradigm: An Example Taxonomy + Perspective + + +
+ While adversarial training is an effective defense method against adversarial +attacks, it notably increases the training cost. To this end, fast adversarial +training (FAT) is presented for efficient training and has become a hot +research topic. However, FAT suffers from catastrophic overfitting, which leads +to a performance drop compared with multi-step adversarial training. However, +the cause of catastrophic overfitting remains unclear and lacks exploration. In +this paper, we present an example taxonomy in FAT, which identifies that +catastrophic overfitting is caused by the imbalance between the inner and outer +optimization in FAT. Furthermore, we investigated the impact of varying degrees +of training loss, revealing a correlation between training loss and +catastrophic overfitting. Based on these observations, we redesign the loss +function in FAT with the proposed dynamic label relaxation to concentrate the +loss range and reduce the impact of misclassified examples. Meanwhile, we +introduce batch momentum initialization to enhance the diversity to prevent +catastrophic overfitting in an efficient manner. Furthermore, we also propose +Catastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate +training strategy for examples based on their loss degree. Our proposed method, +named example taxonomy aware FAT (ETA), establishes an improved paradigm for +FAT. Experiment results demonstrate our ETA achieves state-of-the-art +performance. Comprehensive experiments on four standard datasets demonstrate +the competitiveness of our proposed method. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ GlycanML: A Multi-Task and Multi-Structure Benchmark for Glycan Machine + Learning + + +
+ Glycans are basic biomolecules and perform essential functions within living +organisms. The rapid increase of functional glycan data provides a good +opportunity for machine learning solutions to glycan understanding. However, +there still lacks a standard machine learning benchmark for glycan function +prediction. In this work, we fill this blank by building a comprehensive +benchmark for Glycan Machine Learning (GlycanML). The GlycanML benchmark +consists of diverse types of tasks including glycan taxonomy prediction, glycan +immunogenicity prediction, glycosylation type prediction, and protein-glycan +interaction prediction. Glycans can be represented by both sequences and graphs +in GlycanML, which enables us to extensively evaluate sequence-based models and +graph neural networks (GNNs) on benchmark tasks. Furthermore, by concurrently +performing eight glycan taxonomy prediction tasks, we introduce the +GlycanML-MTL testbed for multi-task learning (MTL) algorithms. Experimental +results show the superiority of modeling glycans with multi-relational GNNs, +and suitable MTL methods can further boost model performance. We provide all +datasets and source codes at https://github.com/GlycanML/GlycanML and maintain +a leaderboard at https://GlycanML.github.io/project + +
+
+ comment: Research project paper. All code and data are released +
+
+
+
+
+ + ♻ ☆ Understanding the Expressivity and Trainability of Fourier Neural + Operator: A Mean-Field Perspective + + +
+ In this paper, we explores the expressivity and trainability of the Fourier +Neural Operator (FNO). We establish a mean-field theory for the FNO, analyzing +the behavior of the random FNO from an edge of chaos perspective. Our +investigation into the expressivity of a random FNO involves examining the +ordered-chaos phase transition of the network based on the weight distribution. +This phase transition demonstrates characteristics unique to the FNO, induced +by mode truncation, while also showcasing similarities to those of densely +connected networks. Furthermore, we identify a connection between expressivity +and trainability: the ordered and chaotic phases correspond to regions of +vanishing and exploding gradients, respectively. This finding provides a +practical prerequisite for the stable training of the FNO. Our experimental +results corroborate our theoretical findings. + +
+
+
+
+
+ + ♻ ☆ INT-FlashAttention: Enabling Flash Attention for INT8 Quantization + + +
+ As the foundation of large language models (LLMs), self-attention module +faces the challenge of quadratic time and memory complexity with respect to +sequence length. FlashAttention accelerates attention computation and reduces +its memory usage by leveraging the GPU memory hierarchy. A promising research +direction is to integrate FlashAttention with quantization methods. This paper +introduces INT-FlashAttention, the first INT8 quantization architecture +compatible with the forward workflow of FlashAttention, which significantly +improves the inference speed of FlashAttention on Ampere GPUs. We implement our +INT-FlashAttention prototype with fully INT8 activations and general +matrix-multiplication (GEMM) kernels, making it the first attention operator +with fully INT8 input. As a general token-level post-training quantization +framework, INT-FlashAttention is also compatible with other data formats like +INT4, etc. Experimental results show INT-FlashAttention achieves 72% faster +inference speed and 82% smaller quantization error compared to standard +FlashAttention with FP16 and FP8 data format. + +
+
+
+
+
+ + ♻ ☆ Bivariate DeepKriging for Large-scale Spatial Interpolation of Wind + Fields + + +
+ High spatial resolution wind data are essential for a wide range of +applications in climate, oceanographic and meteorological studies. Large-scale +spatial interpolation or downscaling of bivariate wind fields having velocity +in two dimensions is a challenging task because wind data tend to be +non-Gaussian with high spatial variability and heterogeneity. In spatial +statistics, cokriging is commonly used for predicting bivariate spatial fields. +However, the cokriging predictor is not optimal except for Gaussian processes. +Additionally, cokriging is computationally prohibitive for large datasets. In +this paper, we propose a method, called bivariate DeepKriging, which is a +spatially dependent deep neural network (DNN) with an embedding layer +constructed by spatial radial basis functions for bivariate spatial data +prediction. We then develop a distribution-free uncertainty quantification +method based on bootstrap and ensemble DNN. Our proposed approach outperforms +the traditional cokriging predictor with commonly used covariance functions, +such as the linear model of co-regionalization and flexible bivariate Mat\'ern +covariance. We demonstrate the computational efficiency and scalability of the +proposed DNN model, with computations that are, on average, 20 times faster +than those of conventional techniques. We apply the bivariate DeepKriging +method to the wind data over the Middle East region at 506,771 locations. The +prediction performance of the proposed method is superior over the cokriging +predictors and dramatically reduces computation time. + +
+
+
+
+
+ + ♻ ☆ Learning Variable Compliance Control From a Few Demonstrations for + Bimanual Robot with Haptic Feedback Teleoperation System IROS 2024 + + +
+ Automating dexterous, contact-rich manipulation tasks using rigid robots is a +significant challenge in robotics. Rigid robots, defined by their actuation +through position commands, face issues of excessive contact forces due to their +inability to adapt to contact with the environment, potentially causing damage. +While compliance control schemes have been introduced to mitigate these issues +by controlling forces via external sensors, they are hampered by the need for +fine-tuning task-specific controller parameters. Learning from Demonstrations +(LfD) offers an intuitive alternative, allowing robots to learn manipulations +through observed actions. In this work, we introduce a novel system to enhance +the teaching of dexterous, contact-rich manipulations to rigid robots. Our +system is twofold: firstly, it incorporates a teleoperation interface utilizing +Virtual Reality (VR) controllers, designed to provide an intuitive and +cost-effective method for task demonstration with haptic feedback. Secondly, we +present Comp-ACT (Compliance Control via Action Chunking with Transformers), a +method that leverages the demonstrations to learn variable compliance control +from a few demonstrations. Our methods have been validated across various +complex contact-rich manipulation tasks using single-arm and bimanual robot +setups in simulated and real-world environments, demonstrating the +effectiveness of our system in teaching robots dexterous manipulations with +enhanced adaptability and safety. Code available at: +https://github.com/omron-sinicx/CompACT + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ Decentralised Variational Inference Frameworks for Multi-object Tracking + on Sensor Network + + +
+ This paper tackles the challenge of multi-sensor multi-object tracking by +proposing various decentralised Variational Inference (VI) schemes that match +the tracking performance of centralised sensor fusion with only local message +exchanges among neighboring sensors. We first establish a centralised VI sensor +fusion scheme as a benchmark and analyse the limitations of its decentralised +counterpart, which requires sensors to await consensus at each VI iteration. +Therefore, we propose a decentralised gradient-based VI framework that +optimises the Locally Maximised Evidence Lower Bound (LM-ELBO) instead of the +standard ELBO, which reduces the parameter search space and enables faster +convergence, making it particularly beneficial for decentralised tracking.This +proposed framework is inherently self-evolving, improving with advancements in +decentralised optimisation techniques for convergence guarantees and +efficiency. Further, we enhance the convergence speed of proposed decentralised +schemes using natural gradients and gradient tracking strategies. Results +verify that our decentralised VI schemes are empirically equivalent to +centralised fusion in tracking performance. Notably, the decentralised natural +gradient VI method is the most communication-efficient, with communication +costs comparable to suboptimal decentralised strategies while delivering +notably higher tracking accuracy. + +
+
+
+
+
+ + ♻ ☆ Trust-Region Sequential Quadratic Programming for Stochastic + Optimization with Random Models + + +
+ In this work, we consider solving optimization problems with a stochastic +objective and deterministic equality constraints. We propose a Trust-Region +Sequential Quadratic Programming method to find both first- and second-order +stationary points. Our method utilizes a random model to represent the +objective function, which is constructed from stochastic observations of the +objective and is designed to satisfy proper adaptive accuracy conditions with a +high but fixed probability. To converge to first-order stationary points, our +method computes a gradient step in each iteration defined by minimizing a +quadratic approximation of the objective subject to a (relaxed) linear +approximation of the problem constraints and a trust-region constraint. To +converge to second-order stationary points, our method additionally computes an +eigen step to explore the negative curvature of the reduced Hessian matrix, as +well as a second-order correction step to address the potential Maratos effect, +which arises due to the nonlinearity of the problem constraints. Such an effect +may impede the method from moving away from saddle points. Both gradient and +eigen step computations leverage a novel parameter-free decomposition of the +step and the trust-region radius, accounting for the proportions among the +feasibility residual, optimality residual, and negative curvature. We establish +global almost sure first- and second-order convergence guarantees for our +method, and present computational results on CUTEst problems, regression +problems, and saddle-point problems to demonstrate its superiority over +existing line-search-based stochastic methods. + +
+
+ comment: 41 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ CMamba: Channel Correlation Enhanced State Space Models for Multivariate + Time Series Forecasting + + +
+ Recent advancements in multivariate time series forecasting have been +propelled by Linear-based, Transformer-based, and Convolution-based models, +with Transformer-based architectures gaining prominence for their efficacy in +temporal and cross-channel mixing. More recently, Mamba, a state space model, +has emerged with robust sequence and feature mixing capabilities. However, the +suitability of the vanilla Mamba design for time series forecasting remains an +open question, particularly due to its inadequate handling of cross-channel +dependencies. Capturing cross-channel dependencies is critical in enhancing the +performance of multivariate time series prediction. Recent findings show that +self-attention excels in capturing cross-channel dependencies, whereas other +simpler mechanisms, such as MLP, may degrade model performance. This is +counterintuitive, as MLP, being a learnable architecture, should theoretically +capture both correlations and irrelevances, potentially leading to neutral or +improved performance. Diving into the self-attention mechanism, we attribute +the observed degradation in MLP performance to its lack of data dependence and +global receptive field, which result in MLP's lack of generalization ability. +Based on the above insights, we introduce a refined Mamba variant tailored for +time series forecasting. Our proposed model, \textbf{CMamba}, incorporates a +modified Mamba (M-Mamba) module for temporal dependencies modeling, a global +data-dependent MLP (GDD-MLP) to effectively capture cross-channel dependencies, +and a Channel Mixup mechanism to mitigate overfitting. Comprehensive +experiments conducted on seven real-world datasets demonstrate the efficacy of +our model in improving forecasting performance. + +
+
+
+
+
+ + ♻ ☆ Bayesian Matrix Decomposition and Applications + + +
+ The sole aim of this book is to give a self-contained introduction to +concepts and mathematical tools in Bayesian matrix decomposition in order to +seamlessly introduce matrix decomposition techniques and their applications in +subsequent sections. However, we clearly realize our inability to cover all the +useful and interesting results concerning Bayesian matrix decomposition and +given the paucity of scope to present this discussion, e.g., the separated +analysis of variational inference for conducting the optimization. We refer the +reader to literature in the field of Bayesian analysis for a more detailed +introduction to the related fields. + This book is primarily a summary of purpose, significance of important +Bayesian matrix decomposition methods, e.g., real-valued decomposition, +nonnegative matrix factorization, Bayesian interpolative decomposition, and the +origin and complexity of the methods which shed light on their applications. +The mathematical prerequisite is a first course in statistics and linear +algebra. Other than this modest background, the development is self-contained, +with rigorous proof provided throughout. + +
+
+
+
+
+ + ♻ ☆ Surge Phenomenon in Optimal Learning Rate and Batch Size Scaling + + +
+ In current deep learning tasks, Adam style optimizers such as Adam, Adagrad, +RMSProp, Adafactor, and Lion have been widely used as alternatives to SGD style +optimizers. These optimizers typically update model parameters using the sign +of gradients, resulting in more stable convergence curves. The learning rate +and the batch size are the most critical hyperparameters for optimizers, which +require careful tuning to enable effective convergence. Previous research has +shown that the optimal learning rate increases linearly or follows similar +rules with batch size for SGD style optimizers. However, this conclusion is not +applicable to Adam style optimizers. In this paper, we elucidate the connection +between optimal learning rates and batch sizes for Adam style optimizers +through both theoretical analysis and extensive experiments. First, we raise +the scaling law between batch sizes and optimal learning rates in the sign of +gradient case, in which we prove that the optimal learning rate first rises and +then falls as the batch size increases. Moreover, the peak value of the surge +will gradually move toward the larger batch size as training progresses. +Second, we conducted experiments on various CV and NLP tasks and verified the +correctness of the scaling law. + +
+
+
+
+
+ + ♻ ☆ Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles + Using Latent Space Generative World Models ICRA 2025 + + +
+ We propose the use of latent space generative world models to address the +covariate shift problem in autonomous driving. A world model is a neural +network capable of predicting an agent's next state given past states and +actions. By leveraging a world model during training, the driving policy +effectively mitigates covariate shift without requiring an excessive amount of +training data. During end-to-end training, our policy learns how to recover +from errors by aligning with states observed in human demonstrations, so that +at runtime it can recover from perturbations outside the training distribution. +Additionally, we introduce a novel transformer-based perception encoder that +employs multi-view cross-attention and a learned scene query. We present +qualitative and quantitative results, demonstrating significant improvements +upon prior state of the art in closed-loop testing in the CARLA simulator, as +well as showing the ability to handle perturbations in both CARLA and NVIDIA's +DRIVE Sim. + +
+
+ comment: 7 pages, 6 figures, for ICRA 2025 conference, for associated video + file, see https://youtu.be/fO7RZ57gVxk +
+
+
+
+
+ + ♻ ☆ EDA-DM: Enhanced Distribution Alignment for Post-Training Quantization + of Diffusion Models + + +
+ Diffusion models have achieved great success in image generation tasks +through iterative noise estimation. However, the heavy denoising process and +complex neural networks hinder their low-latency applications in real-world +scenarios. Quantization can effectively reduce model complexity, and +post-training quantization (PTQ), which does not require fine-tuning, is highly +promising for compressing and accelerating diffusion models. Unfortunately, we +find that due to the highly dynamic distribution of activations in different +denoising steps, existing PTQ methods for diffusion models suffer from +distribution mismatch issues at both calibration sample level and +reconstruction output level, which makes the performance far from satisfactory, +especially in low-bit cases. In this paper, we propose Enhanced Distribution +Alignment for Post-Training Quantization of Diffusion Models (EDA-DM) to +address the above issues. Specifically, at the calibration sample level, we +select calibration samples based on the density and variety in the latent +space, thus facilitating the alignment of their distribution with the overall +samples; and at the reconstruction output level, we modify the loss of block +reconstruction with the losses of layers, aligning the outputs of quantized +model and full-precision model at different network granularity. Extensive +experiments demonstrate that EDA-DM significantly outperforms the existing PTQ +methods across various models (DDIM, LDM-4, LDM-8, Stable-Diffusion) and +different datasets (CIFAR-10, LSUN-Bedroom, LSUN-Church, ImageNet, MS-COCO). + +
+
+ comment: Code: http://github.com/BienLuky/EDA-DM +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Revisiting Acoustic Similarity in Emotional Speech and Music via + Self-Supervised Representations + + +
+ Emotion recognition from speech and music shares similarities due to their +acoustic overlap, which has led to interest in transferring knowledge between +these domains. However, the shared acoustic cues between speech and music, +particularly those encoded by Self-Supervised Learning (SSL) models, remain +largely unexplored, given the fact that SSL models for speech and music have +rarely been applied in cross-domain research. In this work, we revisit the +acoustic similarity between emotion speech and music, starting with an analysis +of the layerwise behavior of SSL models for Speech Emotion Recognition (SER) +and Music Emotion Recognition (MER). Furthermore, we perform cross-domain +adaptation by comparing several approaches in a two-stage fine-tuning process, +examining effective ways to utilize music for SER and speech for MER. Lastly, +we explore the acoustic similarities between emotional speech and music using +Frechet audio distance for individual emotions, uncovering the issue of emotion +bias in both speech and music SSL models. Our findings reveal that while speech +and music SSL models do capture shared acoustic features, their behaviors can +vary depending on different emotions due to their training strategies and +domain-specificities. Additionally, parameter-efficient fine-tuning can enhance +SER and MER performance by leveraging knowledge from each other. This study +provides new insights into the acoustic similarity between emotional speech and +music, and highlights the potential for cross-domain generalization to improve +SER and MER systems. + +
+
+
+
+
+ + ☆ A Multimodal Single-Branch Embedding Network for Recommendation in + Cold-Start and Missing Modality Scenarios RecSys '24 + + +
+ Most recommender systems adopt collaborative filtering (CF) and provide +recommendations based on past collective interactions. Therefore, the +performance of CF algorithms degrades when few or no interactions are +available, a scenario referred to as cold-start. To address this issue, +previous work relies on models leveraging both collaborative data and side +information on the users or items. Similar to multimodal learning, these models +aim at combining collaborative and content representations in a shared +embedding space. In this work we propose a novel technique for multimodal +recommendation, relying on a multimodal Single-Branch embedding network for +Recommendation (SiBraR). Leveraging weight-sharing, SiBraR encodes interaction +data as well as multimodal side information using the same single-branch +embedding network on different modalities. This makes SiBraR effective in +scenarios of missing modality, including cold start. Our extensive experiments +on large-scale recommendation datasets from three different recommendation +domains (music, movie, and e-commerce) and providing multimodal content +information (audio, text, image, labels, and interactions) show that SiBraR +significantly outperforms CF as well as state-of-the-art content-based RSs in +cold-start scenarios, and is competitive in warm scenarios. We show that +SiBraR's recommendations are accurate in missing modality scenarios, and that +the model is able to map different modalities to the same region of the shared +embedding space, hence reducing the modality gap. + +
+
+ comment: Accepted at 18th ACM Conference on Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ Modeling the Popularity of Events on Web by Sparsity and + Mutual-Excitation Guided Graph Neural Network + + +
+ The content of a webpage described or posted an event in the cyberspace +inevitably reflects viewpoints, values and trends of the physical society. +Mapping an event on web to the popularity score plays a pivot role to sense the +social trends from the cyberspace. However, the complex semantic correspondence +between texts and images, as well as the implicit text-image-popularity mapping +mechanics pose a significant challenge to this non-trivial task. In this paper, +we address this problem from a viewpoint of understanding the interpretable +mapping mechanics. Concretely, we organize the keywords from different events +into an unified graph. The unified graph facilitates to model the popularity of +events via two-level mappings, i.e., the self excitation and the mutual +excitation. The self-excitation assumes that each keyword forms the popularity +while the mutual-excitation models that two keywords would excite each other to +determine the popularity of an event. Specifically, we use Graph Neural Network +(GNN) as the backbone to model the self-excitation, the mutual excitation and +the context of images into a sparse and deep factor model. Besides, to our best +knowledge, we release a challenge web event dataset for the popularity +prediction task. The experimental results on three public datasets demonstrate +that our method achieves significant improvements and outperforms the +state-of-the-art methods. Dataset is publicly available at: +https://github.com/pangjunbiao/Hot-events-dataset. + +
+
+
+
+
+ + ☆ Subjective and Objective Quality-of-Experience Evaluation Study for Live + Video Streaming + + +
+ In recent years, live video streaming has gained widespread popularity across +various social media platforms. Quality of experience (QoE), which reflects +end-users' satisfaction and overall experience, plays a critical role for media +service providers to optimize large-scale live compression and transmission +strategies to achieve perceptually optimal rate-distortion trade-off. Although +many QoE metrics for video-on-demand (VoD) have been proposed, there remain +significant challenges in developing QoE metrics for live video streaming. To +bridge this gap, we conduct a comprehensive study of subjective and objective +QoE evaluations for live video streaming. For the subjective QoE study, we +introduce the first live video streaming QoE dataset, TaoLive QoE, which +consists of $42$ source videos collected from real live broadcasts and $1,155$ +corresponding distorted ones degraded due to a variety of streaming +distortions, including conventional streaming distortions such as compression, +stalling, as well as live streaming-specific distortions like frame skipping, +variable frame rate, etc. Subsequently, a human study was conducted to derive +subjective QoE scores of videos in the TaoLive QoE dataset. For the objective +QoE study, we benchmark existing QoE models on the TaoLive QoE dataset as well +as publicly available QoE datasets for VoD scenarios, highlighting that current +models struggle to accurately assess video QoE, particularly for live content. +Hence, we propose an end-to-end QoE evaluation model, Tao-QoE, which integrates +multi-scale semantic features and optical flow-based motion features to +predicting a retrospective QoE score, eliminating reliance on statistical +quality of service (QoS) features. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ A Simple but Strong Baseline for Sounding Video Generation: Effective + Adaptation of Audio and Video Diffusion Models for Joint Generation + + +
+ In this work, we build a simple but strong baseline for sounding video +generation. Given base diffusion models for audio and video, we integrate them +with additional modules into a single model and train it to make the model +jointly generate audio and video. To enhance alignment between audio-video +pairs, we introduce two novel mechanisms in our model. The first one is +timestep adjustment, which provides different timestep information to each base +model. It is designed to align how samples are generated along with timesteps +across modalities. The second one is a new design of the additional modules, +termed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE, +cross-modal information is embedded as if it represents temporal position +information, and the embeddings are fed into the model like positional +encoding. Compared with the popular cross-attention mechanism, CMC-PE provides +a better inductive bias for temporal alignment in the generated data. +Experimental results validate the effectiveness of the two newly introduced +mechanisms and also demonstrate that our method outperforms existing methods. + +
+
+ comment: The source code will be released soon +
+
+
+
+
+ + ♻ ☆ Exploring Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Although traditional cameras are commonly applied, their reliability decreases +in scenarios under high dynamic range or heavy motion blur, where event cameras +offer a robust solution. Predominant event-based methods accumulate events into +frames, ignoring the asynchronous and high temporal resolution that is crucial +for distinguishing distinct actions. To address this issue and to unlock the 3D +potential of event information, we introduce two 3D event representations: the +Rasterized Event Point Cloud (RasEPC) and the Decoupled Event Voxel (DEV). The +RasEPC aggregates events within concise temporal slices at identical positions, +preserving their 3D attributes along with statistical information, thereby +significantly reducing memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. Our methods are tested on the DHP19 public dataset, MMHPSD +dataset, and our EV-3DPW dataset, with further qualitative validation via a +derived driving scene dataset EV-JAAD and an outdoor collection vehicle. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + +
+
+ comment: Accepted to Computer Vision and Image Understanding (CVPU). Extended + version of arXiv:2206.04511. The code and dataset are available at + https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Multimodal Fusion via Hypergraph Autoencoder and Contrastive Learning + for Emotion Recognition in Conversation + + +
+ Multimodal emotion recognition in conversation (MERC) seeks to identify the +speakers' emotions expressed in each utterance, offering significant potential +across diverse fields. The challenge of MERC lies in balancing speaker modeling +and context modeling, encompassing both long-distance and short-distance +contexts, as well as addressing the complexity of multimodal information +fusion. Recent research adopts graph-based methods to model intricate +conversational relationships effectively. Nevertheless, the majority of these +methods utilize a fixed fully connected structure to link all utterances, +relying on convolution to interpret complex context. This approach can +inherently heighten the redundancy in contextual messages and excessive graph +network smoothing, particularly in the context of long-distance conversations. +To address this issue, we propose a framework that dynamically adjusts +hypergraph connections by variational hypergraph autoencoder (VHGAE), and +employs contrastive learning to mitigate uncertainty factors during the +reconstruction process. Experimental results demonstrate the effectiveness of +our proposal against the state-of-the-art methods on IEMOCAP and MELD datasets. +We release the code to support the reproducibility of this work at +https://github.com/yzjred/-HAUCL. + +
+
+ comment: Accepted by ACM MULTIMEDIA 2024 +
+
+
+
+
+ + ♻ ☆ Arena: A Patch-of-Interest ViT Inference Acceleration System for + Edge-Assisted Video Analytics + + +
+ The advent of edge computing has made real-time intelligent video analytics +feasible. Previous works, based on traditional model architecture (e.g., CNN, +RNN, etc.), employ various strategies to filter out non-region-of-interest +content to minimize bandwidth and computation consumption but show inferior +performance in adverse environments. Recently, visual foundation models based +on transformers have shown great performance in adverse environments due to +their amazing generalization capability. However, they require a large amount +of computation power, which limits their applications in real-time intelligent +video analytics. In this paper, we find visual foundation models like Vision +Transformer (ViT) also have a dedicated acceleration mechanism for video +analytics. To this end, we introduce Arena, an end-to-end edge-assisted video +inference acceleration system based on ViT. We leverage the capability of ViT +that can be accelerated through token pruning by only offloading and feeding +Patches-of-Interest to the downstream models. Additionally, we design an +adaptive keyframe inference switching algorithm tailored to different videos, +capable of adapting to the current video content to jointly optimize accuracy +and bandwidth. Through extensive experiments, our findings reveal that Arena +can boost inference speeds by up to 1.58\(\times\) and 1.82\(\times\) on +average while consuming only 47\% and 31\% of the bandwidth, respectively, all +with high inference accuracy. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 55 + +
+
+
+ + ☆ HDFlow: Enhancing LLM Complex Problem-Solving with Hybrid Thinking and + Dynamic Workflows + + +
+ Despite recent advancements in large language models (LLMs), their +performance on complex reasoning problems requiring multi-step thinking and +combining various skills is still limited. To address this, we propose a novel +framework HDFlow for complex reasoning with LLMs that combines fast and slow +thinking modes in an adaptive manner. Our approach consists of two key +components: 1) a new approach for slow, deliberate reasoning called Dynamic +Workflow, which automatically decomposes complex problems into more manageable +sub-tasks and dynamically designs a workflow to assemble specialized LLM or +symbolic reasoning tools to solve sub-tasks; 2) Hybrid Thinking, a general +framework that dynamically combines fast and slow thinking based on problem +complexity. Finally, we propose an easy-to-scale method for automatically +synthesizing a large-scale dataset of 27K challenging reasoning problems for +complex reasoning and a hybrid thinking tuning method that trains smaller LLMs +on this dataset to internalize the fast/slow hybrid reasoning strategies. +Experiments on four reasoning benchmark datasets demonstrate that our slow +thinking with dynamic workflows significantly outperforms Chain-of-Thought, and +hybrid thinking achieves the highest accuracy while providing an effective +balance between computational efficiency and performance. Fine-tuning using our +hybrid thinking approach also significantly boosts the complex reasoning +capabilities of open-source language models. The results showcase the promise +of slow thinking, dynamic workflows, and hybrid thinking in expanding the +frontier of complex problem-solving with LLMs\footnote{Code and data will be +released at \url{https://github.com/wenlinyao/HDFlow}.}. + +
+
+ comment: 27 pages, 5 figures +
+
+
+
+
+ + ☆ On Extending Direct Preference Optimization to Accommodate Ties + + +
+ We derive and investigate two DPO variants that explicitly model the +possibility of declaring a tie in pair-wise comparisons. We replace the +Bradley-Terry model in DPO with two well-known modeling extensions, by Rao and +Kupper and by Davidson, that assign probability to ties as alternatives to +clear preferences. Our experiments in neural machine translation and +summarization show that explicitly labeled ties can be added to the datasets +for these DPO variants without the degradation in task performance that is +observed when the same tied pairs are presented to DPO. We find empirically +that the inclusion of ties leads to stronger regularization with respect to the +reference policy as measured by KL divergence, and we see this even for DPO in +its original form. These findings motivate and enable the inclusion of tied +pairs in preference optimization as opposed to simply discarding them. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Discovering the Gems in Early Layers: Accelerating Long-Context LLMs + with 1000x Input Token Reduction + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +handling long context inputs, but this comes at the cost of increased +computational resources and latency. Our research introduces a novel approach +for the long context bottleneck to accelerate LLM inference and reduce GPU +memory consumption. Our research demonstrates that LLMs can identify relevant +tokens in the early layers before generating answers to a query. Leveraging +this insight, we propose an algorithm that uses early layers of an LLM as +filters to select and compress input tokens, significantly reducing the context +length for subsequent processing. Our method, GemFilter, demonstrates +substantial improvements in both speed and memory efficiency compared to +existing techniques, such as standard attention and SnapKV/H2O. Notably, it +achieves a 2.4$\times$ speedup and 30\% reduction in GPU memory usage compared +to SOTA methods. Evaluation on the Needle in a Haystack task shows that +GemFilter significantly outperforms standard attention, SnapKV and demonstrates +comparable performance on the LongBench challenge. GemFilter is simple, +training-free, and broadly applicable across different LLMs. Crucially, it +provides interpretability by allowing humans to inspect the selected input +sequence. These findings not only offer practical benefits for LLM deployment, +but also enhance our understanding of LLM internal mechanisms, paving the way +for further optimizations in LLM design and inference. Our code is available at +\url{https://github.com/SalesforceAIResearch/GemFilter}. + +
+
+
+
+
+ + ☆ Pre-Finetuning with Impact Duration Awareness for Stock Movement + Prediction + + +
+ Understanding the duration of news events' impact on the stock market is +crucial for effective time-series forecasting, yet this facet is largely +overlooked in current research. This paper addresses this research gap by +introducing a novel dataset, the Impact Duration Estimation Dataset (IDED), +specifically designed to estimate impact duration based on investor opinions. +Our research establishes that pre-finetuning language models with IDED can +enhance performance in text-based stock movement predictions. In addition, we +juxtapose our proposed pre-finetuning task with sentiment analysis +pre-finetuning, further affirming the significance of learning impact duration. +Our findings highlight the promise of this novel research direction in stock +movement prediction, offering a new avenue for financial forecasting. We also +provide the IDED and pre-finetuned language models under the CC BY-NC-SA 4.0 +license for academic use, fostering further exploration in this field. + +
+
+ comment: NTCIR-18 FinArg-2 Dataset +
+
+
+
+
+ + ☆ Enhancing Investment Opinion Ranking through Argument-Based Sentiment + Analysis + + +
+ In the era of rapid Internet and social media platform development, +individuals readily share their viewpoints online. The overwhelming quantity of +these posts renders comprehensive analysis impractical. This necessitates an +efficient recommendation system to filter and present significant, relevant +opinions. Our research introduces a dual-pronged argument mining technique to +improve recommendation system effectiveness, considering both professional and +amateur investor perspectives. Our first strategy involves using the +discrepancy between target and closing prices as an opinion indicator. The +second strategy applies argument mining principles to score investors' +opinions, subsequently ranking them by these scores. Experimental results +confirm the effectiveness of our approach, demonstrating its ability to +identify opinions with higher profit potential. Beyond profitability, our +research extends to risk analysis, examining the relationship between +recommended opinions and investor behaviors. This offers a holistic view of +potential outcomes following the adoption of these recommended opinions. + +
+
+
+
+
+ + ☆ From Deception to Detection: The Dual Roles of Large Language Models in + Fake News + + +
+ Fake news poses a significant threat to the integrity of information +ecosystems and public trust. The advent of Large Language Models (LLMs) holds +considerable promise for transforming the battle against fake news. Generally, +LLMs represent a double-edged sword in this struggle. One major concern is that +LLMs can be readily used to craft and disseminate misleading information on a +large scale. This raises the pressing questions: Can LLMs easily generate +biased fake news? Do all LLMs have this capability? Conversely, LLMs offer +valuable prospects for countering fake news, thanks to their extensive +knowledge of the world and robust reasoning capabilities. This leads to other +critical inquiries: Can we use LLMs to detect fake news, and do they outperform +typical detection models? In this paper, we aim to address these pivotal +questions by exploring the performance of various LLMs. Our objective is to +explore the capability of various LLMs in effectively combating fake news, +marking this as the first investigation to analyze seven such models. Our +results reveal that while some models adhere strictly to safety protocols, +refusing to generate biased or misleading content, other models can readily +produce fake news across a spectrum of biases. Additionally, our results show +that larger models generally exhibit superior detection abilities and that +LLM-generated fake news are less likely to be detected than human-written ones. +Finally, our findings demonstrate that users can benefit from LLM-generated +explanations in identifying fake news. + +
+
+
+
+
+ + ☆ Post-hoc Reward Calibration: A Case Study on Length Bias + + +
+ Reinforcement Learning from Human Feedback aligns the outputs of Large +Language Models with human values and preferences. Central to this process is +the reward model (RM), which translates human feedback into training signals +for optimising LLM behaviour. However, RMs can develop biases by exploiting +spurious correlations in their training data, such as favouring outputs based +on length or style rather than true quality. These biases can lead to incorrect +output rankings, sub-optimal model evaluations, and the amplification of +undesirable behaviours in LLMs alignment. This paper addresses the challenge of +correcting such biases without additional data and training, introducing the +concept of Post-hoc Reward Calibration. We first propose an intuitive approach +to estimate the bias term and, thus, remove it to approximate the underlying +true reward. We then extend the approach to a more general and robust form with +the Locally Weighted Regression. Focusing on the prevalent length bias, we +validate our proposed approaches across three experimental settings, +demonstrating consistent improvements: (1) a 3.11 average performance gain +across 33 reward models on the RewardBench dataset; (2) enhanced alignment of +RM rankings with GPT-4 evaluations and human preferences based on the +AlpacaEval benchmark; and (3) improved Length-Controlled win rate of the RLHF +process in multiple LLM--RM combinations. Our method is computationally +efficient and generalisable to other types of bias and RMs, offering a scalable +and robust solution for mitigating biases in LLM alignment. Our code and +results are available at https://github.com/ZeroYuHuang/Reward-Calibration. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Severity Prediction in Mental Health: LLM-based Creation, Analysis, + Evaluation of a Novel Multilingual Dataset + + +
+ Large Language Models (LLMs) are increasingly integrated into various medical +fields, including mental health support systems. However, there is a gap in +research regarding the effectiveness of LLMs in non-English mental health +support applications. To address this problem, we present a novel multilingual +adaptation of widely-used mental health datasets, translated from English into +six languages (Greek, Turkish, French, Portuguese, German, and Finnish). This +dataset enables a comprehensive evaluation of LLM performance in detecting +mental health conditions and assessing their severity across multiple +languages. By experimenting with GPT and Llama, we observe considerable +variability in performance across languages, despite being evaluated on the +same translated dataset. This inconsistency underscores the complexities +inherent in multilingual mental health support, where language-specific nuances +and mental health data coverage can affect the accuracy of the models. Through +comprehensive error analysis, we emphasize the risks of relying exclusively on +large language models (LLMs) in medical settings (e.g., their potential to +contribute to misdiagnoses). Moreover, our proposed approach offers significant +cost savings for multilingual tasks, presenting a major advantage for +broad-scale implementation. + +
+
+
+
+
+ + ☆ Scaling Behavior for Large Language Models regarding Numeral Systems: An + Example using Pythia EMNLP 2024 + + +
+ Though Large Language Models (LLMs) have shown remarkable abilities in +mathematics reasoning, they are still struggling with performing numeric +operations accurately, such as addition and multiplication. Numbers can be +tokenized into tokens in various ways by different LLMs and affect the numeric +operations performance. Currently, there are two representatives: 1) Tokenize +into $1$-digit, and 2) Tokenize into $1\sim 3$ digit. The difference is roughly +equivalent to using different numeral systems (namely base $10$ or base +$10^{3}$). In light of this, we study the scaling behavior of different numeral +systems in the context of transformer-based large language models. We +empirically show that a base $10$ system is consistently more data-efficient +than a base $10^{2}$ or $10^{3}$ system across training data scale, model sizes +under from-scratch training settings, while different number systems have very +similar fine-tuning performances. We attribute this to higher token frequencies +of a base $10$ system. Additionally, we reveal extrapolation behavior patterns +on addition and multiplication. We identify that base $100$ and base $1000$ +systems struggle on token-level discernment and token-level operations. We also +sheds light on the mechanism learnt by the models. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ☆ data2lang2vec: Data Driven Typological Features Completion + + +
+ Language typology databases enhance multi-lingual Natural Language Processing +(NLP) by improving model adaptability to diverse linguistic structures. The +widely-used lang2vec toolkit integrates several such databases, but its +coverage remains limited at 28.9\%. Previous work on automatically increasing +coverage predicts missing values based on features from other languages or +focuses on single features, we propose to use textual data for better-informed +feature prediction. To this end, we introduce a multi-lingual Part-of-Speech +(POS) tagger, achieving over 70\% accuracy across 1,749 languages, and +experiment with external statistical features and a variety of machine learning +algorithms. We also introduce a more realistic evaluation setup, focusing on +likely to be missing typology features, and show that our approach outperforms +previous work in both setups. + +
+
+ comment: 9 pages, 11 figures +
+
+
+
+
+ + ☆ Internalizing ASR with Implicit Chain of Thought for Efficient + Speech-to-Speech Conversational LLM + + +
+ Current speech-based LLMs are predominantly trained on extensive ASR and TTS +datasets, excelling in tasks related to these domains. However, their ability +to handle direct speech-to-speech conversations remains notably constrained. +These models often rely on an ASR-to-TTS chain-of-thought pipeline, converting +speech into text for processing before generating audio responses, which +introduces latency and loses audio features. We propose a method that +implicitly internalizes ASR chain of thought into a speech LLM, enhancing its +native speech understanding capabilities. Our approach reduces latency and +improves the model's native understanding of speech, paving the way for more +efficient and natural real-time audio interactions. We also release a +large-scale synthetic conversational dataset to facilitate further research. + +
+
+
+
+
+ + ☆ How Transliterations Improve Crosslingual Alignment + + +
+ Recent studies have shown that post-aligning multilingual pretrained language +models (mPLMs) using alignment objectives on both original and transliterated +data can improve crosslingual alignment. This improvement further leads to +better crosslingual transfer performance. However, it remains unclear how and +why a better crosslingual alignment is achieved, as this technique only +involves transliterations, and does not use any parallel data. This paper +attempts to explicitly evaluate the crosslingual alignment and identify the key +elements in transliteration-based approaches that contribute to better +performance. For this, we train multiple models under varying setups for two +pairs of related languages: (1) Polish and Ukrainian and (2) Hindi and Urdu. To +assess alignment, we define four types of similarities based on sentence +representations. Our experiments show that adding transliterations alone +improves the overall similarities, even for random sentence pairs. With the +help of auxiliary alignment objectives, especially the contrastive objective, +the model learns to distinguish matched from random pairs, leading to better +alignments. However, we also show that better alignment does not always yield +better downstream performance, suggesting that further research is needed to +clarify the connection between alignment and performance. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Navigating the Nuances: A Fine-grained Evaluation of Vision-Language + Navigation EMNLP 2024 + + +
+ This study presents a novel evaluation framework for the Vision-Language +Navigation (VLN) task. It aims to diagnose current models for various +instruction categories at a finer-grained level. The framework is structured +around the context-free grammar (CFG) of the task. The CFG serves as the basis +for the problem decomposition and the core premise of the instruction +categories design. We propose a semi-automatic method for CFG construction with +the help of Large-Language Models (LLMs). Then, we induct and generate data +spanning five principal instruction categories (i.e. direction change, landmark +recognition, region recognition, vertical movement, and numerical +comprehension). Our analysis of different models reveals notable performance +discrepancies and recurrent issues. The stagnation of numerical comprehension, +heavy selective biases over directional concepts, and other interesting +findings contribute to the development of future language-guided navigation +systems. + +
+
+ comment: EMNLP 2024 Findings; project page: + https://zehao-wang.github.io/navnuances +
+
+
+
+
+ + ☆ BabyLlama-2: Ensemble-Distilled Models Consistently Outperform Teachers + With Limited Data CoNLL 2024 + + +
+ We present BabyLlama-2, a 345 million parameter model distillation-pretrained +from two teachers on a 10 million word corpus for the BabyLM competition. On +BLiMP and SuperGLUE benchmarks, BabyLlama-2 outperforms baselines trained on +both 10 and 100 million word datasets with the same data mix, as well as its +teacher models. Through an extensive hyperparameter sweep, we demonstrate that +the advantages of distillation cannot be attributed to suboptimal +hyperparameter selection of the teachers. Our findings underscore the need for +further investigation into distillation techniques, particularly in +data-limited settings. + +
+
+ comment: 9 pages, 3 figures, 5 tables, submitted to the BabyLM Challenge + (CoNLL 2024 Shared Task) +
+
+
+
+
+ + ☆ Proof of Thought : Neurosymbolic Program Synthesis allows Robust and + Interpretable Reasoning + + +
+ Large Language Models (LLMs) have revolutionized natural language processing, +yet they struggle with inconsistent reasoning, particularly in novel domains +and complex logical sequences. This research introduces Proof of Thought, a +framework that enhances the reliability and transparency of LLM outputs. Our +approach bridges LLM-generated ideas with formal logic verification, employing +a custom interpreter to convert LLM outputs into First Order Logic constructs +for theorem prover scrutiny. Central to our method is an intermediary +JSON-based Domain-Specific Language, which by design balances precise logical +structures with intuitive human concepts. This hybrid representation enables +both rigorous validation and accessible human comprehension of LLM reasoning +processes. Key contributions include a robust type system with sort management +for enhanced logical integrity, explicit representation of rules for clear +distinction between factual and inferential knowledge, and a flexible +architecture that allows for easy extension to various domain-specific +applications. We demonstrate Proof of Thought's effectiveness through +benchmarking on StrategyQA and a novel multimodal reasoning task, showing +improved performance in open-ended scenarios. By providing verifiable and +interpretable results, our technique addresses critical needs for AI system +accountability and sets a foundation for human-in-the-loop oversight in +high-stakes domains. + +
+
+
+
+
+ + ☆ Molmo and PixMo: Open Weights and Open Data for State-of-the-Art + Multimodal Models + + +
+ Today's most advanced multimodal models remain proprietary. The strongest +open-weight models rely heavily on synthetic data from proprietary VLMs to +achieve good performance, effectively distilling these closed models into open +ones. As a result, the community is still missing foundational knowledge about +how to build performant VLMs from scratch. We present Molmo, a new family of +VLMs that are state-of-the-art in their class of openness. Our key innovation +is a novel, highly detailed image caption dataset collected entirely from human +annotators using speech-based descriptions. To enable a wide array of user +interactions, we also introduce a diverse dataset mixture for fine-tuning that +includes in-the-wild Q&A and innovative 2D pointing data. The success of our +approach relies on careful choices for the model architecture details, a +well-tuned training pipeline, and, most critically, the quality of our newly +collected datasets, all of which will be released. The best-in-class 72B model +within the Molmo family not only outperforms others in the class of open weight +and data models but also compares favorably against proprietary systems like +GPT-4o, Claude 3.5, and Gemini 1.5 on both academic benchmarks and human +evaluation. + We will be releasing all of our model weights, captioning and fine-tuning +data, and source code in the near future. Select model weights, inference code, +and demo are available at https://molmo.allenai.org. + +
+
+
+
+
+ + ☆ FineZip : Pushing the Limits of Large Language Models for Practical + Lossless Text Compression + + +
+ While the language modeling objective has been shown to be deeply connected +with compression, it is surprising that modern LLMs are not employed in +practical text compression systems. In this paper, we provide an in-depth +analysis of neural network and transformer-based compression techniques to +answer this question. We compare traditional text compression systems with +neural network and LLM-based text compression methods. Although LLM-based +systems significantly outperform conventional compression methods, they are +highly impractical. Specifically, LLMZip, a recent text compression system +using Llama3-8B requires 9.5 days to compress just 10 MB of text, although with +huge improvements in compression ratios. To overcome this, we present FineZip - +a novel LLM-based text compression system that combines ideas of online +memorization and dynamic context to reduce the compression time immensely. +FineZip can compress the above corpus in approximately 4 hours compared to 9.5 +days, a 54 times improvement over LLMZip and comparable performance. FineZip +outperforms traditional algorithmic compression methods with a large margin, +improving compression ratios by approximately 50\%. With this work, we take the +first step towards making lossless text compression with LLMs a reality. While +FineZip presents a significant step in that direction, LLMs are still not a +viable solution for large-scale text compression. We hope our work paves the +way for future research and innovation to solve this problem. + +
+
+
+
+
+ + ☆ Assessing the Level of Toxicity Against Distinct Groups in Bangla Social + Media Comments: A Comprehensive Investigation + + +
+ Social media platforms have a vital role in the modern world, serving as +conduits for communication, the exchange of ideas, and the establishment of +networks. However, the misuse of these platforms through toxic comments, which +can range from offensive remarks to hate speech, is a concerning issue. This +study focuses on identifying toxic comments in the Bengali language targeting +three specific groups: transgender people, indigenous people, and migrant +people, from multiple social media sources. The study delves into the intricate +process of identifying and categorizing toxic language while considering the +varying degrees of toxicity: high, medium, and low. The methodology involves +creating a dataset, manual annotation, and employing pre-trained transformer +models like Bangla-BERT, bangla-bert-base, distil-BERT, and +Bert-base-multilingual-cased for classification. Diverse assessment metrics +such as accuracy, recall, precision, and F1-score are employed to evaluate the +model's effectiveness. The experimental findings reveal that Bangla-BERT +surpasses alternative models, achieving an F1-score of 0.8903. This research +exposes the complexity of toxicity in Bangla social media dialogues, revealing +its differing impacts on diverse demographic groups. + +
+
+ comment: Accepted for publication in "18th International Conference on + Information Technology and Applications (ICITA 2024)" +
+
+
+
+
+ + ☆ Plurals: A System for Guiding LLMs Via Simulated Social Ensembles + + +
+ Recent debates raised concerns that language models may favor certain +viewpoints. But what if the solution is not to aim for a 'view from nowhere' +but rather to leverage different viewpoints? We introduce Plurals, a system and +Python library for pluralistic AI deliberation. Plurals consists of Agents +(LLMs, optionally with personas) which deliberate within customizable +Structures, with Moderators overseeing deliberation. Plurals is a generator of +simulated social ensembles. Plurals integrates with government datasets to +create nationally representative personas, includes deliberation templates +inspired by democratic deliberation theory, and allows users to customize both +information-sharing structures and deliberation behavior within Structures. Six +case studies demonstrate fidelity to theoretical constructs and efficacy. Three +randomized experiments show simulated focus groups produced output resonant +with an online sample of the relevant audiences (chosen over zero-shot +generation in 75% of trials). Plurals is both a paradigm and a concrete system +for pluralistic AI. The Plurals library is available at +https://github.com/josh-ashkinaze/plurals and will be continually updated. + +
+
+
+
+
+ + ☆ Deep Learning and Machine Learning, Advancing Big Data Analytics and + Management: Handy Appetizer + + +
+ This book explores the role of Artificial Intelligence (AI), Machine Learning +(ML), and Deep Learning (DL) in driving the progress of big data analytics and +management. The book focuses on simplifying the complex mathematical concepts +behind deep learning, offering intuitive visualizations and practical case +studies to help readers understand how neural networks and technologies like +Convolutional Neural Networks (CNNs) work. It introduces several classic models +and technologies such as Transformers, GPT, ResNet, BERT, and YOLO, +highlighting their applications in fields like natural language processing, +image recognition, and autonomous driving. The book also emphasizes the +importance of pre-trained models and how they can enhance model performance and +accuracy, with instructions on how to apply these models in various real-world +scenarios. Additionally, it provides an overview of key big data management +technologies like SQL and NoSQL databases, as well as distributed computing +frameworks such as Apache Hadoop and Spark, explaining their importance in +managing and processing vast amounts of data. Ultimately, the book underscores +the value of mastering deep learning and big data management skills as critical +tools for the future workforce, making it an essential resource for both +beginners and experienced professionals. + +
+
+ comment: This book contains 93 pages and 60 figures +
+
+
+
+
+ + ☆ Programming Every Example: Lifting Pre-training Data Quality like + Experts at Scale + + +
+ Large language model pre-training has traditionally relied on human experts +to craft heuristics for improving the corpora quality, resulting in numerous +rules developed to date. However, these rules lack the flexibility to address +the unique characteristics of individual example effectively. Meanwhile, +applying tailored rules to every example is impractical for human experts. In +this paper, we demonstrate that even small language models, with as few as 0.3B +parameters, can exhibit substantial data refining capabilities comparable to +those of human experts. We introduce Programming Every Example (ProX), a novel +framework that treats data refinement as a programming task, enabling models to +refine corpora by generating and executing fine-grained operations, such as +string normalization, for each individual example at scale. Experimental +results show that models pre-trained on ProX-curated data outperform either +original data or data filtered by other selection methods by more than 2% +across various downstream benchmarks. Its effectiveness spans various model +sizes and pre-training corpora, including C4, RedPajama-V2, and FineWeb. +Furthermore, ProX exhibits significant potential in domain-specific continual +pre-training: without domain specific design, models trained on OpenWebMath +refined by ProX outperform human-crafted rule-based methods, improving average +accuracy by 7.6% over Mistral-7B, with 14.6% for Llama-2-7B and 20.3% for +CodeLlama-7B, all within 10B tokens to be comparable to models like Llemma-7B +trained on 200B tokens. Further analysis highlights that ProX significantly +saves training FLOPs, offering a promising path for efficient LLM +pre-training.We are open-sourcing ProX with >100B corpus, models, and sharing +all training and implementation details for reproducible research and future +innovation. Code: https://github.com/GAIR-NLP/ProX + +
+
+ comment: 45 pages, 13 figures, 34 tables +
+
+
+
+
+ + ☆ Can Vision Language Models Learn from Visual Demonstrations of Ambiguous + Spatial Reasoning? + + +
+ Large vision-language models (VLMs) have become state-of-the-art for many +computer vision tasks, with in-context learning (ICL) as a popular adaptation +strategy for new ones. But can VLMs learn novel concepts purely from visual +demonstrations, or are they limited to adapting to the output format of ICL +examples? We propose a new benchmark we call Spatial Visual Ambiguity Tasks +(SVAT) that challenges state-of-the-art VLMs to learn new visuospatial tasks +in-context. We find that VLMs fail to do this zero-shot, and sometimes continue +to fail after finetuning. However, adding simpler data to the training by +curriculum learning leads to improved ICL performance. + +
+
+ comment: 13 pages, 4 figures. Code released at + https://github.com/groundlight/vlm-visual-demonstrations +
+
+
+
+
+ + ☆ Enhancing Post-Hoc Attributions in Long Document Comprehension via + Coarse Grained Answer Decomposition + + +
+ Accurately attributing answer text to its source document is crucial for +developing a reliable question-answering system. However, attribution for long +documents remains largely unexplored. Post-hoc attribution systems are designed +to map answer text back to the source document, yet the granularity of this +mapping has not been addressed. Furthermore, a critical question arises: What +precisely should be attributed, with an emphasis on identifying the information +units within an answer that necessitate grounding? In this paper, we propose +and investigate a novel approach to the factual decomposition of generated +answers for attribution, employing template-based in-context learning. To +accomplish this, we utilize the question and integrate negative sampling during +few-shot in-context learning for decomposition. This approach enhances the +semantic understanding of both abstractive and extractive answers. We examine +the impact of answer decomposition by providing a thorough examination of +various attribution approaches, ranging from retrieval-based techniques to +LLM-based attributors. + +
+
+
+
+
+ + ☆ Using LLM for Real-Time Transcription and Summarization of + Doctor-Patient Interactions into ePuskesmas in Indonesia + + +
+ One of the key issues contributing to inefficiency in Puskesmas is the +time-consuming nature of doctor-patient interactions. Doctors need to conduct +thorough consultations, which include diagnosing the patient's condition, +providing treatment advice, and transcribing detailed notes into medical +records. In regions with diverse linguistic backgrounds, doctors often have to +ask clarifying questions, further prolonging the process. While diagnosing is +essential, transcription and summarization can often be automated using AI to +improve time efficiency and help doctors enhance care quality and enable early +diagnosis and intervention. This paper proposes a solution using a localized +large language model (LLM) to transcribe, translate, and summarize +doctor-patient conversations. We utilize the Whisper model for transcription +and GPT-3 to summarize them into the ePuskemas medical records format. This +system is implemented as an add-on to an existing web browser extension, +allowing doctors to fill out patient forms while talking. By leveraging this +solution for real-time transcription, translation, and summarization, doctors +can improve the turnaround time for patient care while enhancing the quality of +records, which become more detailed and insightful for future visits. This +innovation addresses challenges like overcrowded facilities and the +administrative burden on healthcare providers in Indonesia. We believe this +solution will help doctors save time, provide better care, and produce more +accurate medical records, representing a significant step toward modernizing +healthcare and ensuring patients receive timely, high-quality care, even in +resource-constrained settings. + +
+
+
+
+
+ + ☆ Detecting Temporal Ambiguity in Questions EMNLP 2024 + + +
+ Detecting and answering ambiguous questions has been a challenging task in +open-domain question answering. Ambiguous questions have different answers +depending on their interpretation and can take diverse forms. Temporally +ambiguous questions are one of the most common types of such questions. In this +paper, we introduce TEMPAMBIQA, a manually annotated temporally ambiguous QA +dataset consisting of 8,162 open-domain questions derived from existing +datasets. Our annotations focus on capturing temporal ambiguity to study the +task of detecting temporally ambiguous questions. We propose a novel approach +by using diverse search strategies based on disambiguated versions of the +questions. We also introduce and test non-search, competitive baselines for +detecting temporal ambiguity using zero-shot and few-shot approaches. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ How to Connect Speech Foundation Models and Large Language Models? What + Matters and What Does Not + + +
+ The remarkable performance achieved by Large Language Models (LLM) has driven +research efforts to leverage them for a wide range of tasks and input +modalities. In speech-to-text (S2T) tasks, the emerging solution consists of +projecting the output of the encoder of a Speech Foundational Model (SFM) into +the LLM embedding space through an adapter module. However, no work has yet +investigated how much the downstream-task performance depends on each component +(SFM, adapter, LLM) nor whether the best design of the adapter depends on the +chosen SFM and LLM. To fill this gap, we evaluate the combination of 5 adapter +modules, 2 LLMs (Mistral and Llama), and 2 SFMs (Whisper and SeamlessM4T) on +two widespread S2T tasks, namely Automatic Speech Recognition and Speech +Translation. Our results demonstrate that the SFM plays a pivotal role in +downstream performance, while the adapter choice has moderate impact and +depends on the SFM and LLM. + +
+
+
+
+
+ + ☆ Counterfactual Token Generation in Large Language Models + + +
+ "Sure, I am happy to generate a story for you: Captain Lyra stood at the helm +of her trusty ship, the Maelstrom's Fury, gazing out at the endless sea. [...] +Lyra's eyes welled up with tears as she realized the bitter truth - she had +sacrificed everything for fleeting riches, and lost the love of her crew, her +family, and herself." Although this story, generated by a large language model, +is captivating, one may wonder -- how would the story have unfolded if the +model had chosen "Captain Maeve" as the protagonist instead? We cannot know. +State-of-the-art large language models are stateless -- they maintain no +internal memory or state. Given a prompt, they generate a sequence of tokens as +an output using an autoregressive process. As a consequence, they cannot reason +about counterfactual alternatives to tokens they have generated in the past. In +this work, our goal is to enhance them with this functionality. To this end, we +develop a causal model of token generation that builds upon the Gumbel-Max +structural causal model. Our model allows any large language model to perform +counterfactual token generation at almost no cost in comparison with vanilla +token generation, it is embarrassingly simple to implement, and it does not +require any fine-tuning nor prompt engineering. We implement our model on Llama +3 8B-instruct and conduct both qualitative and quantitative analyses of +counterfactually generated text. We conclude with a demonstrative application +of counterfactual token generation for bias detection, unveiling interesting +insights about the model of the world constructed by large language models. + +
+
+
+
+
+ + ☆ LLM-CARD: Towards a Description and Landscape of Large Language Models + + +
+ With the rapid growth of the Natural Language Processing (NLP) field, a vast +variety of Large Language Models (LLMs) continue to emerge for diverse NLP +tasks. As an increasing number of papers are presented, researchers and +developers face the challenge of information overload. Thus, it is particularly +important to develop a system that can automatically extract and organise key +information about LLMs from academic papers (\textbf{LLM model card}). This +work is to develop such a pioneer system by using Named Entity Recognition +(\textbf{NER}) and Relation Extraction (\textbf{RE}) methods that automatically +extract key information about large language models from the papers, helping +researchers to efficiently access information about LLMs. These features +include model \textit{licence}, model \textit{name}, and model +\textit{application}. With these features, we can form a model card for each +paper. \textbf{Data-contribution} wise, 106 academic papers were processed by +defining three dictionaries - LLMs name, licence, and application. 11,051 +sentences were extracted through dictionary lookup, and the dataset was +constructed through manual review of the final selection of 129 sentences that +have a link between the name and the licence, and 106 sentences that have a +link between the model name and the application. + +
+
+ comment: ongoing work, 16 pages +
+
+
+
+
+ + ☆ Models Can and Should Embrace the Communicative Nature of + Human-Generated Math + + +
+ Math is constructed by people for people: just as natural language corpora +reflect not just propositions but the communicative goals of language users, +the math data that models are trained on reflects not just idealized +mathematical entities but rich communicative intentions. While there are +important advantages to treating math in a purely symbolic manner, we here +hypothesize that there are benefits to treating math as situated linguistic +communication and that language models are well suited for this goal, in ways +that are not fully appreciated. We illustrate these points with two case +studies. First, we ran an experiment in which we found that language models +interpret the equals sign in a humanlike way -- generating systematically +different word problems for the same underlying equation arranged in different +ways. Second, we found that language models prefer proofs to be ordered in +naturalistic ways, even though other orders would be logically equivalent. We +advocate for AI systems that learn from and represent the communicative +intentions latent in human-generated math. + +
+
+
+
+
+ + ☆ AXCEL: Automated eXplainable Consistency Evaluation using LLMs + + +
+ Large Language Models (LLMs) are widely used in both industry and academia +for various tasks, yet evaluating the consistency of generated text responses +continues to be a challenge. Traditional metrics like ROUGE and BLEU show a +weak correlation with human judgment. More sophisticated metrics using Natural +Language Inference (NLI) have shown improved correlations but are complex to +implement, require domain-specific training due to poor cross-domain +generalization, and lack explainability. More recently, prompt-based metrics +using LLMs as evaluators have emerged; while they are easier to implement, they +still lack explainability and depend on task-specific prompts, which limits +their generalizability. This work introduces Automated eXplainable Consistency +Evaluation using LLMs (AXCEL), a prompt-based consistency metric which offers +explanations for the consistency scores by providing detailed reasoning and +pinpointing inconsistent text spans. AXCEL is also a generalizable metric which +can be adopted to multiple tasks without changing the prompt. AXCEL outperforms +both non-prompt and prompt-based state-of-the-art (SOTA) metrics in detecting +inconsistencies across summarization by 8.7%, free text generation by 6.2%, and +data-to-text conversion tasks by 29.4%. We also evaluate the influence of +underlying LLMs on prompt based metric performance and recalibrate the SOTA +prompt-based metrics with the latest LLMs for fair comparison. Further, we show +that AXCEL demonstrates strong performance using open source LLMs. + +
+
+
+
+
+ + ☆ Decoding Large-Language Models: A Systematic Overview of Socio-Technical + Impacts, Constraints, and Emerging Questions + + +
+ There have been rapid advancements in the capabilities of large language +models (LLMs) in recent years, greatly revolutionizing the field of natural +language processing (NLP) and artificial intelligence (AI) to understand and +interact with human language. Therefore, in this work, we conduct a systematic +investigation of the literature to identify the prominent themes and directions +of LLM developments, impacts, and limitations. Our findings illustrate the +aims, methodologies, limitations, and future directions of LLM research. It +includes responsible development considerations, algorithmic improvements, +ethical challenges, and societal implications of LLM development. Overall, this +paper provides a rigorous and comprehensive overview of current research in LLM +and identifies potential directions for future development. The article +highlights the application areas that could have a positive impact on society +along with the ethical considerations. + +
+
+ comment: 28 pages, 5 figures, preprint submitted to journal +
+
+
+
+
+ + ☆ Adaptive Self-Supervised Learning Strategies for Dynamic On-Device LLM + Personalization + + +
+ Large language models (LLMs) have revolutionized how we interact with +technology, but their personalization to individual user preferences remains a +significant challenge, particularly in on-device applications. Traditional +methods often depend heavily on labeled datasets and can be resource-intensive. +To address these issues, we present Adaptive Self-Supervised Learning +Strategies (ASLS), which utilizes self-supervised learning techniques to +personalize LLMs dynamically. The framework comprises a user profiling layer +for collecting interaction data and a neural adaptation layer for real-time +model fine-tuning. This innovative approach enables continuous learning from +user feedback, allowing the model to generate responses that align closely with +user-specific contexts. The adaptive mechanisms of ASLS minimize computational +demands and enhance personalization efficiency. Experimental results across +various user scenarios illustrate the superior performance of ASLS in boosting +user engagement and satisfaction, highlighting its potential to redefine LLMs +as highly responsive and context-aware systems on-device. + +
+
+ comment: First ASLS +
+
+
+
+
+ + ☆ Weighted Cross-entropy for Low-Resource Languages in Multilingual Speech + Recognition + + +
+ This paper addresses the challenge of integrating low-resource languages into +multilingual automatic speech recognition (ASR) systems. We introduce a novel +application of weighted cross-entropy, typically used for unbalanced datasets, +to facilitate the integration of low-resource languages into pre-trained +multilingual ASR models within the context of continual multilingual learning. +We fine-tune the Whisper multilingual ASR model on five high-resource languages +and one low-resource language, employing language-weighted dynamic +cross-entropy and data augmentation. The results show a remarkable 6.69% word +error rate (WER) reduction for the low-resource language compared to the +fine-tuned model without applying our approach, and a 48.86% WER reduction +compared to the original Whisper model. In addition, our approach yields an +average WER reduction of 3.29% across the six languages, showing no degradation +for the high-resource languages. + +
+
+ comment: 5 pages, 1 figure. Presented at Interspeech 2024 +
+
+
+
+
+ + ☆ Semi-Supervised Cognitive State Classification from Speech with + Multi-View Pseudo-Labeling + + +
+ The lack of labeled data is a common challenge in speech classification +tasks, particularly those requiring extensive subjective assessment, such as +cognitive state classification. In this work, we propose a Semi-Supervised +Learning (SSL) framework, introducing a novel multi-view pseudo-labeling method +that leverages both acoustic and linguistic characteristics to select the most +confident data for training the classification model. Acoustically, unlabeled +data are compared to labeled data using the Frechet audio distance, calculated +from embeddings generated by multiple audio encoders. Linguistically, large +language models are prompted to revise automatic speech recognition +transcriptions and predict labels based on our proposed task-specific +knowledge. High-confidence data are identified when pseudo-labels from both +sources align, while mismatches are treated as low-confidence data. A bimodal +classifier is then trained to iteratively label the low-confidence data until a +predefined criterion is met. We evaluate our SSL framework on emotion +recognition and dementia detection tasks. Experimental results demonstrate that +our method achieves competitive performance compared to fully supervised +learning using only 30% of the labeled data and significantly outperforms two +selected baselines. + +
+
+
+
+
+ + ☆ Cross-lingual Speech Emotion Recognition: Humans vs. Self-Supervised + Models + + +
+ Utilizing Self-Supervised Learning (SSL) models for Speech Emotion +Recognition (SER) has proven effective, yet limited research has explored +cross-lingual scenarios. This study presents a comparative analysis between +human performance and SSL models, beginning with a layer-wise analysis and an +exploration of parameter-efficient fine-tuning strategies in monolingual, +cross-lingual, and transfer learning contexts. We further compare the SER +ability of models and humans at both utterance- and segment-levels. +Additionally, we investigate the impact of dialect on cross-lingual SER through +human evaluation. Our findings reveal that models, with appropriate knowledge +transfer, can adapt to the target language and achieve performance comparable +to native speakers. We also demonstrate the significant effect of dialect on +SER for individuals without prior linguistic and paralinguistic background. +Moreover, both humans and models exhibit distinct behaviors across different +emotions. These results offer new insights into the cross-lingual SER +capabilities of SSL models, underscoring both their similarities to and +differences from human emotion perception. + +
+
+
+
+
+ + ☆ Zero-Shot Detection of LLM-Generated Text using Token Cohesiveness EMNLP 2024 + + +
+ The increasing capability and widespread usage of large language models +(LLMs) highlight the desirability of automatic detection of LLM-generated text. +Zero-shot detectors, due to their training-free nature, have received +considerable attention and notable success. In this paper, we identify a new +feature, token cohesiveness, that is useful for zero-shot detection, and we +demonstrate that LLM-generated text tends to exhibit higher token cohesiveness +than human-written text. Based on this observation, we devise TOCSIN, a generic +dual-channel detection paradigm that uses token cohesiveness as a plug-and-play +module to improve existing zero-shot detectors. To calculate token +cohesiveness, TOCSIN only requires a few rounds of random token deletion and +semantic difference measurement, making it particularly suitable for a +practical black-box setting where the source model used for generation is not +accessible. Extensive experiments with four state-of-the-art base detectors on +various datasets, source models, and evaluation settings demonstrate the +effectiveness and generality of the proposed approach. Code available at: +\url{https://github.com/Shixuan-Ma/TOCSIN}. + +
+
+ comment: To appear at the main conference of EMNLP 2024 +
+
+
+
+
+ + ☆ Pruning Multilingual Large Language Models for Multilingual Inference EMNLP 2024 + + +
+ Multilingual large language models (MLLMs), trained on multilingual balanced +data, demonstrate better zero-shot learning performance in non-English +languages compared to large language models trained on English-dominant data. +However, the disparity in performance between English and non-English languages +remains a challenge yet to be fully addressed. A distinctive characteristic of +MLLMs is their high-quality translation capabilities, indicating an acquired +proficiency in aligning between languages. This study explores how to enhance +the zero-shot performance of MLLMs in non-English languages by leveraging their +alignment capability between English and non-English languages. To achieve +this, we first analyze the behavior of MLLMs when performing translation and +reveal that there are large magnitude features that play a critical role in the +translation process. Inspired by these findings, we retain the weights +associated with operations involving the large magnitude features and prune +other weights to force MLLMs to rely on these features for tasks beyond +translation. We empirically demonstrate that this pruning strategy can enhance +the MLLMs' performance in non-English language. + +
+
+ comment: Accepted at EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Enhancing Temporal Sensitivity and Reasoning for Time-Sensitive Question + Answering EMNLP 2024 + + +
+ Time-Sensitive Question Answering (TSQA) demands the effective utilization of +specific temporal contexts, encompassing multiple time-evolving facts, to +address time-sensitive questions. This necessitates not only the parsing of +temporal information within questions but also the identification and +understanding of time-evolving facts to generate accurate answers. However, +current large language models still have limited sensitivity to temporal +information and their inadequate temporal reasoning capabilities.In this paper, +we propose a novel framework that enhances temporal awareness and reasoning +through Temporal Information-Aware Embedding and Granular Contrastive +Reinforcement Learning. Experimental results on four TSQA datasets demonstrate +that our framework significantly outperforms existing LLMs in TSQA tasks, +marking a step forward in bridging the performance gap between machine and +human temporal understanding and reasoning. + +
+
+ comment: Accepted by EMNLP 2024 Findings +
+
+
+
+
+ + ☆ A Roadmap for Embodied and Social Grounding in LLMs + + +
+ The fusion of Large Language Models (LLMs) and robotic systems has led to a +transformative paradigm in the robotic field, offering unparalleled +capabilities not only in the communication domain but also in skills like +multimodal input handling, high-level reasoning, and plan generation. The +grounding of LLMs knowledge into the empirical world has been considered a +crucial pathway to exploit the efficiency of LLMs in robotics. Nevertheless, +connecting LLMs' representations to the external world with multimodal +approaches or with robots' bodies is not enough to let them understand the +meaning of the language they are manipulating. Taking inspiration from humans, +this work draws attention to three necessary elements for an agent to grasp and +experience the world. The roadmap for LLMs grounding is envisaged in an active +bodily system as the reference point for experiencing the environment, a +temporally structured experience for a coherent, self-related interaction with +the external world, and social skills to acquire a common-grounded shared +experience. + +
+
+ comment: Accepted Version of a conference paper presented at Robophilosophy + Conference 2024 +
+
+
+
+
+ + ♻ ☆ RLHFuse: Efficient RLHF Training for Large Language Models with Inter- + and Intra-Stage Fusion + + +
+ Reinforcement Learning from Human Feedback (RLHF) enhances the alignment +between LLMs and human preference. The workflow of RLHF typically involves +several models and tasks in a series of distinct stages. Existing RLHF training +systems view each task as the smallest execution unit thus overlooking the +opportunities for subtask-level optimizations. Due to the intrinsic nature of +RLHF training, i.e., the data skewness in the generation stage, and the +pipeline bubbles in the training stage, existing RLHF systems suffer from low +GPU utilization in production deployments. + RLHFuse breaks the traditional view of RLHF workflow as a composition of +individual tasks, splitting each task into finer-grained subtasks, and +performing stage fusion to improve GPU utilization. RLHFuse contains two key +ideas. First, for generation and inference tasks, RLHFuse splits them into +sample-level subtasks, enabling efficient inter-stage fusion to mitigate the +original generation bottleneck dominated by long-tailed samples. Second, for +training tasks, RLHFuse breaks them into subtasks of micro-batches. By +leveraging the intuition that pipeline execution can be essentially +complemented by another pipeline, RLHFuse performs intra-stage fusion to +concurrently execute these subtasks in the training stage with a fused pipeline +schedule, resulting in fewer pipeline bubbles. In addition, RLHFuse +incorporates a series of system optimizations tailored for each stage of RLHF, +making it efficient and scalable for our internal product usage. We evaluate +RLHFuse on various popular LLMs and the results show that RLHFuse increases the +training throughput by up to 3.7x, compared to existing state-of-the-art +systems. + +
+
+
+
+
+ + ♻ ☆ Iterative Improvement of an Additively Regularized Topic Model + + +
+ Topic modelling is fundamentally a soft clustering problem (of known objects +-- documents, over unknown clusters -- topics). That is, the task is +incorrectly posed. In particular, the topic models are unstable and incomplete. +All this leads to the fact that the process of finding a good topic model +(repeated hyperparameter selection, model training, and topic quality +assessment) can be particularly long and labor-intensive. We aim to simplify +the process, to make it more deterministic and provable. To this end, we +present a method for iterative training of a topic model. The essence of the +method is that a series of related topic models are trained so that each +subsequent model is at least as good as the previous one, i.e., that it retains +all the good topics found earlier. The connection between the models is +achieved by additive regularization. The result of this iterative training is +the last topic model in the series, which we call the iteratively updated +additively regularized topic model (ITAR). Experiments conducted on several +collections of natural language texts show that the proposed ITAR model +performs better than other popular topic models (LDA, ARTM, BERTopic), its +topics are diverse, and its perplexity (ability to "explain" the underlying +data) is moderate. + +
+
+ comment: Make the last little additions to the draft +
+
+
+
+
+ + ♻ ☆ Holmes: A Benchmark to Assess the Linguistic Competence of Language + Models + + +
+ We introduce Holmes, a new benchmark designed to assess language models (LMs) +linguistic competence - their unconscious understanding of linguistic +phenomena. Specifically, we use classifier-based probing to examine LMs' +internal representations regarding distinct linguistic phenomena (e.g., +part-of-speech tagging). As a result, we meet recent calls to disentangle LMs' +linguistic competence from other cognitive abilities, such as following +instructions in prompting-based evaluations. Composing Holmes, we review over +270 probing studies and include more than 200 datasets to assess syntax, +morphology, semantics, reasoning, and discourse phenomena. Analyzing over 50 +LMs reveals that, aligned with known trends, their linguistic competence +correlates with model size. However, surprisingly, model architecture and +instruction tuning also significantly influence performance, particularly in +morphology and syntax. Finally, we propose FlashHolmes, a streamlined version +that reduces the computation load while maintaining high-ranking precision. + +
+
+
+
+
+ + ♻ ☆ In-Context Learning with Representations: Contextual Generalization of + Trained Transformers NeurIPS 2024 + + +
+ In-context learning (ICL) refers to a remarkable capability of pretrained +large language models, which can learn a new task given a few examples during +inference. However, theoretical understanding of ICL is largely under-explored, +particularly whether transformers can be trained to generalize to unseen +examples in a prompt, which will require the model to acquire contextual +knowledge of the prompt for generalization. This paper investigates the +training dynamics of transformers by gradient descent through the lens of +non-linear regression tasks. The contextual generalization here can be attained +via learning the template function for each task in-context, where all template +functions lie in a linear space with $m$ basis functions. We analyze the +training dynamics of one-layer multi-head transformers to in-contextly predict +unlabeled inputs given partially labeled prompts, where the labels contain +Gaussian noise and the number of examples in each prompt are not sufficient to +determine the template. Under mild assumptions, we show that the training loss +for a one-layer multi-head transformer converges linearly to a global minimum. +Moreover, the transformer effectively learns to perform ridge regression over +the basis functions. To our knowledge, this study is the first provable +demonstration that transformers can learn contextual (i.e., template) +information to generalize to both unseen examples and tasks when prompts +contain only a small number of query-answer pairs. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Spectra: A Comprehensive Study of Ternary, Quantized, and FP16 Language + Models + + +
+ Post-training quantization is the leading method for addressing +memory-related bottlenecks in LLM inference, but unfortunately, it suffers from +significant performance degradation below 4-bit precision. An alternative +approach involves training compressed models directly at a low bitwidth (e.g., +binary or ternary models). However, the performance, training dynamics, and +scaling trends of such models are not yet well understood. To address this +issue, we train and openly release the Spectra LLM suite consisting of 54 +language models ranging from 99M to 3.9B parameters, trained on 300B tokens. +Spectra includes FloatLMs, post-training quantized QuantLMs (3, 4, 6, and 8 +bits), and ternary LLMs (TriLMs) - our improved architecture for ternary +language modeling, which significantly outperforms previously proposed ternary +models of a given size (in bits), matching half-precision models at scale. For +example, TriLM 3.9B is (bit-wise) smaller than the half-precision FloatLM 830M, +but matches half-precision FloatLM 3.9B in commonsense reasoning and knowledge +benchmarks. However, TriLM 3.9B is also as toxic and stereotyping as FloatLM +3.9B, a model six times larger in size. Additionally, TriLM 3.9B lags behind +FloatLM in perplexity on validation splits and web-based corpora but performs +better on less noisy datasets like Lambada and PennTreeBank. + To enhance understanding of low-bitwidth models, we are releasing 500+ +intermediate checkpoints of the Spectra suite at +\href{https://github.com/NolanoOrg/SpectraSuite}{https://github.com/NolanoOrg/SpectraSuite}. + +
+
+ comment: 32 pages, 12 figures, and 10 tables +
+
+
+
+
+ + ♻ ☆ Asking an AI for salary negotiation advice is a matter of concern: + Controlled experimental perturbation of ChatGPT for protected and + non-protected group discrimination on a contextual task with no clear ground + truth answers + + +
+ We conducted controlled experimental bias audits for four versions of +ChatGPT, which we asked to recommend an opening offer in salary negotiations +for a new hire. We submitted 98,800 prompts to each version, systematically +varying the employee's gender, university, and major, and tested prompts in +voice of each side of the negotiation: the employee versus employer. We find +ChatGPT as a multi-model platform is not robust and consistent enough to be +trusted for such a task. We observed statistically significant salary offers +when varying gender for all four models, although with smaller gaps than for +other attributes tested. The largest gaps were different model versions and +between the employee- vs employer-voiced prompts. We also observed substantial +gaps when varying university and major, but many of the biases were not +consistent across model versions. We tested for fictional and fraudulent +universities and found wildly inconsistent results across cases and model +versions. We make broader contributions to the AI/ML fairness literature. Our +scenario and our experimental design differ from mainstream AI/ML auditing +efforts in key ways. Bias audits typically test discrimination for protected +classes like gender, which we contrast with testing non-protected classes of +university and major. Asking for negotiation advice includes how aggressive one +ought to be in a negotiation relative to known empirical salary distributions +and scales, which is a deeply contextual and personalized task that has no +objective ground truth to validate. These results raise concerns for the +specific model versions we tested and ChatGPT as a multi-model platform in +continuous development. Our epistemology does not permit us to definitively +certify these models as either generally biased or unbiased on the attributes +we test, but our study raises matters of concern for stakeholders to further +investigate. + +
+
+
+
+
+ + ♻ ☆ MMoE: Enhancing Multimodal Models with Mixtures of Multimodal + Interaction Experts + + +
+ Advances in multimodal models have greatly improved how interactions relevant +to various tasks are modeled. Today's multimodal models mainly focus on the +correspondence between images and text, using this for tasks like image-text +matching. However, this covers only a subset of real-world interactions. Novel +interactions, such as sarcasm expressed through opposing spoken words and +gestures or humor expressed through utterances and tone of voice, remain +challenging. In this paper, we introduce an approach to enhance multimodal +models, which we call Multimodal Mixtures of Experts (MMoE). The key idea in +MMoE is to train separate expert models for each type of multimodal +interaction, such as redundancy present in both modalities, uniqueness in one +modality, or synergy that emerges when both modalities are fused. On a sarcasm +detection task (MUStARD) and a humor detection task (URFUNNY), we obtain new +state-of-the-art results. MMoE is also able to be applied to various types of +models to gain improvement. + +
+
+
+
+
+ + ♻ ☆ Pre-trained Language Models Do Not Help Auto-regressive Text-to-Image + Generation EMNLP 2024 + + +
+ Recent advances in image tokenizers, such as VQ-VAE, have enabled +text-to-image generation using auto-regressive methods, similar to language +modeling. However, these methods have yet to leverage pre-trained language +models, despite their adaptability to various downstream tasks. In this work, +we explore this gap by adapting a pre-trained language model for +auto-regressive text-to-image generation, and find that pre-trained language +models offer limited help. We provide a two-fold explanation by analyzing +tokens from each modality. First, we demonstrate that image tokens possess +significantly different semantics compared to text tokens, rendering +pre-trained language models no more effective in modeling them than randomly +initialized ones. Second, the text tokens in the image-text datasets are too +simple compared to normal language model pre-training data, which causes the +catastrophic degradation of language models' capability. + +
+
+ comment: Published at EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Benchmarking Cognitive Biases in Large Language Models as Evaluators ACL 2024 + + +
+ Large Language Models are cognitively biased judges. Large Language Models +(LLMs) have recently been shown to be effective as automatic evaluators with +simple prompting and in-context learning. In this work, we assemble 15 LLMs of +four different size ranges and evaluate their output responses by preference +ranking from the other LLMs as evaluators, such as System Star is better than +System Square. We then evaluate the quality of ranking outputs introducing the +Cognitive Bias Benchmark for LLMs as Evaluators (CoBBLEr), a benchmark to +measure six different cognitive biases in LLM evaluation outputs, such as the +Egocentric bias where a model prefers to rank its own outputs highly in +evaluation. We find that LLMs are biased text quality evaluators, exhibiting +strong indications on our bias benchmark (average of 40% of comparisons across +all models) within each of their evaluations that question their robustness as +evaluators. Furthermore, we examine the correlation between human and machine +preferences and calculate the average Rank-Biased Overlap (RBO) score to be +49.6%, indicating that machine preferences are misaligned with humans. +According to our findings, LLMs may still be unable to be utilized for +automatic annotation aligned with human preferences. Our project page is at: +https://minnesotanlp.github.io/cobbler. + +
+
+ comment: Publishsed at ACL 2024. 29 pages, 9 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Is This a Bad Table? A Closer Look at the Evaluation of Table Generation + from Text + + +
+ Understanding whether a generated table is of good quality is important to be +able to use it in creating or editing documents using automatic methods. In +this work, we underline that existing measures for table quality evaluation +fail to capture the overall semantics of the tables, and sometimes unfairly +penalize good tables and reward bad ones. We propose TabEval, a novel table +evaluation strategy that captures table semantics by first breaking down a +table into a list of natural language atomic statements and then compares them +with ground truth statements using entailment-based measures. To validate our +approach, we curate a dataset comprising of text descriptions for 1,250 diverse +Wikipedia tables, covering a range of topics and structures, in contrast to the +limited scope of existing datasets. We compare TabEval with existing metrics +using unsupervised and supervised text-to-table generation methods, +demonstrating its stronger correlation with human judgments of table quality +across four datasets. + +
+
+
+
+
+ + ♻ ☆ A is for Absorption: Studying Feature Splitting and Absorption in Sparse + Autoencoders + + +
+ Sparse Autoencoders (SAEs) have emerged as a promising approach to decompose +the activations of Large Language Models (LLMs) into human-interpretable +latents. In this paper, we pose two questions. First, to what extent do SAEs +extract monosemantic and interpretable latents? Second, to what extent does +varying the sparsity or the size of the SAE affect monosemanticity / +interpretability? By investigating these questions in the context of a simple +first-letter identification task where we have complete access to ground truth +labels for all tokens in the vocabulary, we are able to provide more detail +than prior investigations. Critically, we identify a problematic form of +feature-splitting we call feature absorption where seemingly monosemantic +latents fail to fire in cases where they clearly should. Our investigation +suggests that varying SAE size or sparsity is insufficient to solve this issue, +and that there are deeper conceptual issues in need of resolution. + +
+
+
+
+
+ + ♻ ☆ Ranking Manipulation for Conversational Search Engines + + +
+ Major search engine providers are rapidly incorporating Large Language Model +(LLM)-generated content in response to user queries. These conversational +search engines operate by loading retrieved website text into the LLM context +for summarization and interpretation. Recent research demonstrates that LLMs +are highly vulnerable to jailbreaking and prompt injection attacks, which +disrupt the safety and quality goals of LLMs using adversarial strings. This +work investigates the impact of prompt injections on the ranking order of +sources referenced by conversational search engines. To this end, we introduce +a focused dataset of real-world consumer product websites and formalize +conversational search ranking as an adversarial problem. Experimentally, we +analyze conversational search rankings in the absence of adversarial injections +and show that different LLMs vary significantly in prioritizing product name, +document content, and context position. We then present a tree-of-attacks-based +jailbreaking technique which reliably promotes low-ranked products. +Importantly, these attacks transfer effectively to state-of-the-art +conversational search engines such as perplexity$.$ai. Given the strong +financial incentive for website owners to boost their search ranking, we argue +that our problem formulation is of critical importance for future robustness +work. + +
+
+ comment: 2024 Conference on Empirical Methods in Natural Language Processing + (Main) +
+
+
+
+
+ + ♻ ☆ Towards Trustworthy Reranking: A Simple yet Effective Abstention + Mechanism + + +
+ Neural Information Retrieval (NIR) has significantly improved upon +heuristic-based Information Retrieval (IR) systems. Yet, failures remain +frequent, the models used often being unable to retrieve documents relevant to +the user's query. We address this challenge by proposing a lightweight +abstention mechanism tailored for real-world constraints, with particular +emphasis placed on the reranking phase. We introduce a protocol for evaluating +abstention strategies in black-box scenarios (typically encountered when +relying on API services), demonstrating their efficacy, and propose a simple +yet effective data-driven mechanism. We provide open-source code for experiment +replication and abstention implementation, fostering wider adoption and +application in diverse contexts. + +
+
+
+
+
+ + ♻ ☆ Keeping Up with the Language Models: Systematic Benchmark Extension for + Bias Auditing + + +
+ Bias auditing of language models (LMs) has received considerable attention as +LMs are becoming widespread. As such, several benchmarks for bias auditing have +been proposed. At the same time, the rapid evolution of LMs can make these +benchmarks irrelevant in no time. Bias auditing is further complicated by LM +brittleness: when a presumably biased outcome is observed, is it due to model +bias or model brittleness? We propose enlisting the models themselves to help +construct bias auditing datasets that remain challenging, and introduce bias +measures that distinguish between different types of model errors. First, we +extend an existing bias benchmark for NLI (BBNLI) using a combination of +LM-generated lexical variations, adversarial filtering, and human validation. +We demonstrate that the newly created dataset BBNLI-next is more challenging +than BBNLI: on average, BBNLI-next reduces the accuracy of state-of-the-art NLI +models from 95.3%, as observed by BBNLI, to a strikingly low 57.5%. Second, we +employ BBNLI-next to showcase the interplay between robustness and bias: we +point out shortcomings in current bias scores and propose bias measures that +take into account both bias and model brittleness. Third, despite the fact that +BBNLI-next was designed with non-generative models in mind, we show that the +new dataset is also able to uncover bias in state-of-the-art open-source +generative LMs. + Note: All datasets included in this work are in English and they address +US-centered social biases. In the spirit of efficient NLP research, no model +training or fine-tuning was performed to conduct this research. + Warning: This paper contains offensive text examples. + +
+
+
+
+
+ + ♻ ☆ Robust Interaction-Based Relevance Modeling for Online e-Commerce Search ECML-PKDD'24 + + +
+ Semantic relevance calculation is crucial for e-commerce search engines, as +it ensures that the items selected closely align with customer intent. +Inadequate attention to this aspect can detrimentally affect user experience +and engagement. Traditional text-matching techniques are prevalent but often +fail to capture the nuances of search intent accurately, so neural networks now +have become a preferred solution to processing such complex text matching. +Existing methods predominantly employ representation-based architectures, which +strike a balance between high traffic capacity and low latency. However, they +exhibit significant shortcomings in generalization and robustness when compared +to interaction-based architectures. In this work, we introduce a robust +interaction-based modeling paradigm to address these shortcomings. It +encompasses 1) a dynamic length representation scheme for expedited inference, +2) a professional terms recognition method to identify subjects and core +attributes from complex sentence structures, and 3) a contrastive adversarial +training protocol to bolster the model's robustness and matching capabilities. +Extensive offline evaluations demonstrate the superior robustness and +effectiveness of our approach, and online A/B testing confirms its ability to +improve relevance in the same exposure position, resulting in more clicks and +conversions. To the best of our knowledge, this method is the first +interaction-based approach for large e-commerce search relevance calculation. +Notably, we have deployed it for the entire search traffic on alibaba.com, the +largest B2B e-commerce platform in the world. + +
+
+ comment: Accepted by ECML-PKDD'24 as Outstanding Paper. 8 pages, 2 figures, 7 + tables +
+
+
+
+
+ + ♻ ☆ HuatuoGPT-Vision, Towards Injecting Medical Visual Knowledge into + Multimodal LLMs at Scale + + +
+ The rapid development of multimodal large language models (MLLMs), such as +GPT-4V, has led to significant advancements. However, these models still face +challenges in medical multimodal capabilities due to limitations in the +quantity and quality of medical vision-text data, stemming from data privacy +concerns and high annotation costs. While pioneering approaches utilize +PubMed's large-scale, de-identified medical image-text pairs to address these +limitations, they still fall short due to inherent data noise. To tackle this, +we refined medical image-text pairs from PubMed and employed MLLMs (GPT-4V) in +an 'unblinded' capacity to denoise and reformat the data, resulting in the +creation of the PubMedVision dataset with 1.3 million medical VQA samples. Our +validation demonstrates that: (1) PubMedVision can significantly enhance the +medical multimodal capabilities of current MLLMs, showing significant +improvement in benchmarks including the MMMU Health & Medicine track; (2) +manual checks by medical experts and empirical results validate the superior +data quality of our dataset compared to other data construction methods. Using +PubMedVision, we train a 34B medical MLLM HuatuoGPT-Vision, which shows +superior performance in medical multimodal scenarios among open-source MLLMs. + +
+
+
+
+
+
+
+
+ + Information Retrieval 22 + +
+
+
+ + ☆ Results of the Big ANN: NeurIPS'23 competition + + +
+ The 2023 Big ANN Challenge, held at NeurIPS 2023, focused on advancing the +state-of-the-art in indexing data structures and search algorithms for +practical variants of Approximate Nearest Neighbor (ANN) search that reflect +the growing complexity and diversity of workloads. Unlike prior challenges that +emphasized scaling up classical ANN search +~\cite{DBLP:conf/nips/SimhadriWADBBCH21}, this competition addressed filtered +search, out-of-distribution data, sparse and streaming variants of ANNS. +Participants developed and submitted innovative solutions that were evaluated +on new standard datasets with constrained computational resources. The results +showcased significant improvements in search accuracy and efficiency over +industry-standard baselines, with notable contributions from both academic and +industrial teams. This paper summarizes the competition tracks, datasets, +evaluation metrics, and the innovative approaches of the top-performing +submissions, providing insights into the current advancements and future +directions in the field of approximate nearest neighbor search. + +
+
+ comment: Code: + https://github.com/harsha-simhadri/big-ann-benchmarks/releases/tag/v0.3.0 +
+
+
+
+
+ + ☆ Enhancing Recommendation with Denoising Auxiliary Task + + +
+ The historical interaction sequences of users plays a crucial role in +training recommender systems that can accurately predict user preferences. +However, due to the arbitrariness of user behavior, the presence of noise in +these sequences poses a challenge to predicting their next actions in +recommender systems. To address this issue, our motivation is based on the +observation that training noisy sequences and clean sequences (sequences +without noise) with equal weights can impact the performance of the model. We +propose a novel self-supervised Auxiliary Task Joint Training (ATJT) method +aimed at more accurately reweighting noisy sequences in recommender systems. +Specifically, we strategically select subsets from users' original sequences +and perform random replacements to generate artificially replaced noisy +sequences. Subsequently, we perform joint training on these artificially +replaced noisy sequences and the original sequences. Through effective +reweighting, we incorporate the training results of the noise recognition model +into the recommender model. We evaluate our method on three datasets using a +consistent base model. Experimental results demonstrate the effectiveness of +introducing self-supervised auxiliary task to enhance the base model's +performance. + +
+
+
+
+
+ + ☆ VectorSearch: Enhancing Document Retrieval with Semantic Embeddings and + Optimized Search + + +
+ Traditional retrieval methods have been essential for assessing document +similarity but struggle with capturing semantic nuances. Despite advancements +in latent semantic analysis (LSA) and deep learning, achieving comprehensive +semantic understanding and accurate retrieval remains challenging due to high +dimensionality and semantic gaps. The above challenges call for new techniques +to effectively reduce the dimensions and close the semantic gaps. To this end, +we propose VectorSearch, which leverages advanced algorithms, embeddings, and +indexing techniques for refined retrieval. By utilizing innovative multi-vector +search operations and encoding searches with advanced language models, our +approach significantly improves retrieval accuracy. Experiments on real-world +datasets show that VectorSearch outperforms baseline metrics, demonstrating its +efficacy for large-scale retrieval tasks. + +
+
+ comment: 10 pages, 14 figures +
+
+
+
+
+ + ☆ Spacewalker: Traversing Representation Spaces for Fast Interactive + Exploration and Annotation of Unstructured Data + + +
+ Unstructured data in industries such as healthcare, finance, and +manufacturing presents significant challenges for efficient analysis and +decision making. Detecting patterns within this data and understanding their +impact is critical but complex without the right tools. Traditionally, these +tasks relied on the expertise of data analysts or labor-intensive manual +reviews. In response, we introduce Spacewalker, an interactive tool designed to +explore and annotate data across multiple modalities. Spacewalker allows users +to extract data representations and visualize them in low-dimensional spaces, +enabling the detection of semantic similarities. Through extensive user +studies, we assess Spacewalker's effectiveness in data annotation and integrity +verification. Results show that the tool's ability to traverse latent spaces +and perform multi-modal queries significantly enhances the user's capacity to +quickly identify relevant data. Moreover, Spacewalker allows for annotation +speed-ups far superior to conventional methods, making it a promising tool for +efficiently navigating unstructured data and improving decision making +processes. The code of this work is open-source and can be found at: +https://github.com/code-lukas/Spacewalker + +
+
+
+
+
+ + ☆ Enhancing Automatic Keyphrase Labelling with Text-to-Text Transfer + Transformer (T5) Architecture: A Framework for Keyphrase Generation and + Filtering + + +
+ Automatic keyphrase labelling stands for the ability of models to retrieve +words or short phrases that adequately describe documents' content. Previous +work has put much effort into exploring extractive techniques to address this +task; however, these methods cannot produce keyphrases not found in the text. +Given this limitation, keyphrase generation approaches have arisen lately. This +paper presents a keyphrase generation model based on the Text-to-Text Transfer +Transformer (T5) architecture. Having a document's title and abstract as input, +we learn a T5 model to generate keyphrases which adequately define its content. +We name this model docT5keywords. We not only perform the classic inference +approach, where the output sequence is directly selected as the predicted +values, but we also report results from a majority voting approach. In this +approach, multiple sequences are generated, and the keyphrases are ranked based +on their frequency of occurrence across these sequences. Along with this model, +we present a novel keyphrase filtering technique based on the T5 architecture. +We train a T5 model to learn whether a given keyphrase is relevant to a +document. We devise two evaluation methodologies to prove our model's +capability to filter inadequate keyphrases. First, we perform a binary +evaluation where our model has to predict if a keyphrase is relevant for a +given document. Second, we filter the predicted keyphrases by several AKG +models and check if the evaluation scores are improved. Experimental results +demonstrate that our keyphrase generation model significantly outperforms all +the baselines, with gains exceeding 100\% in some cases. The proposed filtering +technique also achieves near-perfect accuracy in eliminating false positives +across all datasets. + +
+
+
+
+
+ + ☆ A Prompting-Based Representation Learning Method for Recommendation with + Large Language Models + + +
+ In recent years, Recommender Systems (RS) have witnessed a transformative +shift with the advent of Large Language Models (LLMs) in the field of Natural +Language Processing (NLP). Models such as GPT-3.5/4, Llama, have demonstrated +unprecedented capabilities in understanding and generating human-like text. The +extensive information pre-trained by these LLMs allows for the potential to +capture a more profound semantic representation from different contextual +information of users and items. + While the great potential lies behind the thriving of LLMs, the challenge of +leveraging user-item preferences from contextual information and its alignment +with the improvement of Recommender Systems needs to be addressed. Believing +that a better understanding of the user or item itself can be the key factor in +improving recommendation performance, we conduct research on generating +informative profiles using state-of-the-art LLMs. + To boost the linguistic abilities of LLMs in Recommender Systems, we +introduce the Prompting-Based Representation Learning Method for Recommendation +(P4R). In our P4R framework, we utilize the LLM prompting strategy to create +personalized item profiles. These profiles are then transformed into semantic +representation spaces using a pre-trained BERT model for text embedding. +Furthermore, we incorporate a Graph Convolution Network (GCN) for collaborative +filtering representation. The P4R framework aligns these two embedding spaces +in order to address the general recommendation tasks. In our evaluation, we +compare P4R with state-of-the-art Recommender models and assess the quality of +prompt-based profile generation. + +
+
+ comment: Risks: The 1st International Workshop on Risks, Opportunities, and + Evaluation of Generative Models in Recommendation +
+
+
+
+
+ + ☆ PIFS-Rec: Process-In-Fabric-Switch for Large-Scale Recommendation System + Inferences + + +
+ Deep Learning Recommendation Models (DLRMs) have become increasingly popular +and prevalent in today's datacenters, consuming most of the AI inference +cycles. The performance of DLRMs is heavily influenced by available bandwidth +due to their large vector sizes in embedding tables and concurrent accesses. To +achieve substantial improvements over existing solutions, novel approaches +towards DLRM optimization are needed, especially, in the context of emerging +interconnect technologies like CXL. This study delves into exploring +CXL-enabled systems, implementing a process-in-fabric-switch (PIFS) solution to +accelerate DLRMs while optimizing their memory and bandwidth scalability. We +present an in-depth characterization of industry-scale DLRM workloads running +on CXL-ready systems, identifying the predominant bottlenecks in existing CXL +systems. We, therefore, propose PIFS-Rec, a PIFS-based scheme that implements +near-data processing through downstream ports of the fabric switch. PIFS-Rec +achieves a latency that is 3.89x lower than Pond, an industry-standard +CXL-based system, and also outperforms BEACON, a state-of-the-art scheme, by +2.03x. + +
+
+
+
+
+ + ☆ Train Once, Deploy Anywhere: Matryoshka Representation Learning for + Multimodal Recommendation EMNLP 2024 + + +
+ Despite recent advancements in language and vision modeling, integrating rich +multimodal knowledge into recommender systems continues to pose significant +challenges. This is primarily due to the need for efficient recommendation, +which requires adaptive and interactive responses. In this study, we focus on +sequential recommendation and introduce a lightweight framework called +full-scale Matryoshka representation learning for multimodal recommendation +(fMRLRec). Our fMRLRec captures item features at different granularities, +learning informative representations for efficient recommendation across +multiple dimensions. To integrate item features from diverse modalities, +fMRLRec employs a simple mapping to project multimodal item features into an +aligned feature space. Additionally, we design an efficient linear +transformation that embeds smaller features into larger ones, substantially +reducing memory requirements for large-scale training on recommendation data. +Combined with improved state space modeling techniques, fMRLRec scales to +different dimensions and only requires one-time training to produce multiple +models tailored to various granularities. We demonstrate the effectiveness and +efficiency of fMRLRec on multiple benchmark datasets, which consistently +achieves superior performance over state-of-the-art baseline methods. + +
+
+ comment: Accepted to EMNLP 2024 Findings +
+
+
+
+
+ + ☆ Evaluating and Enhancing Large Language Models for Novelty Assessment in + Scholarly Publications + + +
+ Recent studies have evaluated the creativity/novelty of large language models +(LLMs) primarily from a semantic perspective, using benchmarks from cognitive +science. However, accessing the novelty in scholarly publications is a largely +unexplored area in evaluating LLMs. In this paper, we introduce a scholarly +novelty benchmark (SchNovel) to evaluate LLMs' ability to assess novelty in +scholarly papers. SchNovel consists of 15000 pairs of papers across six fields +sampled from the arXiv dataset with publication dates spanning 2 to 10 years +apart. In each pair, the more recently published paper is assumed to be more +novel. Additionally, we propose RAG-Novelty, which simulates the review process +taken by human reviewers by leveraging the retrieval of similar papers to +assess novelty. Extensive experiments provide insights into the capabilities of +different LLMs to assess novelty and demonstrate that RAG-Novelty outperforms +recent baseline models. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Generative Pre-trained Ranking Model with Over-parameterization at + Web-Scale (Extended Abstract) + + +
+ Learning to rank (LTR) is widely employed in web searches to prioritize +pertinent webpages from retrieved content based on input queries. However, +traditional LTR models encounter two principal obstacles that lead to +suboptimal performance: (1) the lack of well-annotated query-webpage pairs with +ranking scores covering a diverse range of search query popularities, which +hampers their ability to address queries across the popularity spectrum, and +(2) inadequately trained models that fail to induce generalized representations +for LTR, resulting in overfitting. To address these challenges, we propose a +\emph{\uline{G}enerative \uline{S}emi-\uline{S}upervised \uline{P}re-trained} +(GS2P) LTR model. We conduct extensive offline experiments on both a publicly +available dataset and a real-world dataset collected from a large-scale search +engine. Furthermore, we deploy GS2P in a large-scale web search engine with +realistic traffic, where we observe significant improvements in the real-world +application. + +
+
+
+
+
+ + ☆ Pre-trained Graphformer-based Ranking at Web-scale Search (Extended + Abstract) + + +
+ Both Transformer and Graph Neural Networks (GNNs) have been employed in the +domain of learning to rank (LTR). However, these approaches adhere to two +distinct yet complementary problem formulations: ranking score regression based +on query-webpage pairs, and link prediction within query-webpage bipartite +graphs, respectively. While it is possible to pre-train GNNs or Transformers on +source datasets and subsequently fine-tune them on sparsely annotated LTR +datasets, the distributional shifts between the pair-based and bipartite graph +domains present significant challenges in integrating these heterogeneous +models into a unified LTR framework at web scale. To address this, we introduce +the novel MPGraf model, which leverages a modular and capsule-based +pre-training strategy, aiming to cohesively integrate the regression +capabilities of Transformers with the link prediction strengths of GNNs. We +conduct extensive offline and online experiments to rigorously evaluate the +performance of MPGraf. + +
+
+
+
+
+ + ☆ FusionANNS: An Efficient CPU/GPU Cooperative Processing Architecture for + Billion-scale Approximate Nearest Neighbor Search + + +
+ Approximate nearest neighbor search (ANNS) has emerged as a crucial component +of database and AI infrastructure. Ever-increasing vector datasets pose +significant challenges in terms of performance, cost, and accuracy for ANNS +services. None of modern ANNS systems can address these issues simultaneously. +We present FusionANNS, a high-throughput, low-latency, cost-efficient, and +high-accuracy ANNS system for billion-scale datasets using SSDs and only one +entry-level GPU. The key idea of FusionANNS lies in CPU/GPU collaborative +filtering and re-ranking mechanisms, which significantly reduce I/O operations +across CPUs, GPU, and SSDs to break through the I/O performance bottleneck. +Specifically, we propose three novel designs: (1) multi-tiered indexing to +avoid data swapping between CPUs and GPU, (2) heuristic re-ranking to eliminate +unnecessary I/Os and computations while guaranteeing high accuracy, and (3) +redundant-aware I/O deduplication to further improve I/O efficiency. We +implement FusionANNS and compare it with the state-of-the-art SSD-based ANNS +system--SPANN and GPU-accelerated in-memory ANNS system--RUMMY. Experimental +results show that FusionANNS achieves 1) 9.4-13.1X higher query per second +(QPS) and 5.7-8.8X higher cost efficiency compared with SPANN; 2) and 2-4.9X +higher QPS and 2.3-6.8X higher cost efficiency compared with RUMMY, while +guaranteeing low latency and high accuracy. + +
+
+ comment: 15 pages, 26 figures +
+
+
+
+
+ + ♻ ☆ Iterative Improvement of an Additively Regularized Topic Model + + +
+ Topic modelling is fundamentally a soft clustering problem (of known objects +-- documents, over unknown clusters -- topics). That is, the task is +incorrectly posed. In particular, the topic models are unstable and incomplete. +All this leads to the fact that the process of finding a good topic model +(repeated hyperparameter selection, model training, and topic quality +assessment) can be particularly long and labor-intensive. We aim to simplify +the process, to make it more deterministic and provable. To this end, we +present a method for iterative training of a topic model. The essence of the +method is that a series of related topic models are trained so that each +subsequent model is at least as good as the previous one, i.e., that it retains +all the good topics found earlier. The connection between the models is +achieved by additive regularization. The result of this iterative training is +the last topic model in the series, which we call the iteratively updated +additively regularized topic model (ITAR). Experiments conducted on several +collections of natural language texts show that the proposed ITAR model +performs better than other popular topic models (LDA, ARTM, BERTopic), its +topics are diverse, and its perplexity (ability to "explain" the underlying +data) is moderate. + +
+
+ comment: Make the last little additions to the draft +
+
+
+
+
+ + ♻ ☆ Unified Embedding Based Personalized Retrieval in Etsy Search + + +
+ Embedding-based neural retrieval is a prevalent approach to address the +semantic gap problem which often arises in product search on tail queries. In +contrast, popular queries typically lack context and have a broad intent where +additional context from users historical interaction can be helpful. In this +paper, we share our novel approach to address both: the semantic gap problem +followed by an end to end trained model for personalized semantic retrieval. We +propose learning a unified embedding model incorporating graph, transformer and +term-based embeddings end to end and share our design choices for optimal +tradeoff between performance and efficiency. We share our learnings in feature +engineering, hard negative sampling strategy, and application of transformer +model, including a novel pre-training strategy and other tricks for improving +search relevance and deploying such a model at industry scale. Our personalized +retrieval model significantly improves the overall search experience, as +measured by a 5.58% increase in search purchase rate and a 2.63% increase in +site-wide conversion rate, aggregated across multiple A/B tests - on live +traffic. + +
+
+ comment: To appear at FMLDS 2024 +
+
+
+
+
+ + ♻ ☆ Towards Trustworthy Reranking: A Simple yet Effective Abstention + Mechanism + + +
+ Neural Information Retrieval (NIR) has significantly improved upon +heuristic-based Information Retrieval (IR) systems. Yet, failures remain +frequent, the models used often being unable to retrieve documents relevant to +the user's query. We address this challenge by proposing a lightweight +abstention mechanism tailored for real-world constraints, with particular +emphasis placed on the reranking phase. We introduce a protocol for evaluating +abstention strategies in black-box scenarios (typically encountered when +relying on API services), demonstrating their efficacy, and propose a simple +yet effective data-driven mechanism. We provide open-source code for experiment +replication and abstention implementation, fostering wider adoption and +application in diverse contexts. + +
+
+
+
+
+ + ♻ ☆ Robust Interaction-Based Relevance Modeling for Online e-Commerce Search ECML-PKDD'24 + + +
+ Semantic relevance calculation is crucial for e-commerce search engines, as +it ensures that the items selected closely align with customer intent. +Inadequate attention to this aspect can detrimentally affect user experience +and engagement. Traditional text-matching techniques are prevalent but often +fail to capture the nuances of search intent accurately, so neural networks now +have become a preferred solution to processing such complex text matching. +Existing methods predominantly employ representation-based architectures, which +strike a balance between high traffic capacity and low latency. However, they +exhibit significant shortcomings in generalization and robustness when compared +to interaction-based architectures. In this work, we introduce a robust +interaction-based modeling paradigm to address these shortcomings. It +encompasses 1) a dynamic length representation scheme for expedited inference, +2) a professional terms recognition method to identify subjects and core +attributes from complex sentence structures, and 3) a contrastive adversarial +training protocol to bolster the model's robustness and matching capabilities. +Extensive offline evaluations demonstrate the superior robustness and +effectiveness of our approach, and online A/B testing confirms its ability to +improve relevance in the same exposure position, resulting in more clicks and +conversions. To the best of our knowledge, this method is the first +interaction-based approach for large e-commerce search relevance calculation. +Notably, we have deployed it for the entire search traffic on alibaba.com, the +largest B2B e-commerce platform in the world. + +
+
+ comment: Accepted by ECML-PKDD'24 as Outstanding Paper. 8 pages, 2 figures, 7 + tables +
+
+
+
+
+ + ♻ ☆ A Personality-Guided Preference Aggregator for Ephemeral Group + Recommendation + + +
+ Ephemeral group recommendation (EGR) aims to suggest items for a group of +users who come together for the first time. Existing work typically consider +individual preferences as the sole factor in aggregating group preferences. +However, they neglect to take into account the importance of the individual +inherent factors, such as personality, and thus fail to accurately simulate the +group decision-making process. Additionally, these methods often struggle due +to insufficient interactive records. To tackle these issues, a +Personality-Guided Preference Aggregator (PEGA) is proposed, which guides the +preference aggregation of group members based on their personalities, rather +than relying solely on their preferences. Specifically, implicit personalities +are first extracted from user reviews. Hyper-rectangles are then used to +aggregate individual personalities to obtain the "Group Personality", which +allows for the learning of personality distributions within the group. +Subsequently, a personality attention mechanism is employed to aggregate group +preferences, and a preference-based fine-tuning module is used to balance the +weights of personality and preferences. The role of personality in this +approach is twofold: (1) To estimate the importance of individual users in a +group and provide explainability; (2) To alleviate the data sparsity issue +encountered in ephemeral groups. Experimental results demonstrate that, on four +real-world datasets, the PEGA model significantly outperforms related baseline +models in terms of classification accuracy and interpretability. Moreover, +empirical evidence supports the idea that personality plays a pivotal role in +enhancing the performance of EGR tasks. + +
+
+
+
+
+ + ♻ ☆ DemoRank: Selecting Effective Demonstrations for Large Language Models + in Ranking Task + + +
+ Recently, there has been increasing interest in applying large language +models (LLMs) as zero-shot passage rankers. However, few studies have explored +how to select appropriate in-context demonstrations for the passage ranking +task, which is the focus of this paper. Previous studies mainly use LLM's +feedback to train a retriever for demonstration selection. These studies apply +the LLM to score each demonstration independently, which ignores the +dependencies between demonstrations (especially important in ranking task), +leading to inferior performance of top-$k$ retrieved demonstrations. To +mitigate this issue, we introduce a demonstration reranker to rerank the +retrieved demonstrations so that top-$k$ ranked ones are more suitable for ICL. +However, generating training data for such reranker is quite challenging. On +the one hand, different from demonstration retriever, the training samples of +reranker need to incorporate demonstration dependencies. On the other hand, +obtaining the gold ranking from the retrieved demonstrations is an NP-hard +problem, which is hard to implement. To overcome these challenges, we propose a +method to approximate the optimal demonstration list iteratively and utilize +LLM to score demonstration lists of varying lengths. By doing so, the search +space is greatly reduced and demonstration dependencies are considered. Based +on these scored demonstration lists, we further design a list-pairwise training +approach which compares a pair of lists that only differ in the last +demonstration, to teach the reranker how to select the next demonstration given +a previous sequence. In this paper, we propose a demonstration selection +framework DemoRank for ranking task and conduct extensive experiments to prove +its strong ability. + +
+
+
+
+
+ + ♻ ☆ A Study of Implicit Ranking Unfairness in Large Language Models EMNLP 2024 + + +
+ Recently, Large Language Models (LLMs) have demonstrated a superior ability +to serve as ranking models. However, concerns have arisen as LLMs will exhibit +discriminatory ranking behaviors based on users' sensitive attributes (\eg +gender). Worse still, in this paper, we identify a subtler form of +discrimination in LLMs, termed \textit{implicit ranking unfairness}, where LLMs +exhibit discriminatory ranking patterns based solely on non-sensitive user +profiles, such as user names. Such implicit unfairness is more widespread but +less noticeable, threatening the ethical foundation. To comprehensively explore +such unfairness, our analysis will focus on three research aspects: (1) We +propose an evaluation method to investigate the severity of implicit ranking +unfairness. (2) We uncover the reasons for causing such unfairness. (3) To +mitigate such unfairness effectively, we utilize a pair-wise regression method +to conduct fair-aware data augmentation for LLM fine-tuning. The experiment +demonstrates that our method outperforms existing approaches in ranking +fairness, achieving this with only a small reduction in accuracy. Lastly, we +emphasize the need for the community to identify and mitigate the implicit +unfairness, aiming to avert the potential deterioration in the reinforced +human-LLMs ecosystem deterioration. + +
+
+ comment: Accepted in EMNLP 2024 findings +
+
+
+
+
+ + ♻ ☆ Towards a Realistic Long-Term Benchmark for Open-Web Research Agents + + +
+ We present initial results of a forthcoming benchmark for evaluating LLM +agents on white-collar tasks of economic value. We evaluate agents on +real-world "messy" open-web research tasks of the type that are routine in +finance and consulting. In doing so, we lay the groundwork for an LLM agent +evaluation suite where good performance directly corresponds to a large +economic and societal impact. We built and tested several agent architectures +with o1-preview, GPT-4o, Claude-3.5 Sonnet, Llama 3.1 (405b), and GPT-4o-mini. +On average, LLM agents powered by Claude-3.5 Sonnet and o1-preview +substantially outperformed agents using GPT-4o, with agents based on Llama 3.1 +(405b) and GPT-4o-mini lagging noticeably behind. Across LLMs, a ReAct +architecture with the ability to delegate subtasks to subagents performed best. +In addition to quantitative evaluations, we qualitatively assessed the +performance of the LLM agents by inspecting their traces and reflecting on +their observations. Our evaluation represents the first in-depth assessment of +agents' abilities to conduct challenging, economically valuable analyst-style +research on the real open web. + +
+
+
+
+
+ + ♻ ☆ ChatDiet: Empowering Personalized Nutrition-Oriented Food Recommender + Chatbots through an LLM-Augmented Framework + + +
+ The profound impact of food on health necessitates advanced +nutrition-oriented food recommendation services. Conventional methods often +lack the crucial elements of personalization, explainability, and +interactivity. While Large Language Models (LLMs) bring interpretability and +explainability, their standalone use falls short of achieving true +personalization. In this paper, we introduce ChatDiet, a novel LLM-powered +framework designed specifically for personalized nutrition-oriented food +recommendation chatbots. ChatDiet integrates personal and population models, +complemented by an orchestrator, to seamlessly retrieve and process pertinent +information. The personal model leverages causal discovery and inference +techniques to assess personalized nutritional effects for a specific user, +whereas the population model provides generalized information on food +nutritional content. The orchestrator retrieves, synergizes and delivers the +output of both models to the LLM, providing tailored food recommendations +designed to support targeted health outcomes. The result is a dynamic delivery +of personalized and explainable food recommendations, tailored to individual +user preferences. Our evaluation of ChatDiet includes a compelling case study, +where we establish a causal personal model to estimate individual nutrition +effects. Our assessments, including a food recommendation test showcasing a +92\% effectiveness rate, coupled with illustrative dialogue examples, +underscore ChatDiet's strengths in explainability, personalization, and +interactivity. + +
+
+ comment: Published on Smart Health +
+
+
+
+
+ + ♻ ☆ Object-Aware Query Perturbation for Cross-Modal Image-Text Retrieval ECCV 2024 + + +
+ The pre-trained vision and language (V\&L) models have substantially improved +the performance of cross-modal image-text retrieval. In general, however, V\&L +models have limited retrieval performance for small objects because of the +rough alignment between words and the small objects in the image. In contrast, +it is known that human cognition is object-centric, and we pay more attention +to important objects, even if they are small. To bridge this gap between the +human cognition and the V\&L model's capability, we propose a cross-modal +image-text retrieval framework based on ``object-aware query perturbation.'' +The proposed method generates a key feature subspace of the detected objects +and perturbs the corresponding queries using this subspace to improve the +object awareness in the image. In our proposed method, object-aware cross-modal +image-text retrieval is possible while keeping the rich expressive power and +retrieval performance of existing V\&L models without additional fine-tuning. +Comprehensive experiments on four public datasets show that our method +outperforms conventional algorithms. Our code is publicly available at +\url{https://github.com/NEC-N-SOGI/query-perturbation}. + +
+
+ comment: ECCV 2024. Code: https://github.com/NEC-N-SOGI/query-perturbation +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ AIM 2024 Challenge on Efficient Video Super-Resolution for AV1 + Compressed Content ECCV + + +
+ Video super-resolution (VSR) is a critical task for enhancing low-bitrate and +low-resolution videos, particularly in streaming applications. While numerous +solutions have been developed, they often suffer from high computational +demands, resulting in low frame rates (FPS) and poor power efficiency, +especially on mobile platforms. In this work, we compile different methods to +address these challenges, the solutions are end-to-end real-time video +super-resolution frameworks optimized for both high performance and low +runtime. We also introduce a new test set of high-quality 4K videos to further +validate the approaches. The proposed solutions tackle video up-scaling for two +applications: 540p to 4K (x4) as a general case, and 360p to 1080p (x3) more +tailored towards mobile devices. In both tracks, the solutions have a reduced +number of parameters and operations (MACs), allow high FPS, and improve VMAF +and PSNR over interpolation baselines. This report gauges some of the most +efficient video super-resolution methods to date. + +
+
+ comment: European Conference on Computer Vision (ECCV) 2024 - Advances in + Image Manipulation (AIM) +
+
+
+
+
+ + ☆ Language-oriented Semantic Communication for Image Transmission with + Fine-Tuned Diffusion Model SP + + +
+ Ubiquitous image transmission in emerging applications brings huge overheads +to limited wireless resources. Since that text has the characteristic of +conveying a large amount of information with very little data, the transmission +of the descriptive text of an image can reduce the amount of transmitted data. +In this context, this paper develops a novel semantic communication framework +based on a text-2-image generative model (Gen-SC). In particular, a transmitter +converts the input image to textual modality data. Then the text is transmitted +through a noisy channel to the receiver. The receiver then uses the received +text to generate images. Additionally, to improve the robustness of text +transmission over noisy channels, we designed a transformer-based text +transmission codec model. Moreover, we obtained a personalized knowledge base +by fine-tuning the diffusion model to meet the requirements of task-oriented +transmission scenarios. Simulation results show that the proposed framework can +achieve high perceptual quality with reducing the transmitted data volume by up +to 99% and is robust to wireless channel noise in terms of portrait image +transmission. + +
+
+ comment: 6 pages, 9 figures, accepted by Wireless Communications and Signal + Processing (WCSP) 2024 +
+
+
+
+
+ + ☆ Semi-Supervised Cognitive State Classification from Speech with + Multi-View Pseudo-Labeling + + +
+ The lack of labeled data is a common challenge in speech classification +tasks, particularly those requiring extensive subjective assessment, such as +cognitive state classification. In this work, we propose a Semi-Supervised +Learning (SSL) framework, introducing a novel multi-view pseudo-labeling method +that leverages both acoustic and linguistic characteristics to select the most +confident data for training the classification model. Acoustically, unlabeled +data are compared to labeled data using the Frechet audio distance, calculated +from embeddings generated by multiple audio encoders. Linguistically, large +language models are prompted to revise automatic speech recognition +transcriptions and predict labels based on our proposed task-specific +knowledge. High-confidence data are identified when pseudo-labels from both +sources align, while mismatches are treated as low-confidence data. A bimodal +classifier is then trained to iteratively label the low-confidence data until a +predefined criterion is met. We evaluate our SSL framework on emotion +recognition and dementia detection tasks. Experimental results demonstrate that +our method achieves competitive performance compared to fully supervised +learning using only 30% of the labeled data and significantly outperforms two +selected baselines. + +
+
+
+
+
+ + ♻ ☆ Navigating Weight Prediction with Diet Diary ACM MM'24 + + +
+ Current research in food analysis primarily concentrates on tasks such as +food recognition, recipe retrieval and nutrition estimation from a single +image. Nevertheless, there is a significant gap in exploring the impact of food +intake on physiological indicators (e.g., weight) over time. This paper +addresses this gap by introducing the DietDiary dataset, which encompasses +daily dietary diaries and corresponding weight measurements of real users. +Furthermore, we propose a novel task of weight prediction with a dietary diary +that aims to leverage historical food intake and weight to predict future +weights. To tackle this task, we propose a model-agnostic time series +forecasting framework. Specifically, we introduce a Unified Meal Representation +Learning (UMRL) module to extract representations for each meal. Additionally, +we design a diet-aware loss function to associate food intake with weight +variations. By conducting experiments on the DietDiary dataset with two +state-of-the-art time series forecasting models, NLinear and iTransformer, we +demonstrate that our proposed framework achieves superior performance compared +to the original models. We make our dataset, code, and models publicly +available at: https://yxg1005.github.io/weight-prediction/. + +
+
+ comment: ACM MM'24 oral +
+
+
+
+
+ + ♻ ☆ ChatDiet: Empowering Personalized Nutrition-Oriented Food Recommender + Chatbots through an LLM-Augmented Framework + + +
+ The profound impact of food on health necessitates advanced +nutrition-oriented food recommendation services. Conventional methods often +lack the crucial elements of personalization, explainability, and +interactivity. While Large Language Models (LLMs) bring interpretability and +explainability, their standalone use falls short of achieving true +personalization. In this paper, we introduce ChatDiet, a novel LLM-powered +framework designed specifically for personalized nutrition-oriented food +recommendation chatbots. ChatDiet integrates personal and population models, +complemented by an orchestrator, to seamlessly retrieve and process pertinent +information. The personal model leverages causal discovery and inference +techniques to assess personalized nutritional effects for a specific user, +whereas the population model provides generalized information on food +nutritional content. The orchestrator retrieves, synergizes and delivers the +output of both models to the LLM, providing tailored food recommendations +designed to support targeted health outcomes. The result is a dynamic delivery +of personalized and explainable food recommendations, tailored to individual +user preferences. Our evaluation of ChatDiet includes a compelling case study, +where we establish a causal personal model to estimate individual nutrition +effects. Our assessments, including a food recommendation test showcasing a +92\% effectiveness rate, coupled with illustrative dialogue examples, +underscore ChatDiet's strengths in explainability, personalization, and +interactivity. + +
+
+ comment: Published on Smart Health +
+
+
+
+
+ + ♻ ☆ HybridVC: Efficient Voice Style Conversion with Text and Audio Prompts + + +
+ We introduce HybridVC, a voice conversion (VC) framework built upon a +pre-trained conditional variational autoencoder (CVAE) that combines the +strengths of a latent model with contrastive learning. HybridVC supports text +and audio prompts, enabling more flexible voice style conversion. HybridVC +models a latent distribution conditioned on speaker embeddings acquired by a +pretrained speaker encoder and optimises style text embeddings to align with +the speaker style information through contrastive learning in parallel. +Therefore, HybridVC can be efficiently trained under limited computational +resources. Our experiments demonstrate HybridVC's superior training efficiency +and its capability for advanced multi-modal voice style conversion. This +underscores its potential for widespread applications such as user-defined +personalised voice in various social media platforms. A comprehensive ablation +study further validates the effectiveness of our method. + +
+
+ comment: Proceedings of Interspeech +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Algorithmic Drift: A Simulation Framework to Study the Effects of + Recommender Systems on User Preferences + + +
+ Digital platforms such as social media and e-commerce websites adopt +Recommender Systems to provide value to the user. However, the social +consequences deriving from their adoption are still unclear. Many scholars +argue that recommenders may lead to detrimental effects, such as +bias-amplification deriving from the feedback loop between algorithmic +suggestions and users' choices. Nonetheless, the extent to which recommenders +influence changes in users leaning remains uncertain. In this context, it is +important to provide a controlled environment for evaluating the recommendation +algorithm before deployment. To address this, we propose a stochastic +simulation framework that mimics user-recommender system interactions in a +long-term scenario. In particular, we simulate the user choices by formalizing +a user model, which comprises behavioral aspects, such as the user resistance +towards the recommendation algorithm and their inertia in relying on the +received suggestions. Additionally, we introduce two novel metrics for +quantifying the algorithm's impact on user preferences, specifically in terms +of drift over time. We conduct an extensive evaluation on multiple synthetic +datasets, aiming at testing the robustness of our framework when considering +different scenarios and hyper-parameters setting. The experimental results +prove that the proposed methodology is effective in detecting and quantifying +the drift over the users preferences by means of the simulation. All the code +and data used to perform the experiments are publicly available. + +
+
+
+
+
+ + ☆ Modern Hopfield Networks meet Encoded Neural Representations -- + Addressing Practical Considerations + + +
+ Content-addressable memories such as Modern Hopfield Networks (MHN) have been +studied as mathematical models of auto-association and storage/retrieval in the +human declarative memory, yet their practical use for large-scale content +storage faces challenges. Chief among them is the occurrence of meta-stable +states, particularly when handling large amounts of high dimensional content. +This paper introduces Hopfield Encoding Networks (HEN), a framework that +integrates encoded neural representations into MHNs to improve pattern +separability and reduce meta-stable states. We show that HEN can also be used +for retrieval in the context of hetero association of images with natural +language queries, thus removing the limitation of requiring access to partial +content in the same domain. Experimental results demonstrate substantial +reduction in meta-stable states and increased storage capacity while still +enabling perfect recall of a significantly larger number of inputs advancing +the practical utility of associative memory networks for real-world tasks. + +
+
+ comment: 17 pages, 8 figures, workshop submission to Neurips +
+
+
+
+
+ + ☆ Towards Enhancing Linked Data Retrieval in Conversational UIs using + Large Language Models + + +
+ Despite the recent broad adoption of Large Language Models (LLMs) across +various domains, their potential for enriching information systems in +extracting and exploring Linked Data (LD) and Resource Description Framework +(RDF) triplestores has not been extensively explored. This paper examines the +integration of LLMs within existing systems, emphasising the enhancement of +conversational user interfaces (UIs) and their capabilities for data extraction +by producing more accurate SPARQL queries without the requirement for model +retraining. Typically, conversational UI models necessitate retraining with the +introduction of new datasets or updates, limiting their functionality as +general-purpose extraction tools. Our approach addresses this limitation by +incorporating LLMs into the conversational UI workflow, significantly enhancing +their ability to comprehend and process user queries effectively. By leveraging +the advanced natural language understanding capabilities of LLMs, our method +improves RDF entity extraction within web systems employing conventional +chatbots. This integration facilitates a more nuanced and context-aware +interaction model, critical for handling the complex query patterns often +encountered in RDF datasets and Linked Open Data (LOD) endpoints. The +evaluation of this methodology shows a marked enhancement in system +expressivity and the accuracy of responses to user queries, indicating a +promising direction for future research in this area. This investigation not +only underscores the versatility of LLMs in enhancing existing information +systems but also sets the stage for further explorations into their potential +applications within more specialised domains of web information systems. + +
+
+ comment: This paper has been accepted at the 25th International Web + Information Systems Engineering Conference (WISE 2024) +
+
+
+
+
+ + ☆ TiM4Rec: An Efficient Sequential Recommendation Model Based on + Time-Aware Structured State Space Duality Model + + +
+ Sequential recommendation represents a pivotal branch of recommendation +systems, centered around dynamically analyzing the sequential dependencies +between user preferences and their interactive behaviors. Despite the +Transformer architecture-based models achieving commendable performance within +this domain, their quadratic computational complexity relative to the sequence +dimension impedes efficient modeling. In response, the innovative Mamba +architecture, characterized by linear computational complexity, has emerged. +Mamba4Rec further pioneers the application of Mamba in sequential +recommendation. Nonetheless, Mamba 1's hardware-aware algorithm struggles to +efficiently leverage modern matrix computational units, which lead to the +proposal of the improved State Space Duality (SSD), also known as Mamba 2. +While the SSD4Rec successfully adapts the SSD architecture for sequential +recommendation, showing promising results in high-dimensional contexts, it +suffers significant performance drops in low-dimensional scenarios crucial for +pure ID sequential recommendation tasks. Addressing this challenge, we propose +a novel sequential recommendation backbone model, TiM4Rec, which ameliorates +the low-dimensional performance loss of the SSD architecture while preserving +its computational efficiency. Drawing inspiration from TiSASRec, we develop a +time-aware enhancement method tailored for the linear computation demands of +the SSD architecture, thereby enhancing its adaptability and achieving +state-of-the-art (SOTA) performance in both low and high-dimensional modeling. +The code for our model is publicly accessible at +https://github.com/AlwaysFHao/TiM4Rec. + +
+
+
+
+
+ + ☆ Seeing Faces in Things: A Model and Dataset for Pareidolia + + +
+ The human visual system is well-tuned to detect faces of all shapes and +sizes. While this brings obvious survival advantages, such as a better chance +of spotting unknown predators in the bush, it also leads to spurious face +detections. ``Face pareidolia'' describes the perception of face-like structure +among otherwise random stimuli: seeing faces in coffee stains or clouds in the +sky. In this paper, we study face pareidolia from a computer vision +perspective. We present an image dataset of ``Faces in Things'', consisting of +five thousand web images with human-annotated pareidolic faces. Using this +dataset, we examine the extent to which a state-of-the-art human face detector +exhibits pareidolia, and find a significant behavioral gap between humans and +machines. We find that the evolutionary need for humans to detect animal faces, +as well as human faces, may explain some of this gap. Finally, we propose a +simple statistical model of pareidolia in images. Through studies on human +subjects and our pareidolic face detectors we confirm a key prediction of our +model regarding what image conditions are most likely to induce pareidolia. +Dataset and Website: https://aka.ms/faces-in-things + +
+
+
+
+
+ + ☆ Exploring Hint Generation Approaches in Open-Domain Question Answering EMNLP 2024 + + +
+ Automatic Question Answering (QA) systems rely on contextual information to +provide accurate answers. Commonly, contexts are prepared through either +retrieval-based or generation-based methods. The former involves retrieving +relevant documents from a corpus like Wikipedia, whereas the latter uses +generative models such as Large Language Models (LLMs) to generate the context. +In this paper, we introduce a novel context preparation approach called HINTQA, +which employs Automatic Hint Generation (HG) techniques. Unlike traditional +methods, HINTQA prompts LLMs to produce hints about potential answers for the +question rather than generating relevant context. We evaluate our approach +across three QA datasets including TriviaQA, NaturalQuestions, and Web +Questions, examining how the number and order of hints impact performance. Our +findings show that the HINTQA surpasses both retrieval-based and +generation-based approaches. We demonstrate that hints enhance the accuracy of +answers more than retrieved and generated contexts. + +
+
+ comment: Accepted at EMNLP 2024 +
+
+
+
+
+ + ☆ SLIMER-IT: Zero-Shot NER on Italian Language + + +
+ Traditional approaches to Named Entity Recognition (NER) frame the task into +a BIO sequence labeling problem. Although these systems often excel in the +downstream task at hand, they require extensive annotated data and struggle to +generalize to out-of-distribution input domains and unseen entity types. On the +contrary, Large Language Models (LLMs) have demonstrated strong zero-shot +capabilities. While several works address Zero-Shot NER in English, little has +been done in other languages. In this paper, we define an evaluation framework +for Zero-Shot NER, applying it to the Italian language. Furthermore, we +introduce SLIMER-IT, the Italian version of SLIMER, an instruction-tuning +approach for zero-shot NER leveraging prompts enriched with definition and +guidelines. Comparisons with other state-of-the-art models, demonstrate the +superiority of SLIMER-IT on never-seen-before entity tags. + +
+
+
+
+
+ + ☆ Ducho meets Elliot: Large-scale Benchmarks for Multimodal Recommendation + + +
+ In specific domains like fashion, music, and movie recommendation, the +multi-faceted features characterizing products and services may influence each +customer on online selling platforms differently, paving the way to novel +multimodal recommendation models that can learn from such multimodal content. +According to the literature, the common multimodal recommendation pipeline +involves (i) extracting multimodal features, (ii) refining their high-level +representations to suit the recommendation task, (iii) optionally fusing all +multimodal features, and (iv) predicting the user-item score. While great +effort has been put into designing optimal solutions for (ii-iv), to the best +of our knowledge, very little attention has been devoted to exploring +procedures for (i). In this respect, the existing literature outlines the large +availability of multimodal datasets and the ever-growing number of large models +accounting for multimodal-aware tasks, but (at the same time) an unjustified +adoption of limited standardized solutions. This motivates us to explore more +extensive techniques for the (i) stage of the pipeline. To this end, this paper +settles as the first attempt to offer a large-scale benchmarking for multimodal +recommender systems, with a specific focus on multimodal extractors. +Specifically, we take advantage of two popular and recent frameworks for +multimodal feature extraction and reproducibility in recommendation, Ducho and +Elliot, to offer a unified and ready-to-use experimental environment able to +run extensive benchmarking analyses leveraging novel multimodal feature +extractors. Results, largely validated under different hyper-parameter settings +for the chosen extractors, provide important insights on how to train and tune +the next generation of multimodal recommendation algorithms. + +
+
+
+
+
+ + ☆ Mitigating Digital Discrimination in Dating Apps -- The Dutch Breeze + case + + +
+ In September 2023, the Netherlands Institute for Human Rights, the Dutch +non-discrimination authority, decided that Breeze, a Dutch dating app, was +justified in suspecting that their algorithm discriminated against non-white. +Consequently, the Institute decided that Breeze must prevent this +discrimination based on ethnicity. This paper explores two questions. (i) Is +the discrimination based on ethnicity in Breeze's matching algorithm illegal? +(ii) How can dating apps mitigate or stop discrimination in their matching +algorithms? We illustrate the legal and technical difficulties dating apps face +in tackling discrimination and illustrate promising solutions. We analyse the +Breeze decision in-depth, combining insights from computer science and law. We +discuss the implications of this judgment for scholarship and practice in the +field of fair and non-discriminatory machine learning. + +
+
+
+
+
+ + ☆ LLM-Cure: LLM-based Competitor User Review Analysis for Feature + Enhancement + + +
+ The exponential growth of the mobile app market underscores the importance of +constant innovation and rapid response to user demands. As user satisfaction is +paramount to the success of a mobile application (app), developers typically +rely on user reviews, which represent user feedback that includes ratings and +comments to identify areas for improvement. However, the sheer volume of user +reviews poses challenges in manual analysis, necessitating automated +approaches. Existing automated approaches either analyze only the target apps +reviews, neglecting the comparison of similar features to competitors or fail +to provide suggestions for feature enhancement. To address these gaps, we +propose a Large Language Model (LLM)-based Competitive User Review Analysis for +Feature Enhancement) (LLM-Cure), an approach powered by LLMs to automatically +generate suggestion s for mobile app feature improvements. More specifically, +LLM-Cure identifies and categorizes features within reviews by applying LLMs. +When provided with a complaint in a user review, LLM-Cure curates highly rated +(4 and 5 stars) reviews in competing apps related to the complaint and proposes +potential improvements tailored to the target application. We evaluate LLM-Cure +on 1,056,739 reviews of 70 popular Android apps. Our evaluation demonstrates +that LLM-Cure significantly outperforms the state-of-the-art approaches in +assigning features to reviews by up to 13% in F1-score, up to 16% in recall and +up to 11% in precision. Additionally, LLM-Cure demonstrates its capability to +provide suggestions for resolving user complaints. We verify the suggestions +using the release notes that reflect the changes of features in the target +mobile app. LLM-Cure achieves a promising average of 73% of the implementation +of the provided suggestions. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ☆ Making Text Embedders Few-Shot Learners + + +
+ Large language models (LLMs) with decoder-only architectures demonstrate +remarkable in-context learning (ICL) capabilities. This feature enables them to +effectively handle both familiar and novel tasks by utilizing examples provided +within their input context. Recognizing the potential of this capability, we +propose leveraging the ICL feature in LLMs to enhance the process of text +embedding generation. To this end, we introduce a novel model bge-en-icl, which +employs few-shot examples to produce high-quality text embeddings. Our approach +integrates task-related examples directly into the query side, resulting in +significant improvements across various tasks. Additionally, we have +investigated how to effectively utilize LLMs as embedding models, including +various attention mechanisms, pooling methods, etc. Our findings suggest that +retaining the original framework often yields the best results, underscoring +that simplicity is best. Experimental results on the MTEB and AIR-Bench +benchmarks demonstrate that our approach sets new state-of-the-art (SOTA) +performance. Our model, code and dataset are freely available at +https://github.com/FlagOpen/FlagEmbedding . + +
+
+
+
+
+ + ☆ A Survey of Stance Detection on Social Media: New Directions and + Perspectives + + +
+ In modern digital environments, users frequently express opinions on +contentious topics, providing a wealth of information on prevailing attitudes. +The systematic analysis of these opinions offers valuable insights for +decision-making in various sectors, including marketing and politics. As a +result, stance detection has emerged as a crucial subfield within affective +computing, enabling the automatic detection of user stances in social media +conversations and providing a nuanced understanding of public sentiment on +complex issues. Recent years have seen a surge of research interest in +developing effective stance detection methods, with contributions from multiple +communities, including natural language processing, web science, and social +computing. This paper provides a comprehensive survey of stance detection +techniques on social media, covering task definitions, datasets, approaches, +and future works. We review traditional stance detection models, as well as +state-of-the-art methods based on large language models, and discuss their +strengths and limitations. Our survey highlights the importance of stance +detection in understanding public opinion and sentiment, and identifies gaps in +current research. We conclude by outlining potential future directions for +stance detection on social media, including the need for more robust and +generalizable models, and the importance of addressing emerging challenges such +as multi-modal stance detection and stance detection in low-resource languages. + +
+
+
+
+
+ + ☆ Qualitative Insights Tool (QualIT): LLM Enhanced Topic Modeling + + +
+ Topic modeling is a widely used technique for uncovering thematic structures +from large text corpora. However, most topic modeling approaches e.g. Latent +Dirichlet Allocation (LDA) struggle to capture nuanced semantics and contextual +understanding required to accurately model complex narratives. Recent +advancements in this area include methods like BERTopic, which have +demonstrated significantly improved topic coherence and thus established a new +standard for benchmarking. In this paper, we present a novel approach, the +Qualitative Insights Tool (QualIT) that integrates large language models (LLMs) +with existing clustering-based topic modeling approaches. Our method leverages +the deep contextual understanding and powerful language generation capabilities +of LLMs to enrich the topic modeling process using clustering. We evaluate our +approach on a large corpus of news articles and demonstrate substantial +improvements in topic coherence and topic diversity compared to baseline topic +modeling techniques. On the 20 ground-truth topics, our method shows 70% topic +coherence (vs 65% & 57% benchmarks) and 95.5% topic diversity (vs 85% & 72% +benchmarks). Our findings suggest that the integration of LLMs can unlock new +opportunities for topic modeling of dynamic and complex text data, as is common +in talent management research contexts. + +
+
+ comment: 6 pages, 4 tables, 1 figure +
+
+
+
+
+ + ♻ ☆ WebQuest: A Benchmark for Multimodal QA on Web Page Sequences + + +
+ The rise of powerful multimodal LLMs has enhanced the viability of building +web agents which can, with increasing levels of autonomy, assist users to +retrieve information and complete tasks on various human-computer interfaces. +It is hence necessary to build challenging benchmarks that span a wide-variety +of use cases reflecting real-world usage. In this work, we present WebQuest, a +multi-page question-answering dataset that requires reasoning across multiple +related web pages. In contrast to existing UI benchmarks that focus on +multi-step web navigation and task completion, our dataset evaluates +information extraction, multimodal retrieval and composition of information +from many web pages. WebQuest includes three question categories: single-screen +QA, multi-screen QA, and QA based on navigation traces. We evaluate leading +proprietary multimodal models like GPT-4V, Gemini Flash, Claude 3, and open +source models like InstructBLIP, PaliGemma on our dataset, revealing a +significant gap between single-screen and multi-screen reasoning. Finally, we +investigate inference time techniques like Chain-of-Thought prompting to +improve model capabilities on multi-screen reasoning. + +
+
+
+
+
+ + ♻ ☆ GLoCIM: Global-view Long Chain Interest Modeling for news recommendation + + +
+ Accurately recommending candidate news articles to users has always been the +core challenge of news recommendation system. News recommendations often +require modeling of user interest to match candidate news. Recent efforts have +primarily focused on extracting local subgraph information in a global click +graph constructed by the clicked news sequence of all users. Howerer, the +computational complexity of extracting global click graph information has +hindered the ability to utilize far-reaching linkage which is hidden between +two distant nodes in global click graph collaboratively among similar users. To +overcome the problem above, we propose a Global-view Long Chain Interests +Modeling for news recommendation (GLoCIM), which combines neighbor interest +with long chain interest distilled from a global click graph, leveraging the +collaboration among similar users to enhance news recommendation. We therefore +design a long chain selection algorithm and long chain interest encoder to +obtain global-view long chain interest from the global click graph. We design a +gated network to integrate long chain interest with neighbor interest to +achieve the collaborative interest among similar users. Subsequently we +aggregate it with local news category-enhanced representation to generate final +user representation. Then candidate news representation can be formed to match +user representation to achieve news recommendation. Experimental results on +real-world datasets validate the effectiveness of our method to improve the +performance of news recommendation. + +
+
+
+
+
+ + ♻ ☆ Fashion Image-to-Image Translation for Complementary Item Retrieval + + +
+ The increasing demand for online fashion retail has boosted research in +fashion compatibility modeling and item retrieval, focusing on matching user +queries (textual descriptions or reference images) with compatible fashion +items. A key challenge is top-bottom retrieval, where precise compatibility +modeling is essential. Traditional methods, often based on Bayesian +Personalized Ranking (BPR), have shown limited performance. Recent efforts have +explored using generative models in compatibility modeling and item retrieval, +where generated images serve as additional inputs. However, these approaches +often overlook the quality of generated images, which could be crucial for +model performance. Additionally, generative models typically require large +datasets, posing challenges when such data is scarce. + To address these issues, we introduce the Generative Compatibility Model +(GeCo), a two-stage approach that improves fashion image retrieval through +paired image-to-image translation. First, the Complementary Item Generation +Model (CIGM), built on Conditional Generative Adversarial Networks (GANs), +generates target item images (e.g., bottoms) from seed items (e.g., tops), +offering conditioning signals for retrieval. These generated samples are then +integrated into GeCo, enhancing compatibility modeling and retrieval accuracy. +Evaluations on three datasets show that GeCo outperforms state-of-the-art +baselines. Key contributions include: (i) the GeCo model utilizing paired +image-to-image translation within the Composed Image Retrieval framework, (ii) +comprehensive evaluations on benchmark datasets, and (iii) the release of a new +Fashion Taobao dataset designed for top-bottom retrieval, promoting further +research. + +
+
+
+
+
+ + ♻ ☆ Projected Gradient Descent for Spectral Compressed Sensing via Symmetric + Hankel Factorization + + +
+ Current spectral compressed sensing methods via Hankel matrix completion +employ symmetric factorization to demonstrate the low-rank property of the +Hankel matrix. However, previous non-convex gradient methods only utilize +asymmetric factorization to achieve spectral compressed sensing. In this paper, +we propose a novel nonconvex projected gradient descent method for spectral +compressed sensing via symmetric factorization named Symmetric Hankel Projected +Gradient Descent (SHGD), which updates only one matrix and avoids a balancing +regularization term. SHGD reduces about half of the computation and storage +costs compared to the prior gradient method based on asymmetric factorization. +{Besides, the symmetric factorization employed in our work is completely novel +to the prior low-rank factorization model, introducing a new factorization +ambiguity under complex orthogonal transformation}. Novel distance metrics are +designed for our factorization method and a linear convergence guarantee to the +desired signal is established with $O(r^2\log(n))$ observations. Numerical +simulations demonstrate the superior performance of the proposed SHGD method in +phase transitions and computation efficiency compared to state-of-the-art +methods. + +
+
+ comment: accepted in IEEE Transactions on Signal Processing +
+
+
+
+
+ + ♻ ☆ Recommendation Unlearning via Influence Function + + +
+ Recommendation unlearning is an emerging task to serve users for erasing +unusable data (e.g., some historical behaviors) from a well-trained recommender +model. Existing methods process unlearning requests by fully or partially +retraining the model after removing the unusable data. However, these methods +are impractical due to the high computation cost of full retraining and the +highly possible performance damage of partial training. In this light, a +desired recommendation unlearning method should obtain a similar model as full +retraining in a more efficient manner, i.e., achieving complete, efficient and +harmless unlearning. + In this work, we propose a new Influence Function-based Recommendation +Unlearning (IFRU) framework, which efficiently updates the model without +retraining by estimating the influence of the unusable data on the model via +the influence function. In the light that recent recommender models use +historical data for both the constructions of the optimization loss and the +computational graph (e.g., neighborhood aggregation), IFRU jointly estimates +the direct influence of unusable data on optimization loss and the spillover +influence on the computational graph to pursue complete unlearning. +Furthermore, we propose an importance-based pruning algorithm to reduce the +cost of the influence function. IFRU is harmless and applicable to mainstream +differentiable models. Extensive experiments demonstrate that IFRU achieves +more than 250 times acceleration compared to retraining-based methods with +recommendation performance comparable to full retraining. Codes are avaiable at +https://github.com/baiyimeng/IFRU. + +
+
+ comment: Accepted by ACM TORS +
+
+
+
+
+ + ♻ ☆ C-Pack: Packed Resources For General Chinese Embeddings SIGIR 2024 + + +
+ We introduce C-Pack, a package of resources that significantly advance the +field of general Chinese embeddings. C-Pack includes three critical resources. +1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 +tasks and 35 datasets. 2) C-MTP is a massive text embedding dataset curated +from labeled and unlabeled Chinese corpora for training embedding models. 3) +C-TEM is a family of embedding models covering multiple sizes. Our models +outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the +time of the release. We also integrate and optimize the entire suite of +training methods for C-TEM. Along with our resources on general Chinese +embedding, we release our data and models for English text embeddings. The +English models achieve state-of-the-art performance on MTEB benchmark; +meanwhile, our released English data is 2 times larger than the Chinese data. +All these resources are made publicly available at +https://github.com/FlagOpen/FlagEmbedding. + +
+
+ comment: SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ Adversarial Attacks to Multi-Modal Models + + +
+ Multi-modal models have gained significant attention due to their powerful +capabilities. These models effectively align embeddings across diverse data +modalities, showcasing superior performance in downstream tasks compared to +their unimodal counterparts. Recent study showed that the attacker can +manipulate an image or audio file by altering it in such a way that its +embedding matches that of an attacker-chosen targeted input, thereby deceiving +downstream models. However, this method often underperforms due to inherent +disparities in data from different modalities. In this paper, we introduce +CrossFire, an innovative approach to attack multi-modal models. CrossFire +begins by transforming the targeted input chosen by the attacker into a format +that matches the modality of the original image or audio file. We then +formulate our attack as an optimization problem, aiming to minimize the angular +deviation between the embeddings of the transformed input and the modified +image or audio file. Solving this problem determines the perturbations to be +added to the original media. Our extensive experiments on six real-world +benchmark datasets reveal that CrossFire can significantly manipulate +downstream tasks, surpassing existing attacks. Additionally, we evaluate six +defensive strategies against CrossFire, finding that current defenses are +insufficient to counteract our CrossFire. + +
+
+ comment: To appear in the ACM Workshop on Large AI Systems and Models with + Privacy and Safety Analysis 2024 (LAMPS '24) +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ FastTalker: Jointly Generating Speech and Conversational Gestures from + Text + + +
+ Generating 3D human gestures and speech from a text script is critical for +creating realistic talking avatars. One solution is to leverage separate +pipelines for text-to-speech (TTS) and speech-to-gesture (STG), but this +approach suffers from poor alignment of speech and gestures and slow inference +times. In this paper, we introduce FastTalker, an efficient and effective +framework that simultaneously generates high-quality speech audio and 3D human +gestures at high inference speeds. Our key insight is reusing the intermediate +features from speech synthesis for gesture generation, as these features +contain more precise rhythmic information than features re-extracted from +generated speech. Specifically, 1) we propose an end-to-end framework that +concurrently generates speech waveforms and full-body gestures, using +intermediate speech features such as pitch, onset, energy, and duration +directly for gesture decoding; 2) we redesign the causal network architecture +to eliminate dependencies on future inputs for real applications; 3) we employ +Reinforcement Learning-based Neural Architecture Search (NAS) to enhance both +performance and inference speed by optimizing our network architecture. +Experimental results on the BEAT2 dataset demonstrate that FastTalker achieves +state-of-the-art performance in both speech synthesis and gesture generation, +processing speech and gestures in 0.17 seconds per second on an NVIDIA 3090. + +
+
+ comment: European Conference on Computer Vision Workshop +
+
+
+
+
+ + ☆ HA-FGOVD: Highlighting Fine-grained Attributes via Explicit Linear + Composition for Open-Vocabulary Object Detection + + +
+ Open-vocabulary object detection (OVD) models are considered to be Large +Multi-modal Models (LMM), due to their extensive training data and a large +number of parameters. Mainstream OVD models prioritize object coarse-grained +category rather than focus on their fine-grained attributes, e.g., colors or +materials, thus failed to identify objects specified with certain attributes. +However, OVD models are pretrained on large-scale image-text pairs with rich +attribute words, whose latent feature space can represent the global text +feature as a linear composition of fine-grained attribute tokens without +highlighting them. Therefore, we propose in this paper a universal and explicit +approach for frozen mainstream OVD models that boosts their attribute-level +detection capabilities by highlighting fine-grained attributes in explicit +linear space. Firstly, a LLM is leveraged to highlight attribute words within +the input text as a zero-shot prompted task. Secondly, by strategically +adjusting the token masks, the text encoders of OVD models extract both global +text and attribute-specific features, which are then explicitly composited as +two vectors in linear space to form the new attribute-highlighted feature for +detection tasks, where corresponding scalars are hand-crafted or learned to +reweight both two vectors. Notably, these scalars can be seamlessly transferred +among different OVD models, which proves that such an explicit linear +composition is universal. Empirical evaluation on the FG-OVD dataset +demonstrates that our proposed method uniformly improves fine-grained +attribute-level OVD of various mainstream models and achieves new +state-of-the-art performance. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Layer-wise Model Merging for Unsupervised Domain Adaptation in + Segmentation Tasks + + +
+ Merging parameters of multiple models has resurfaced as an effective strategy +to enhance task performance and robustness, but prior work is limited by the +high costs of ensemble creation and inference. In this paper, we leverage the +abundance of freely accessible trained models to introduce a cost-free approach +to model merging. It focuses on a layer-wise integration of merged models, +aiming to maintain the distinctiveness of the task-specific final layers while +unifying the initial layers, which are primarily associated with feature +extraction. This approach ensures parameter consistency across all layers, +essential for boosting performance. Moreover, it facilitates seamless +integration of knowledge, enabling effective merging of models from different +datasets and tasks. Specifically, we investigate its applicability in +Unsupervised Domain Adaptation (UDA), an unexplored area for model merging, for +Semantic and Panoptic Segmentation. Experimental results demonstrate +substantial UDA improvements without additional costs for merging +same-architecture models from distinct datasets ($\uparrow 2.6\%$ mIoU) and +different-architecture models with a shared backbone ($\uparrow 6.8\%$ mIoU). +Furthermore, merging Semantic and Panoptic Segmentation models increases mPQ by +$\uparrow 7\%$. These findings are validated across a wide variety of UDA +strategies, architectures, and datasets. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Optimizing News Text Classification with Bi-LSTM and Attention Mechanism + for Efficient Data Processing + + +
+ The development of Internet technology has led to a rapid increase in news +information. Filtering out valuable content from complex information has become +an urgentproblem that needs to be solved. In view of the shortcomings of +traditional manual classification methods that are time-consuming and +inefficient, this paper proposes an automaticclassification scheme for news +texts based on deep learning. This solution achieves efficient classification +and management of news texts by introducing advanced machine learning +algorithms, especially an optimization model that combines Bi-directional Long +Short-Term Memory Network (Bi-LSTM) and Attention Mechanism. Experimental +results show that this solution can not only significantly improve the accuracy +and timeliness of classification, but also significantly reduce the need for +manual intervention. It has important practical significance for improving the +information processing capabilities of the news industry and accelerating the +speed of information flow. Through comparative analysis of multiple common +models, the effectiveness and advancement of the proposed method are proved, +laying a solid foundation for future news text classification research. + +
+
+
+
+
+ + ☆ Cross-Domain Latent Factors Sharing via Implicit Matrix Factorization + + +
+ Data sparsity has been one of the long-standing problems for recommender +systems. One of the solutions to mitigate this issue is to exploit knowledge +available in other source domains. However, many cross-domain recommender +systems introduce a complex architecture that makes them less scalable in +practice. On the other hand, matrix factorization methods are still considered +to be strong baselines for single-domain recommendations. In this paper, we +introduce the CDIMF, a model that extends the standard implicit matrix +factorization with ALS to cross-domain scenarios. We apply the Alternating +Direction Method of Multipliers to learn shared latent factors for overlapped +users while factorizing the interaction matrix. In a dual-domain setting, +experiments on industrial datasets demonstrate a competing performance of CDIMF +for both cold-start and warm-start. The proposed model can outperform most +other recent cross-domain and single-domain models. We also provide the code to +reproduce experiments on GitHub. + +
+
+
+
+
+ + ☆ Stalactite: Toolbox for Fast Prototyping of Vertical Federated Learning + Systems + + +
+ Machine learning (ML) models trained on datasets owned by different +organizations and physically located in remote databases offer benefits in many +real-world use cases. State regulations or business requirements often prevent +data transfer to a central location, making it difficult to utilize standard +machine learning algorithms. Federated Learning (FL) is a technique that +enables models to learn from distributed datasets without revealing the +original data. Vertical Federated learning (VFL) is a type of FL where data +samples are divided by features across several data owners. For instance, in a +recommendation task, a user can interact with various sets of items, and the +logs of these interactions are stored by different organizations. In this demo +paper, we present \emph{Stalactite} - an open-source framework for VFL that +provides the necessary functionality for building prototypes of VFL systems. It +has several advantages over the existing frameworks. In particular, it allows +researchers to focus on the algorithmic side rather than engineering and to +easily deploy learning in a distributed environment. It implements several VFL +algorithms and has a built-in homomorphic encryption layer. We demonstrate its +use on a real-world recommendation datasets. + +
+
+
+
+
+ + ☆ Generative AI Is Not Ready for Clinical Use in Patient Education for + Lower Back Pain Patients, Even With Retrieval-Augmented Generation + + +
+ Low back pain (LBP) is a leading cause of disability globally. Following the +onset of LBP and subsequent treatment, adequate patient education is crucial +for improving functionality and long-term outcomes. Despite advancements in +patient education strategies, significant gaps persist in delivering +personalized, evidence-based information to patients with LBP. Recent +advancements in large language models (LLMs) and generative artificial +intelligence (GenAI) have demonstrated the potential to enhance patient +education. However, their application and efficacy in delivering educational +content to patients with LBP remain underexplored and warrant further +investigation. In this study, we introduce a novel approach utilizing LLMs with +Retrieval-Augmented Generation (RAG) and few-shot learning to generate tailored +educational materials for patients with LBP. Physical therapists manually +evaluated our model responses for redundancy, accuracy, and completeness using +a Likert scale. In addition, the readability of the generated education +materials is assessed using the Flesch Reading Ease score. The findings +demonstrate that RAG-based LLMs outperform traditional LLMs, providing more +accurate, complete, and readable patient education materials with less +redundancy. Having said that, our analysis reveals that the generated materials +are not yet ready for use in clinical practice. This study underscores the +potential of AI-driven models utilizing RAG to improve patient education for +LBP; however, significant challenges remain in ensuring the clinical relevance +and granularity of content generated by these models. + +
+
+
+
+
+ + ☆ Lessons Learned on Information Retrieval in Electronic Health Records: A + Comparison of Embedding Models and Pooling Strategies + + +
+ Objective: Applying large language models (LLMs) to the clinical domain is +challenging due to the context-heavy nature of processing medical records. +Retrieval-augmented generation (RAG) offers a solution by facilitating +reasoning over large text sources. However, there are many parameters to +optimize in just the retrieval system alone. This paper presents an ablation +study exploring how different embedding models and pooling methods affect +information retrieval for the clinical domain. + Methods: Evaluating on three retrieval tasks on two electronic health record +(EHR) data sources, we compared seven models, including medical- and +general-domain models, specialized encoder embedding models, and off-the-shelf +decoder LLMs. We also examine the choice of embedding pooling strategy for each +model, independently on the query and the text to retrieve. + Results: We found that the choice of embedding model significantly impacts +retrieval performance, with BGE, a comparatively small general-domain model, +consistently outperforming all others, including medical-specific models. +However, our findings also revealed substantial variability across datasets and +query text phrasings. We also determined the best pooling methods for each of +these models to guide future design of retrieval systems. + Discussion: The choice of embedding model, pooling strategy, and query +formulation can significantly impact retrieval performance and the performance +of these models on other public benchmarks does not necessarily transfer to new +domains. Further studies such as this one are vital for guiding +empirically-grounded development of retrieval frameworks, such as in the +context of RAG, for the clinical domain. + +
+
+
+
+
+ + ☆ Don't Use LLMs to Make Relevance Judgments + + +
+ Making the relevance judgments for a TREC-style test collection can be +complex and expensive. A typical TREC track usually involves a team of six +contractors working for 2-4 weeks. Those contractors need to be trained and +monitored. Software has to be written to support recording relevance judgments +correctly and efficiently. The recent advent of large language models that +produce astoundingly human-like flowing text output in response to a natural +language prompt has inspired IR researchers to wonder how those models might be +used in the relevance judgment collection process. At the ACM SIGIR 2024 +conference, a workshop ``LLM4Eval'' provided a venue for this work, and +featured a data challenge activity where participants reproduced TREC deep +learning track judgments, as was done by Thomas et al (arXiv:2408.08896, +arXiv:2309.10621). I was asked to give a keynote at the workshop, and this +paper presents that keynote in article form. The bottom-line-up-front message +is, don't use LLMs to create relevance judgments for TREC-style evaluations. + +
+
+
+
+
+ + ☆ EMERS: Energy Meter for Recommender Systems + + +
+ Due to recent advancements in machine learning, recommender systems use +increasingly more energy for training, evaluation, and deployment. However, the +recommender systems community often does not report the energy consumption of +their experiments. In today's research landscape, no tools exist to easily +measure the energy consumption of recommender systems experiments. To bridge +this gap, we introduce EMERS, the first software library that simplifies +measuring, monitoring, recording, and sharing the energy consumption of +recommender systems experiments. EMERS measures energy consumption with smart +power plugs and offers a user interface to monitor and compare the energy +consumption of recommender systems experiments. Thereby, EMERS improves +sustainability awareness and simplifies self-reporting energy consumption for +recommender systems practitioners and researchers. + +
+
+ comment: Accepted at the RecSoGood 2024 Workshop co-located with the 18th ACM + Conference on Recommender Systems +
+
+
+
+
+ + ☆ ViBERTgrid BiLSTM-CRF: Multimodal Key Information Extraction from + Unstructured Financial Documents ECML + + +
+ Multimodal key information extraction (KIE) models have been studied +extensively on semi-structured documents. However, their investigation on +unstructured documents is an emerging research topic. The paper presents an +approach to adapt a multimodal transformer (i.e., ViBERTgrid previously +explored on semi-structured documents) for unstructured financial documents, by +incorporating a BiLSTM-CRF layer. The proposed ViBERTgrid BiLSTM-CRF model +demonstrates a significant improvement in performance (up to 2 percentage +points) on named entity recognition from unstructured documents in financial +domain, while maintaining its KIE performance on semi-structured documents. As +an additional contribution, we publicly released token-level annotations for +the SROIE dataset in order to pave the way for its use in multimodal sequence +labeling models. + +
+
+ comment: Accepted in MIDAS (The 8th Workshop on MIning DAta for financial + applicationS) workshop of ECML PKDD 2023 conference +
+
+
+
+
+ + ☆ Adaptive Learning on User Segmentation: Universal to Specific + Representation via Bipartite Neural Interaction + + +
+ Recently, models for user representation learning have been widely applied in +click-through-rate (CTR) and conversion-rate (CVR) prediction. Usually, the +model learns a universal user representation as the input for subsequent +scenario-specific models. However, in numerous industrial applications (e.g., +recommendation and marketing), the business always operates such applications +as various online activities among different user segmentation. These +segmentation are always created by domain experts. Due to the difference in +user distribution (i.e., user segmentation) and business objectives in +subsequent tasks, learning solely on universal representation may lead to +detrimental effects on both model performance and robustness. In this paper, we +propose a novel learning framework that can first learn general universal user +representation through information bottleneck. Then, merge and learn a +segmentation-specific or a task-specific representation through neural +interaction. We design the interactive learning process by leveraging a +bipartite graph architecture to model the representation learning and merging +between contextual clusters and each user segmentation. Our proposed method is +evaluated in two open-source benchmarks, two offline business datasets, and +deployed on two online marketing applications to predict users' CVR. The +results demonstrate that our method can achieve superior performance and +surpass the baseline methods. + +
+
+
+
+
+ + ☆ FedSlate:A Federated Deep Reinforcement Learning Recommender System + + +
+ Reinforcement learning methods have been used to optimize long-term user +engagement in recommendation systems. However, existing reinforcement +learning-based recommendation systems do not fully exploit the relevance of +individual user behavior across different platforms. One potential solution is +to aggregate data from various platforms in a centralized location and use the +aggregated data for training. However, this approach raises economic and legal +concerns, including increased communication costs and potential threats to user +privacy. To address these challenges, we propose \textbf{FedSlate}, a federated +reinforcement learning recommendation algorithm that effectively utilizes +information that is prohibited from being shared at a legal level. We employ +the SlateQ algorithm to assist FedSlate in learning users' long-term behavior +and evaluating the value of recommended content. We extend the existing +application scope of recommendation systems from single-user single-platform to +single-user multi-platform and address cross-platform learning challenges by +introducing federated learning. We use RecSim to construct a simulation +environment for evaluating FedSlate and compare its performance with +state-of-the-art benchmark recommendation models. Experimental results +demonstrate the superior effects of FedSlate over baseline methods in various +environmental settings, and FedSlate facilitates the learning of recommendation +strategies in scenarios where baseline methods are completely inapplicable. +Code is available at \textit{https://github.com/TianYaDY/FedSlate}. + +
+
+
+
+
+ + ☆ Pre-trained Language Model and Knowledge Distillation for Lightweight + Sequential Recommendation + + +
+ Sequential recommendation models user interests based on historical behaviors +to provide personalized recommendation. Previous sequential recommendation +algorithms primarily employ neural networks to extract features of user +interests, achieving good performance. However, due to the recommendation +system datasets sparsity, these algorithms often employ small-scale network +frameworks, resulting in weaker generalization capability. Recently, a series +of sequential recommendation algorithms based on large pre-trained language +models have been proposed. Nonetheless, given the real-time demands of +recommendation systems, the challenge remains in applying pre-trained language +models for rapid recommendations in real scenarios. To address this, we propose +a sequential recommendation algorithm based on a pre-trained language model and +knowledge distillation. The key of proposed algorithm is to transfer +pre-trained knowledge across domains and achieve lightweight inference by +knowledge distillation. The algorithm operates in two stages: in the first +stage, we fine-tune the pre-trained language model on the recommendation +dataset to transfer the pre-trained knowledge to the recommendation task; in +the second stage, we distill the trained language model to transfer the learned +knowledge to a lightweight model. Extensive experiments on multiple public +recommendation datasets show that the proposed algorithm enhances +recommendation accuracy and provide timely recommendation services. + +
+
+ comment: in Chinese language +
+
+
+
+
+ + ☆ EDGE-Rec: Efficient and Data-Guided Edge Diffusion For Recommender + Systems Graphs + + +
+ Most recommender systems research focuses on binary historical user-item +interaction encodings to predict future interactions. User features, item +features, and interaction strengths remain largely under-utilized in this space +or only indirectly utilized, despite proving largely effective in large-scale +production recommendation systems. We propose a new attention mechanism, +loosely based on the principles of collaborative filtering, called Row-Column +Separable Attention RCSA to take advantage of real-valued interaction weights +as well as user and item features directly. Building on this mechanism, we +additionally propose a novel Graph Diffusion Transformer GDiT architecture +which is trained to iteratively denoise the weighted interaction matrix of the +user-item interaction graph directly. The weighted interaction matrix is built +from the bipartite structure of the user-item interaction graph and +corresponding edge weights derived from user-item rating interactions. Inspired +by the recent progress in text-conditioned image generation, our method +directly produces user-item rating predictions on the same scale as the +original ratings by conditioning the denoising process on user and item +features with a principled approach. + +
+
+ comment: 6 pages, 13 figures +
+
+
+
+
+ + ☆ Reducing the Footprint of Multi-Vector Retrieval with Minimal + Performance Impact via Token Pooling + + +
+ Over the last few years, multi-vector retrieval methods, spearheaded by +ColBERT, have become an increasingly popular approach to Neural IR. By storing +representations at the token level rather than at the document level, these +methods have demonstrated very strong retrieval performance, especially in +out-of-domain settings. However, the storage and memory requirements necessary +to store the large number of associated vectors remain an important drawback, +hindering practical adoption. In this paper, we introduce a simple +clustering-based token pooling approach to aggressively reduce the number of +vectors that need to be stored. This method can reduce the space & memory +footprint of ColBERT indexes by 50% with virtually no retrieval performance +degradation. This method also allows for further reductions, reducing the +vector count by 66%-to-75% , with degradation remaining below 5% on a vast +majority of datasets. Importantly, this approach requires no architectural +change nor query-time processing, and can be used as a simple drop-in during +indexation with any ColBERT-like model. + +
+
+
+
+
+ + ☆ Robust Training Objectives Improve Embedding-based Retrieval in + Industrial Recommendation Systems RecSys + + +
+ Improving recommendation systems (RS) can greatly enhance the user experience +across many domains, such as social media. Many RS utilize embedding-based +retrieval (EBR) approaches to retrieve candidates for recommendation. In an EBR +system, the embedding quality is key. According to recent literature, +self-supervised multitask learning (SSMTL) has showed strong performance on +academic benchmarks in embedding learning and resulted in an overall +improvement in multiple downstream tasks, demonstrating a larger resilience to +the adverse conditions between each downstream task and thereby increased +robustness and task generalization ability through the training objective. +However, whether or not the success of SSMTL in academia as a robust training +objectives translates to large-scale (i.e., over hundreds of million users and +interactions in-between) industrial RS still requires verification. Simply +adopting academic setups in industrial RS might entail two issues. Firstly, +many self-supervised objectives require data augmentations (e.g., embedding +masking/corruption) over a large portion of users and items, which is +prohibitively expensive in industrial RS. Furthermore, some self-supervised +objectives might not align with the recommendation task, which might lead to +redundant computational overheads or negative transfer. In light of these two +challenges, we evaluate using a robust training objective, specifically SSMTL, +through a large-scale friend recommendation system on a social media platform +in the tech sector, identifying whether this increase in robustness can work at +scale in enhancing retrieval in the production setting. Through online A/B +testing with SSMTL-based EBR, we observe statistically significant increases in +key metrics in the friend recommendations, with up to 5.45% improvements in new +friends made and 1.91% improvements in new friends made with cold-start users. + +
+
+ comment: RobustRecSys workshop @ RecSys 2024 +
+
+
+
+
+ + ♻ ☆ GNNAnatomy: Systematic Generation and Evaluation of Multi-Level + Explanations for Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) excel in machine learning tasks involving +graphs, such as node classification, graph classification, and link prediction. +However, explaining their decision-making process is challenging due to the +complex transformations GNNs perform by aggregating relational information from +graph topology. Existing methods for explaining GNNs face key limitations: (1) +lack of flexibility in generating explanations at varying levels, (2) +difficulty in identifying unique substructures relevant to class +differentiation, and (3) little support to ensure the trustworthiness of +explanations. To address these challenges, we introduce GNNAnatomy, a visual +analytics system designed to generate and evaluate multi-level GNN explanations +for graph classification tasks. GNNAnatomy uses graphlets, primitive graph +substructures, to identify the most critical substructures in a graph class by +analyzing the correlation between GNN predictions and graphlet frequencies. +These correlations are presented interactively for user-selected group of +graphs through our visual analytics system. To further validate top-ranked +graphlets, we measure the change in classification confidence after removing +each graphlet from the original graph. We demonstrate the effectiveness of +GNNAnatomy through case studies on synthetic and real-world graph datasets from +sociology and biology domains. Additionally, we compare GNNAnatomy with +state-of-the-art explainable GNN methods to showcase its utility and +versatility. + +
+
+
+
+
+
+
+
+ + Multimedia 10 + +
+
+
+ + ☆ Revise, Reason, and Recognize: LLM-Based Emotion Recognition via + Emotion-Specific Prompts and ASR Error Correction + + +
+ Annotating and recognizing speech emotion using prompt engineering has +recently emerged with the advancement of Large Language Models (LLMs), yet its +efficacy and reliability remain questionable. In this paper, we conduct a +systematic study on this topic, beginning with the proposal of novel prompts +that incorporate emotion-specific knowledge from acoustics, linguistics, and +psychology. Subsequently, we examine the effectiveness of LLM-based prompting +on Automatic Speech Recognition (ASR) transcription, contrasting it with +ground-truth transcription. Furthermore, we propose a Revise-Reason-Recognize +prompting pipeline for robust LLM-based emotion recognition from spoken +language with ASR errors. Additionally, experiments on context-aware learning, +in-context learning, and instruction tuning are performed to examine the +usefulness of LLM training schemes in this direction. Finally, we investigate +the sensitivity of LLMs to minor prompt variations. Experimental results +demonstrate the efficacy of the emotion-specific prompts, ASR error correction, +and LLM training schemes for LLM-based emotion recognition. Our study aims to +refine the use of LLMs in emotion recognition and related domains. + +
+
+
+
+
+ + ☆ Rethinking Emotion Bias in Music via Frechet Audio Distance + + +
+ The subjective nature of music emotion introduces inherent bias in both +recognition and generation, especially when relying on a single audio encoder, +emotion classifier, or evaluation metric. In this work, we conduct a study on +Music Emotion Recognition (MER) and Emotional Music Generation (EMG), employing +diverse audio encoders alongside the Frechet Audio Distance (FAD), a +reference-free evaluation metric. Our study begins with a benchmark evaluation +of MER, highlighting the limitations associated with using a single audio +encoder and the disparities observed across different measurements. We then +propose assessing MER performance using FAD from multiple encoders to provide a +more objective measure of music emotion. Furthermore, we introduce an enhanced +EMG approach designed to improve both the variation and prominence of generated +music emotion, thus enhancing realism. Additionally, we investigate the realism +disparities between the emotions conveyed in real and synthetic music, +comparing our EMG model against two baseline models. Experimental results +underscore the emotion bias problem in both MER and EMG and demonstrate the +potential of using FAD and diverse audio encoders to evaluate music emotion +objectively. + +
+
+
+
+
+ + ☆ LoVA: Long-form Video-to-Audio Generation ICASSP 2025 + + +
+ Video-to-audio (V2A) generation is important for video editing and +post-processing, enabling the creation of semantics-aligned audio for silent +video. However, most existing methods focus on generating short-form audio for +short video segment (less than 10 seconds), while giving little attention to +the scenario of long-form video inputs. For current UNet-based diffusion V2A +models, an inevitable problem when handling long-form audio generation is the +inconsistencies within the final concatenated audio. In this paper, we first +highlight the importance of long-form V2A problem. Besides, we propose LoVA, a +novel model for Long-form Video-to-Audio generation. Based on the Diffusion +Transformer (DiT) architecture, LoVA proves to be more effective at generating +long-form audio compared to existing autoregressive models and UNet-based +diffusion models. Extensive objective and subjective experiments demonstrate +that LoVA achieves comparable performance on 10-second V2A benchmark and +outperforms all other baselines on a benchmark with long-form video input. + +
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ DanceCamAnimator: Keyframe-Based Controllable 3D Dance Camera Synthesis + + +
+ Synthesizing camera movements from music and dance is highly challenging due +to the contradicting requirements and complexities of dance cinematography. +Unlike human movements, which are always continuous, dance camera movements +involve both continuous sequences of variable lengths and sudden drastic +changes to simulate the switching of multiple cameras. However, in previous +works, every camera frame is equally treated and this causes jittering and +unavoidable smoothing in post-processing. To solve these problems, we propose +to integrate animator dance cinematography knowledge by formulating this task +as a three-stage process: keyframe detection, keyframe synthesis, and tween +function prediction. Following this formulation, we design a novel end-to-end +dance camera synthesis framework \textbf{DanceCamAnimator}, which imitates +human animation procedures and shows powerful keyframe-based controllability +with variable lengths. Extensive experiments on the DCM dataset demonstrate +that our method surpasses previous baselines quantitatively and qualitatively. +Code will be available at +\url{https://github.com/Carmenw1203/DanceCamAnimator-Official}. + +
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+ + ☆ RoWSFormer: A Robust Watermarking Framework with Swin Transformer for + Enhanced Geometric Attack Resilience + + +
+ In recent years, digital watermarking techniques based on deep learning have +been widely studied. To achieve both imperceptibility and robustness of image +watermarks, most current methods employ convolutional neural networks to build +robust watermarking frameworks. However, despite the success of CNN-based +watermarking models, they struggle to achieve robustness against geometric +attacks due to the limitations of convolutional neural networks in capturing +global and long-range relationships. To address this limitation, we propose a +robust watermarking framework based on the Swin Transformer, named RoWSFormer. +Specifically, we design the Locally-Channel Enhanced Swin Transformer Block as +the core of both the encoder and decoder. This block utilizes the +self-attention mechanism to capture global and long-range information, thereby +significantly improving adaptation to geometric distortions. Additionally, we +construct the Frequency-Enhanced Transformer Block to extract frequency domain +information, which further strengthens the robustness of the watermarking +framework. Experimental results demonstrate that our RoWSFormer surpasses +existing state-of-the-art watermarking methods. For most non-geometric attacks, +RoWSFormer improves the PSNR by 3 dB while maintaining the same extraction +accuracy. In the case of geometric attacks (such as rotation, scaling, and +affine transformations), RoWSFormer achieves over a 6 dB improvement in PSNR, +with extraction accuracy exceeding 97\%. + +
+
+
+
+
+ + ☆ AIM 2024 Challenge on Video Saliency Prediction: Methods and Results ECCV + + +
+ This paper reviews the Challenge on Video Saliency Prediction at AIM 2024. +The goal of the participants was to develop a method for predicting accurate +saliency maps for the provided set of video sequences. Saliency maps are widely +exploited in various applications, including video compression, quality +assessment, visual perception studies, the advertising industry, etc. For this +competition, a previously unused large-scale audio-visual mouse saliency +(AViMoS) dataset of 1500 videos with more than 70 observers per video was +collected using crowdsourced mouse tracking. The dataset collection methodology +has been validated using conventional eye-tracking data and has shown high +consistency. Over 30 teams registered in the challenge, and there are 7 teams +that submitted the results in the final phase. The final phase solutions were +tested and ranked by commonly used quality metrics on a private test subset. +The results of this evaluation and the descriptions of the solutions are +presented in this report. All data, including the private test subset, is made +publicly available on the challenge homepage - +https://challenges.videoprocessing.ai/challenges/video-saliency-prediction.html. + +
+
+ comment: ECCVW 2024 +
+
+
+
+
+ + ☆ A Multimedia Framework for Continuum Robots: Systematic, Computational, + and Control Perspectives + + +
+ Continuum robots, which often rely on interdisciplinary and multimedia +collaborations, have been increasingly recognized for their potential to +revolutionize the field of human-robot interaction (HRI) in varied applications +due to their adaptive, responsive, and flexible characteristics. Despite their +promises, the lack of an integrated framework poses significant challenges for +both users and developers, resulting in inefficiency and complexity during +preliminary developments. Thus, this paper introduces a unified framework for +bionic robotics that addresses these challenges by integrating system +architecture, dynamics computation, and control strategy. The proposed method +allows for efficient modeling and quick preview of the results in both digital +and physical environments, which can enhance the quality of robot developments. + +
+
+ comment: 7 pages, 8 figures +
+
+
+
+
+ + ☆ MemeCLIP: Leveraging CLIP Representations for Multimodal Meme + Classification EMNLP 2024 + + +
+ The complexity of text-embedded images presents a formidable challenge in +machine learning given the need for multimodal understanding of the multiple +aspects of expression conveyed in them. While previous research in multimodal +analysis has primarily focused on singular aspects such as hate speech and its +subclasses, our study expands the focus to encompass multiple aspects of +linguistics: hate, target, stance, and humor detection. We introduce a novel +dataset PrideMM comprising text-embedded images associated with the LGBTQ+ +Pride movement, thereby addressing a serious gap in existing resources. We +conduct extensive experimentation on PrideMM by using unimodal and multimodal +baseline methods to establish benchmarks for each task. Additionally, we +propose a novel framework MemeCLIP for efficient downstream learning while +preserving the knowledge of the pre-trained CLIP model. The results of our +experiments show that MemeCLIP achieves superior performance compared to +previously proposed frameworks on two real-world datasets. We further compare +the performance of MemeCLIP and zero-shot GPT-4 on the hate classification +task. Finally, we discuss the shortcomings of our model by qualitatively +analyzing misclassified samples. Our code and dataset are publicly available +at: https://github.com/SiddhantBikram/MemeCLIP. + +
+
+ comment: Accepted to EMNLP 2024 (Main) +
+
+
+
+
+ + ♻ ☆ Feeding the Crave: How People with Eating Disorders Get Trapped in the + Perpetual Cycle of Digital Food Content + + +
+ Recent studies have examined how digital food content impacts viewers' +dietary health. A few have found that individuals with eating disorders are +particularly sensitive to digital food content, such as eating and cooking +videos, which contribute to disordered eating behaviors. However, there is a +lack of comprehensive studies that investigate how these individuals interact +with various digital food content. To fill this gap, we conducted two rounds of +studies (N=23 and 22, respectively) with individuals with eating disorders to +understand their motivations and practices of consuming digital food content. +Our study reveals that participants anticipate positive effects from food media +to overcome their condition, but in practice, it often exacerbates their +disorder. We also discovered that many participants experienced a cycle of +quitting and returning to digital food content consumption. Based on these +findings, we articulate design implications for digital food content and +multimedia platforms to support vulnerable individuals. + +
+
+ comment: 25 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ SVDD 2024: The Inaugural Singing Voice Deepfake Detection Challenge + + +
+ With the advancements in singing voice generation and the growing presence of +AI singers on media platforms, the inaugural Singing Voice Deepfake Detection +(SVDD) Challenge aims to advance research in identifying AI-generated singing +voices from authentic singers. This challenge features two tracks: a controlled +setting track (CtrSVDD) and an in-the-wild scenario track (WildSVDD). The +CtrSVDD track utilizes publicly available singing vocal data to generate +deepfakes using state-of-the-art singing voice synthesis and conversion +systems. Meanwhile, the WildSVDD track expands upon the existing SingFake +dataset, which includes data sourced from popular user-generated content +websites. For the CtrSVDD track, we received submissions from 47 teams, with 37 +surpassing our baselines and the top team achieving a 1.65% equal error rate. +For the WildSVDD track, we benchmarked the baselines. This paper reviews these +results, discusses key findings, and outlines future directions for SVDD +research. + +
+
+ comment: 6 pages, Accepted by 2024 IEEE Spoken Language Technology Workshop + (SLT 2024) +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 4 + +
+
+
+ + ☆ Nirjas: An open source framework for extracting metadata from the source + code + + +
+ Metadata and comments are critical elements of any software development +process. In this paper, we explain how metadata and comments in source code can +play an essential role in comprehending software. We introduce a Python-based +open-source framework, Nirjas, which helps in extracting this metadata in a +structured manner. Various syntaxes, types, and widely accepted conventions +exist for adding comments in source files of different programming languages. +Edge cases can create noise in extraction, for which we use Regex to accurately +retrieve metadata. Non-Regex methods can give results but often miss accuracy +and noise separation. Nirjas also separates different types of comments, source +code, and provides details about those comments, such as line number, file +name, language used, total SLOC, etc. Nirjas is a standalone Python +framework/library and can be easily installed via source or pip (the Python +package installer). Nirjas was initially created as part of a Google Summer of +Code project and is currently developed and maintained under the FOSSology +organization. + +
+
+ comment: 2022 12th International Conference on Cloud Computing, Data Science & + Engineering (Confluence) +
+
+
+
+
+ + ☆ Beyond Words: Evaluating Large Language Models in Transportation + Planning + + +
+ The resurgence and rapid advancement of Generative Artificial Intelligence +(GenAI) in 2023 has catalyzed transformative shifts across numerous industry +sectors, including urban transportation and logistics. This study investigates +the evaluation of Large Language Models (LLMs), specifically GPT-4 and +Phi-3-mini, to enhance transportation planning. The study assesses the +performance and spatial comprehension of these models through a +transportation-informed evaluation framework that includes general geospatial +skills, general transportation domain skills, and real-world transportation +problem-solving. Utilizing a mixed-methods approach, the research encompasses +an evaluation of the LLMs' general Geographic Information System (GIS) skills, +general transportation domain knowledge as well as abilities to support human +decision-making in the real-world transportation planning scenarios of +congestion pricing. Results indicate that GPT-4 demonstrates superior accuracy +and reliability across various GIS and transportation-specific tasks compared +to Phi-3-mini, highlighting its potential as a robust tool for transportation +planners. Nonetheless, Phi-3-mini exhibits competence in specific analytical +scenarios, suggesting its utility in resource-constrained environments. The +findings underscore the transformative potential of GenAI technologies in urban +transportation planning. Future work could explore the application of newer +LLMs and the impact of Retrieval-Augmented Generation (RAG) techniques, on a +broader set of real-world transportation planning and operations challenges, to +deepen the integration of advanced AI models in transportation management +practices. + +
+
+
+
+
+ + ♻ ☆ XRec: Large Language Models for Explainable Recommendation EMNLP 2024 + + +
+ Recommender systems help users navigate information overload by providing +personalized recommendations aligned with their preferences. Collaborative +Filtering (CF) is a widely adopted approach, but while advanced techniques like +graph neural networks (GNNs) and self-supervised learning (SSL) have enhanced +CF models for better user representations, they often lack the ability to +provide explanations for the recommended items. Explainable recommendations aim +to address this gap by offering transparency and insights into the +recommendation decision-making process, enhancing users' understanding. This +work leverages the language capabilities of Large Language Models (LLMs) to +push the boundaries of explainable recommender systems. We introduce a +model-agnostic framework called XRec, which enables LLMs to provide +comprehensive explanations for user behaviors in recommender systems. By +integrating collaborative signals and designing a lightweight collaborative +adaptor, the framework empowers LLMs to understand complex patterns in +user-item interactions and gain a deeper understanding of user preferences. Our +extensive experiments demonstrate the effectiveness of XRec, showcasing its +ability to generate comprehensive and meaningful explanations that outperform +baseline approaches in explainable recommender systems. We open-source our +model implementation at https://github.com/HKUDS/XRec. + +
+
+ comment: Accepted to EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Analyzing the Effectiveness of Listwise Reranking with Positional + Invariance on Temporal Generalizability + + +
+ This working note outlines our participation in the retrieval task at CLEF +2024. We highlight the considerable gap between studying retrieval performance +on static knowledge documents and understanding performance in real-world +environments. Therefore, Addressing these discrepancies and measuring the +temporal persistence of IR systems is crucial. By investigating the LongEval +benchmark, specifically designed for such dynamic environments, our findings +demonstrate the effectiveness of a listwise reranking approach, which +proficiently handles inaccuracies induced by temporal distribution shifts. +Among listwise rerankers, our findings show that ListT5, which effectively +mitigates the positional bias problem by adopting the Fusion-in-Decoder +architecture, is especially effective, and more so, as temporal drift +increases, on the test-long subset. + +
+
+ comment: Accepted at CLEF 2024 LongEval track +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Self-Supervised Audio-Visual Soundscape Stylization ECCV 2024 + + +
+ Speech sounds convey a great deal of information about the scenes, resulting +in a variety of effects ranging from reverberation to additional ambient +sounds. In this paper, we manipulate input speech to sound as though it was +recorded within a different scene, given an audio-visual conditional example +recorded from that scene. Our model learns through self-supervision, taking +advantage of the fact that natural video contains recurring sound events and +textures. We extract an audio clip from a video and apply speech enhancement. +We then train a latent diffusion model to recover the original speech, using +another audio-visual clip taken from elsewhere in the video as a conditional +hint. Through this process, the model learns to transfer the conditional +example's sound properties to the input speech. We show that our model can be +successfully trained using unlabeled, in-the-wild videos, and that an +additional visual signal can improve its sound prediction abilities. Please see +our project webpage for video results: +https://tinglok.netlify.app/files/avsoundscape/ + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Scene-Text Grounding for Text-Based Video Question Answering + + +
+ Existing efforts in text-based video question answering (TextVideoQA) are +criticized for their opaque decisionmaking and heavy reliance on scene-text +recognition. In this paper, we propose to study Grounded TextVideoQA by forcing +models to answer questions and spatio-temporally localize the relevant +scene-text regions, thus decoupling QA from scenetext recognition and promoting +research towards interpretable QA. The task has three-fold significance. First, +it encourages scene-text evidence versus other short-cuts for answer +predictions. Second, it directly accepts scene-text regions as visual answers, +thus circumventing the problem of ineffective answer evaluation by stringent +string matching. Third, it isolates the challenges inherited in VideoQA and +scene-text recognition. This enables the diagnosis of the root causes for +failure predictions, e.g., wrong QA or wrong scene-text recognition? To achieve +Grounded TextVideoQA, we propose the T2S-QA model that highlights a +disentangled temporal-to-spatial contrastive learning strategy for +weakly-supervised scene-text grounding and grounded TextVideoQA. To facilitate +evaluation, we construct a new dataset ViTXT-GQA which features 52K scene-text +bounding boxes within 2.2K temporal segments related to 2K questions and 729 +videos. With ViTXT-GQA, we perform extensive experiments and demonstrate the +severe limitations of existing techniques in Grounded TextVideoQA. While T2S-QA +achieves superior results, the large performance gap with human leaves ample +space for improvement. Our further analysis of oracle scene-text inputs posits +that the major challenge is scene-text recognition. To advance the research of +Grounded TextVideoQA, our dataset and code are at +\url{https://github.com/zhousheng97/ViTXT-GQA.git} + +
+
+
+
+
+ + ♻ ☆ Self-similarity Prior Distillation for Unsupervised Remote Physiological + Measurement + + +
+ Remote photoplethysmography (rPPG) is a noninvasive technique that aims to +capture subtle variations in facial pixels caused by changes in blood volume +resulting from cardiac activities. Most existing unsupervised methods for rPPG +tasks focus on the contrastive learning between samples while neglecting the +inherent self-similar prior in physiological signals. In this paper, we propose +a Self-Similarity Prior Distillation (SSPD) framework for unsupervised rPPG +estimation, which capitalizes on the intrinsic self-similarity of cardiac +activities. Specifically, we first introduce a physical-prior embedded +augmentation technique to mitigate the effect of various types of noise. Then, +we tailor a self-similarity-aware network to extract more reliable self-similar +physiological features. Finally, we develop a hierarchical self-distillation +paradigm to assist the network in disentangling self-similar physiological +patterns from facial videos. Comprehensive experiments demonstrate that the +unsupervised SSPD framework achieves comparable or even superior performance +compared to the state-of-the-art supervised methods. Meanwhile, SSPD maintains +the lowest inference time and computation cost among end-to-end models. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ Revisiting BPR: A Replicability Study of a Common Recommender System + Baseline RecSys + '24 + + +
+ Bayesian Personalized Ranking (BPR), a collaborative filtering approach based +on matrix factorization, frequently serves as a benchmark for recommender +systems research. However, numerous studies often overlook the nuances of BPR +implementation, claiming that it performs worse than newly proposed methods +across various tasks. In this paper, we thoroughly examine the features of the +BPR model, indicating their impact on its performance, and investigate +open-source BPR implementations. Our analysis reveals inconsistencies between +these implementations and the original BPR paper, leading to a significant +decrease in performance of up to 50% for specific implementations. Furthermore, +through extensive experiments on real-world datasets under modern evaluation +settings, we demonstrate that with proper tuning of its hyperparameters, the +BPR model can achieve performance levels close to state-of-the-art methods on +the top-n recommendation tasks and even outperform them on specific datasets. +Specifically, on the Million Song Dataset, the BPR model with hyperparameters +tuning statistically significantly outperforms Mult-VAE by 10% in NDCG@100 with +binary relevance function. + +
+
+ comment: This paper is accepted at the Reproducibility track of the ACM RecSys + '24 conference +
+
+
+
+
+ + ☆ Knowledge in Triples for LLMs: Enhancing Table QA Accuracy with Semantic + Extraction + + +
+ Integrating structured knowledge from tabular formats poses significant +challenges within natural language processing (NLP), mainly when dealing with +complex, semi-structured tables like those found in the FeTaQA dataset. These +tables require advanced methods to interpret and generate meaningful responses +accurately. Traditional approaches, such as SQL and SPARQL, often fail to fully +capture the semantics of such data, especially in the presence of irregular +table structures like web tables. This paper addresses these challenges by +proposing a novel approach that extracts triples straightforward from tabular +data and integrates it with a retrieval-augmented generation (RAG) model to +enhance the accuracy, coherence, and contextual richness of responses generated +by a fine-tuned GPT-3.5-turbo-0125 model. Our approach significantly +outperforms existing baselines on the FeTaQA dataset, particularly excelling in +Sacre-BLEU and ROUGE metrics. It effectively generates contextually accurate +and detailed long-form answers from tables, showcasing its strength in complex +data interpretation. + +
+
+
+
+
+ + ☆ Data Generation via Latent Factor Simulation for Fairness-aware + Re-ranking + + +
+ Synthetic data is a useful resource for algorithmic research. It allows for +the evaluation of systems under a range of conditions that might be difficult +to achieve in real world settings. In recommender systems, the use of synthetic +data is somewhat limited; some work has concentrated on building user-item +interaction data at large scale. We believe that fairness-aware recommendation +research can benefit from simulated data as it allows the study of protected +groups and their interactions without depending on sensitive data that needs +privacy protection. In this paper, we propose a novel type of data for +fairness-aware recommendation: synthetic recommender system outputs that can be +used to study re-ranking algorithms. + +
+
+
+
+
+ + ☆ OAEI-LLM: A Benchmark Dataset for Understanding Large Language Model + Hallucinations in Ontology Matching + + +
+ Hallucinations of large language models (LLMs) commonly occur in +domain-specific downstream tasks, with no exception in ontology matching (OM). +The prevalence of using LLMs for OM raises the need for benchmarks to better +understand LLM hallucinations. The OAEI-LLM dataset is an extended version of +the Ontology Alignment Evaluation Initiative (OAEI) datasets that evaluate +LLM-specific hallucinations in OM tasks. We outline the methodology used in +dataset construction and schema extension, and provide examples of potential +use cases. + +
+
+ comment: 4 pages, 1 figure +
+
+
+
+
+ + ☆ Cost-Effective Community-Hierarchy-Based Mutual Voting Approach for + Influence Maximization in Complex Networks + + +
+ Various types of promising techniques have come into being for influence +maximization whose aim is to identify influential nodes in complex networks. In +essence, real-world applications usually have high requirements on the balance +between time complexity and accuracy of influential nodes identification. To +address the challenges of imperfect node influence measurement and inefficient +seed nodes selection mechanism in such class of foregoing techniques, this +article proposes a novel approach called Cost-Effective +Community-Hierarchy-Based Mutual Voting for influence maximization in complex +networks. First, we develop a method for measuring the importance of different +nodes in networks based on an original concept of Dual-Scale +Community-Hierarchy Information that synthesizes both hierarchy structural +information and community structural information of nodes. The community +structural information contained in the nodes is measured by a new notion of +Hierarchical-Community Entropy. Second, we develop a method named +Cost-Effective Mutual-Influence-based Voting for seed nodes selection. +Hereinto, a low-computational-cost mutual voting mechanism and an updating +strategy called Lazy Score Updating Strategy are newly constructed for +optimizing the selecting of seed nodes. Third, we develop a balance index to +evaluate the performance of different methods in striking the tradeoff between +time complexity and the accuracy of influential nodes identification. Finally, +we demonstrate the approach performance over ten public datasets. The extensive +experiments show that the proposed approach outperforms 16 state-of-the-art +techniques on the balance between time complexity and accuracy of influential +nodes identification. Compared with the method with the second highest value of +the balance index, our approach can be improved by at most 9.29%. + +
+
+
+
+
+ + ♻ ☆ DRAGIN: Dynamic Retrieval Augmented Generation based on the Information + Needs of Large Language Models + + +
+ Dynamic retrieval augmented generation (RAG) paradigm actively decides when +and what to retrieve during the text generation process of Large Language +Models (LLMs). There are two key elements of this paradigm: identifying the +optimal moment to activate the retrieval module (deciding when to retrieve) and +crafting the appropriate query once retrieval is triggered (determining what to +retrieve). However, current dynamic RAG methods fall short in both aspects. +Firstly, the strategies for deciding when to retrieve often rely on static +rules. Moreover, the strategies for deciding what to retrieve typically limit +themselves to the LLM's most recent sentence or the last few tokens, while the +LLM's real-time information needs may span across the entire context. To +overcome these limitations, we introduce a new framework, DRAGIN, i.e., Dynamic +Retrieval Augmented Generation based on the real-time Information Needs of +LLMs. Our framework is specifically designed to make decisions on when and what +to retrieve based on the LLM's real-time information needs during the text +generation process. We evaluate DRAGIN along with existing methods +comprehensively over 4 knowledge-intensive generation datasets. Experimental +results show that DRAGIN achieves superior performance on all tasks, +demonstrating the effectiveness of our method. We have open-sourced all the +code, data, and models in GitHub: https://github.com/oneal2000/DRAGIN/tree/main + +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ BRep Boundary and Junction Detection for CAD Reverse Engineering + + +
+ In machining process, 3D reverse engineering of the mechanical system is an +integral, highly important, and yet time consuming step to obtain parametric +CAD models from 3D scans. Therefore, deep learning-based Scan-to-CAD modeling +can offer designers enormous editability to quickly modify CAD model, being +able to parse all its structural compositions and design steps. In this paper, +we propose a supervised boundary representation (BRep) detection network +BRepDetNet from 3D scans of CC3D and ABC dataset. We have carefully annotated +the 50K and 45K scans of both the datasets with appropriate topological +relations (e.g., next, mate, previous) between the geometrical primitives +(i.e., boundaries, junctions, loops, faces) of their BRep data structures. The +proposed solution decomposes the Scan-to-CAD problem in Scan-to-BRep ensuring +the right step towards feature-based modeling, and therefore, leveraging other +existing BRep-to-CAD modeling methods. Our proposed Scan-to-BRep neural network +learns to detect BRep boundaries and junctions by minimizing focal-loss and +non-maximal suppression (NMS) during training time. Experimental results show +that our BRepDetNet with NMS-Loss achieves impressive results. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ CUS3D :CLIP-based Unsupervised 3D Segmentation via Object-level Denoise + + +
+ To ease the difficulty of acquiring annotation labels in 3D data, a common +method is using unsupervised and open-vocabulary semantic segmentation, which +leverage 2D CLIP semantic knowledge. In this paper, unlike previous research +that ignores the ``noise'' raised during feature projection from 2D to 3D, we +propose a novel distillation learning framework named CUS3D. In our approach, +an object-level denosing projection module is designed to screen out the +``noise'' and ensure more accurate 3D feature. Based on the obtained features, +a multimodal distillation learning module is designed to align the 3D feature +with CLIP semantic feature space with object-centered constrains to achieve +advanced unsupervised semantic segmentation. We conduct comprehensive +experiments in both unsupervised and open-vocabulary segmentation, and the +results consistently showcase the superiority of our model in achieving +advanced unsupervised segmentation results and its effectiveness in +open-vocabulary segmentation. + +
+
+ comment: 6 pages,3 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 17 + +
+
+
+ + ☆ Causal Feature Selection Method for Contextual Multi-Armed Bandits in + Recommender System + + +
+ Features (a.k.a. context) are critical for contextual multi-armed bandits +(MAB) performance. In practice of large scale online system, it is important to +select and implement important features for the model: missing important +features can led to sub-optimal reward outcome, and including irrelevant +features can cause overfitting, poor model interpretability, and implementation +cost. However, feature selection methods for conventional machine learning +models fail short for contextual MAB use cases, as conventional methods select +features correlated with the outcome variable, but not necessarily causing +heterogeneuous treatment effect among arms which are truely important for +contextual MAB. In this paper, we introduce model-free feature selection +methods designed for contexutal MAB problem, based on heterogeneous causal +effect contributed by the feature to the reward distribution. Empirical +evaluation is conducted based on synthetic data as well as real data from an +online experiment for optimizing content cover image in a recommender system. +The results show this feature selection method effectively selects the +important features that lead to higher contextual MAB reward than unimportant +features. Compared with model embedded method, this model-free method has +advantage of fast computation speed, ease of implementation, and prune of model +mis-specification issues. + +
+
+
+
+
+ + ☆ Segment Discovery: Enhancing E-commerce Targeting RecSys'24 + + +
+ Modern e-commerce services frequently target customers with incentives or +interventions to engage them in their products such as games, shopping, video +streaming, etc. This customer engagement increases acquisition of more +customers and retention of existing ones, leading to more business for the +company while improving customer experience. Often, customers are either +randomly targeted or targeted based on the propensity of desirable behavior. +However, such policies can be suboptimal as they do not target the set of +customers who would benefit the most from the intervention and they may also +not take account of any constraints. In this paper, we propose a policy +framework based on uplift modeling and constrained optimization that identifies +customers to target for a use-case specific intervention so as to maximize the +value to the business, while taking account of any given constraints. We +demonstrate improvement over state-of-the-art targeting approaches using two +large-scale experimental studies and a production implementation. + +
+
+ comment: Accepted at the CONSEQUENCES'24 workshop, co-located with ACM + RecSys'24 +
+
+
+
+
+ + ☆ Beauty Beyond Words: Explainable Beauty Product Recommendations Using + Ingredient-Based Product Attributes + + +
+ Accurate attribute extraction is critical for beauty product recommendations +and building trust with customers. This remains an open problem, as existing +solutions are often unreliable and incomplete. We present a system to extract +beauty-specific attributes using end-to-end supervised learning based on beauty +product ingredients. A key insight to our system is a novel energy-based +implicit model architecture. We show that this implicit model architecture +offers significant benefits in terms of accuracy, explainability, robustness, +and flexibility. Furthermore, our implicit model can be easily fine-tuned to +incorporate additional attributes as they become available, making it more +useful in real-world applications. We validate our model on a major e-commerce +skincare product catalog dataset and demonstrate its effectiveness. Finally, we +showcase how ingredient-based attribute extraction contributes to enhancing the +explainability of beauty recommendations. + +
+
+ comment: 18th ACM Conference on Recommender Systems, Workshop on Strategic and + Utility-aware REcommendation +
+
+
+
+
+ + ☆ Advancing Event Causality Identification via Heuristic Semantic + Dependency Inquiry Network + + +
+ Event Causality Identification (ECI) focuses on extracting causal relations +between events in texts. Existing methods for ECI primarily rely on causal +features and external knowledge. However, these approaches fall short in two +dimensions: (1) causal features between events in a text often lack explicit +clues, and (2) external knowledge may introduce bias, while specific problems +require tailored analyses. To address these issues, we propose SemDI - a simple +and effective Semantic Dependency Inquiry Network for ECI. SemDI captures +semantic dependencies within the context using a unified encoder. Then, it +utilizes a Cloze Analyzer to generate a fill-in token based on comprehensive +context understanding. Finally, this fill-in token is used to inquire about the +causal relation between two events. Extensive experiments demonstrate the +effectiveness of SemDI, surpassing state-of-the-art methods on three widely +used benchmarks. Code is available at https://github.com/hrlics/SemDI. + +
+
+
+
+
+ + ☆ Data Augmentation for Sequential Recommendation: A Survey + + +
+ As an essential branch of recommender systems, sequential recommendation (SR) +has received much attention due to its well-consistency with real-world +situations. However, the widespread data sparsity issue limits the SR model's +performance. Therefore, researchers have proposed many data augmentation (DA) +methods to mitigate this phenomenon and have achieved impressive progress. In +this survey, we provide a comprehensive review of DA methods for SR. We start +by introducing the research background and motivation. Then, we categorize +existing methodologies regarding their augmentation principles, objects, and +purposes. Next, we present a comparative discussion of their advantages and +disadvantages, followed by the exhibition and analysis of representative +experimental results. Finally, we outline directions for future research and +summarize this survey. We also maintain a repository with a paper list at +\url{https://github.com/KingGugu/DA-CL-4Rec}. + +
+
+
+
+
+ + ☆ A Multimodal Dense Retrieval Approach for Speech-Based Open-Domain + Question Answering + + +
+ Speech-based open-domain question answering (QA over a large corpus of text +passages with spoken questions) has emerged as an important task due to the +increasing number of users interacting with QA systems via speech interfaces. +Passage retrieval is a key task in speech-based open-domain QA. So far, +previous works adopted pipelines consisting of an automatic speech recognition +(ASR) model that transcribes the spoken question before feeding it to a dense +text retriever. Such pipelines have several limitations. The need for an ASR +model limits the applicability to low-resource languages and specialized +domains with no annotated speech data. Furthermore, the ASR model propagates +its errors to the retriever. In this work, we try to alleviate these +limitations by proposing an ASR-free, end-to-end trained multimodal dense +retriever that can work directly on spoken questions. Our experimental results +showed that, on shorter questions, our retriever is a promising alternative to +the \textit{ASR and Retriever} pipeline, achieving better retrieval performance +in cases where ASR would have mistranscribed important words in the question or +have produced a transcription with a high word error rate. + +
+
+
+
+
+ + ☆ Procedure Model for Building Knowledge Graphs for Industry Applications + + +
+ Enterprise knowledge graphs combine business data and organizational +knowledge by means of a semantic network of concepts, properties, individuals +and relationships. The graph-based integration of previously unconnected +information with domain knowledge provides new insights and enables intelligent +business applications. However, knowledge graph construction is a large +investment which requires a joint effort of domain and technical experts. This +paper presents a practical step-by-step procedure model for building an RDF +knowledge graph that interconnects heterogeneous data and expert knowledge for +an industry use case. The self-contained process adapts the "Cross Industry +Standard Process for Data Mining" and uses competency questions throughout the +entire development cycle. The procedure model starts with business and data +understanding, describes tasks for ontology modeling and the graph setup, and +ends with process steps for evaluation and deployment. + +
+
+
+
+
+ + ☆ Contextual Compression in Retrieval-Augmented Generation for Large + Language Models: A Survey + + +
+ Large Language Models (LLMs) showcase remarkable abilities, yet they struggle +with limitations such as hallucinations, outdated knowledge, opacity, and +inexplicable reasoning. To address these challenges, Retrieval-Augmented +Generation (RAG) has proven to be a viable solution, leveraging external +databases to improve the consistency and coherence of generated content, +especially valuable for complex, knowledge-rich tasks, and facilitates +continuous improvement by leveraging domain-specific insights. By combining the +intrinsic knowledge of LLMs with the vast, dynamic repositories of external +databases, RAG achieves a synergistic effect. However, RAG is not without its +limitations, including a limited context window, irrelevant information, and +the high processing overhead for extensive contextual data. In this +comprehensive work, we explore the evolution of Contextual Compression +paradigms, providing an in-depth examination of the field. Finally, we outline +the current challenges and suggest potential research and development +directions, paving the way for future advancements in this area. + +
+
+ comment: Ongoing Work +
+
+
+
+
+ + ☆ More Clustering Quality Metrics for ABCDE + + +
+ ABCDE is a technique for evaluating clusterings of very large populations of +items. Given two clusterings, namely a Baseline clustering and an Experiment +clustering, ABCDE can characterize their differences with impact and quality +metrics, and thus help to determine which clustering to prefer. We previously +described the basic quality metrics of ABCDE, namely the GoodSplitRate, +BadSplitRate, GoodMergeRate, BadMergeRate and DeltaPrecision, and how to +estimate them on the basis of human judgements. This paper extends that +treatment with more quality metrics. It describes a technique that aims to +characterize the DeltaRecall of the clustering change. It introduces a new +metric, called IQ, to characterize the degree to which the clustering diff +translates into an improvement in the quality. Ideally, a large diff would +improve the quality by a large amount. Finally, this paper mentions ways to +characterize the absolute Precision and Recall of a single clustering with +ABCDE. + +
+
+
+
+
+ + ☆ A Unified Causal Framework for Auditing Recommender Systems for Ethical + Concerns + + +
+ As recommender systems become widely deployed in different domains, they +increasingly influence their users' beliefs and preferences. Auditing +recommender systems is crucial as it not only ensures the continuous +improvement of recommendation algorithms but also safeguards against potential +issues like biases and ethical concerns. In this paper, we view recommender +system auditing from a causal lens and provide a general recipe for defining +auditing metrics. Under this general causal auditing framework, we categorize +existing auditing metrics and identify gaps in them -- notably, the lack of +metrics for auditing user agency while accounting for the multi-step dynamics +of the recommendation process. We leverage our framework and propose two +classes of such metrics:future- and past-reacheability and stability, that +measure the ability of a user to influence their own and other users' +recommendations, respectively. We provide both a gradient-based and a black-box +approach for computing these metrics, allowing the auditor to compute them +under different levels of access to the recommender system. In our experiments, +we demonstrate the efficacy of methods for computing the proposed metrics and +inspect the design of recommender systems through these proposed metrics. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ RPAF: A Reinforcement Prediction-Allocation Framework for Cache + Allocation in Large-Scale Recommender Systems + + +
+ Modern recommender systems are built upon computation-intensive +infrastructure, and it is challenging to perform real-time computation for each +request, especially in peak periods, due to the limited computational +resources. Recommending by user-wise result caches is widely used when the +system cannot afford a real-time recommendation. However, it is challenging to +allocate real-time and cached recommendations to maximize the users' overall +engagement. This paper shows two key challenges to cache allocation, i.e., the +value-strategy dependency and the streaming allocation. Then, we propose a +reinforcement prediction-allocation framework (RPAF) to address these issues. +RPAF is a reinforcement-learning-based two-stage framework containing +prediction and allocation stages. The prediction stage estimates the values of +the cache choices considering the value-strategy dependency, and the allocation +stage determines the cache choices for each individual request while satisfying +the global budget constraint. We show that the challenge of training RPAF +includes globality and the strictness of budget constraints, and a relaxed +local allocator (RLA) is proposed to address this issue. Moreover, a PoolRank +algorithm is used in the allocation stage to deal with the streaming allocation +problem. Experiments show that RPAF significantly improves users' engagement +under computational budget constraints. + +
+
+
+
+
+ + ♻ ☆ Large language models in biomedical natural language processing: + benchmarks, baselines, and recommendations + + +
+ Biomedical literature is growing rapidly, making it challenging to curate and +extract knowledge manually. Biomedical natural language processing (BioNLP) +techniques that can automatically extract information from biomedical +literature help alleviate this burden. Recently, large Language Models (LLMs), +such as GPT-3 and GPT-4, have gained significant attention for their impressive +performance. However, their effectiveness in BioNLP tasks and impact on method +development and downstream users remain understudied. This pilot study (1) +establishes the baseline performance of GPT-3 and GPT-4 at both zero-shot and +one-shot settings in eight BioNLP datasets across four applications: named +entity recognition, relation extraction, multi-label document classification, +and semantic similarity and reasoning, (2) examines the errors produced by the +LLMs and categorized the errors into three types: missingness, inconsistencies, +and unwanted artificial content, and (3) provides suggestions for using LLMs in +BioNLP applications. We make the datasets, baselines, and results publicly +available to the community via +https://github.com/qingyu-qc/gpt_bionlp_benchmark. + +
+
+
+
+
+ + ♻ ☆ Train Once, Use Flexibly: A Modular Framework for Multi-Aspect Neural + News Recommendation EMNLP 2024 + + +
+ Recent neural news recommenders (NNRs) extend content-based recommendation +(1) by aligning additional aspects (e.g., topic, sentiment) between candidate +news and user history or (2) by diversifying recommendations w.r.t. these +aspects. This customization is achieved by ``hardcoding`` additional +constraints into the NNR's architecture and/or training objectives: any change +in the desired recommendation behavior thus requires retraining the model with +a modified objective. This impedes widespread adoption of multi-aspect news +recommenders. In this work, we introduce MANNeR, a modular framework for +multi-aspect neural news recommendation that supports on-the-fly customization +over individual aspects at inference time. With metric-based learning as its +backbone, MANNeR learns aspect-specialized news encoders and then flexibly and +linearly combines the resulting aspect-specific similarity scores into +different ranking functions, alleviating the need for ranking function-specific +retraining of the model. Extensive experimental results show that MANNeR +consistently outperforms state-of-the-art NNRs on both standard content-based +recommendation and single- and multi-aspect customization. Lastly, we validate +that MANNeR's aspect-customization module is robust to language and domain +transfer. + +
+
+ comment: Accepted at the 2024 Conference on Empirical Methods in Natural + Language Processing (EMNLP 2024) +
+
+
+
+
+ + ♻ ☆ TISIS : Trajectory Indexing for SImilarity Search + + +
+ Social media platforms enable users to share diverse types of information, +including geolocation data that captures their movement patterns. Such +geolocation data can be leveraged to reconstruct the trajectory of a user's +visited Points of Interest (POIs). A key requirement in numerous applications +is the ability to measure the similarity between such trajectories, as this +facilitates the retrieval of trajectories that are similar to a given reference +trajectory. This is the main focus of our work. Existing methods predominantly +rely on applying a similarity function to each candidate trajectory to identify +those that are sufficiently similar. However, this approach becomes +computationally expensive when dealing with large-scale datasets. To mitigate +this challenge, we propose TISIS, an efficient method that uses trajectory +indexing to quickly find similar trajectories that share common POIs in the +same order. Furthermore, to account for scenarios where POIs in trajectories +may not exactly match but are contextually similar, we introduce TISIS*, a +variant of TISIS that incorporates POI embeddings. This extension allows for +more comprehensive retrieval of similar trajectories by considering semantic +similarities between POIs, beyond mere exact matches. Extensive experimental +evaluations demonstrate that the proposed approach significantly outperforms a +baseline method based on the well-known Longest Common SubSequence (LCSS) +algorithm, yielding substantial performance improvements across various +real-world datasets. + +
+
+
+
+
+ + ♻ ☆ Visualising Personal Data Flows: Insights from a Case Study of + Booking.com + + +
+ Commercial organisations are holding and processing an ever-increasing amount +of personal data. Policies and laws are continually changing to require these +companies to be more transparent regarding the collection, storage, processing +and sharing of this data. This paper reports our work of taking Booking.com as +a case study to visualise personal data flows extracted from their privacy +policy. By showcasing how the company shares its consumers' personal data, we +raise questions and extend discussions on the challenges and limitations of +using privacy policies to inform online users about the true scale and the +landscape of personal data flows. This case study can inform us about future +research on more data flow-oriented privacy policy analysis and on the +construction of a more comprehensive ontology on personal data flows in +complicated business ecosystems. + +
+
+ comment: This is the full edition of a paper published in Intelligent + Information Systems: CAiSE Forum 2023, Zaragoza, Spain, June 12-16, 2023, + Proceedings, Lecture Notes in Business Information Processing (LNBIP), Volume + 477, pp. 52-60, 2023, Springer Nature, + https://link.springer.com/book/10.1007/978-3-031-34674-3_7 +
+
+
+
+
+ + ♻ ☆ The Relevance of Item-Co-Exposure For Exposure Bias Mitigation RecSys + '24 + + +
+ Through exposing items to users, implicit feedback recommender systems +influence the logged interactions, and, ultimately, their own recommendations. +This effect is called exposure bias and it can lead to issues such as filter +bubbles and echo chambers. Previous research employed the multinomial logit +model (MNL) with exposure information to reduce exposure bias on synthetic +data. + This extended abstract summarizes our previous study in which we investigated +whether (i) these findings hold for human-generated choices, (ii) other +discrete choice models mitigate bias better, and (iii) an item's estimated +relevance can depend on the relevances of the other items that were presented +with it. We collected a data set of biased and unbiased choices in a controlled +online user study and measured the effects of overexposure and competition. + We found that (i) the discrete choice models effectively mitigated exposure +bias on human-generated choice data, (ii) there were no significant differences +in robustness among the different discrete choice models, and (iii) only +multivariate discrete choice models were robust to competition between items. +We conclude that discrete choice models mitigate exposure bias effectively +because they consider item-co-exposure. Moreover, exposing items alongside more +or less popular items can bias future recommendations significantly and item +exposure must be tracked for overcoming exposure bias. We consider our work +vital for understanding what exposure bias it, how it forms, and how it can be +mitigated. + +
+
+ comment: Accepted at the CONSEQUENCES '24 workshop, co-located with ACM RecSys + '24 +
+
+
+
+
+ + ♻ ☆ Exploring Information Retrieval Landscapes: An Investigation of a Novel + Evaluation Techniques and Comparative Document Splitting Methods + + +
+ The performance of Retrieval-Augmented Generation (RAG) systems in +information retrieval is significantly influenced by the characteristics of the +documents being processed. In this study, the structured nature of textbooks, +the conciseness of articles, and the narrative complexity of novels are shown +to require distinct retrieval strategies. A comparative evaluation of multiple +document-splitting methods reveals that the Recursive Character Splitter +outperforms the Token-based Splitter in preserving contextual integrity. A +novel evaluation technique is introduced, utilizing an open-source model to +generate a comprehensive dataset of question-and-answer pairs, simulating +realistic retrieval scenarios to enhance testing efficiency and metric +reliability. The evaluation employs weighted scoring metrics, including +SequenceMatcher, BLEU, METEOR, and BERT Score, to assess the system's accuracy +and relevance. This approach establishes a refined standard for evaluating the +precision of RAG systems, with future research focusing on optimizing chunk and +overlap sizes to improve retrieval accuracy and efficiency. + +
+
+ comment: This article is 16 pages long and includes detailed comparisons of + RAG systems and document splitting techniques +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Temporally Aligned Audio for Video with Autoregression ICASSP 2025 + + +
+ We introduce V-AURA, the first autoregressive model to achieve high temporal +alignment and relevance in video-to-audio generation. V-AURA uses a +high-framerate visual feature extractor and a cross-modal audio-visual feature +fusion strategy to capture fine-grained visual motion events and ensure precise +temporal alignment. Additionally, we propose VisualSound, a benchmark dataset +with high audio-visual relevance. VisualSound is based on VGGSound, a video +dataset consisting of in-the-wild samples extracted from YouTube. During the +curation, we remove samples where auditory events are not aligned with the +visual ones. V-AURA outperforms current state-of-the-art models in temporal +alignment and semantic relevance while maintaining comparable audio quality. +Code, samples, VisualSound and models are available at +https://v-aura.notion.site + +
+
+ comment: Submitted to ICASSP 2025. Project page https://v-aura.notion.site +
+
+
+
+
+ + ☆ ChemDFM-X: Towards Large Multimodal Model for Chemistry + + +
+ Rapid developments of AI tools are expected to offer unprecedented assistance +to the research of natural science including chemistry. However, neither +existing unimodal task-specific specialist models nor emerging general large +multimodal models (LMM) can cover the wide range of chemical data modality and +task categories. To address the real demands of chemists, a cross-modal +Chemical General Intelligence (CGI) system, which serves as a truly practical +and useful research assistant utilizing the great potential of LMMs, is in +great need. In this work, we introduce the first Cross-modal Dialogue +Foundation Model for Chemistry (ChemDFM-X). Diverse multimodal data are +generated from an initial modality by approximate calculations and +task-specific model predictions. This strategy creates sufficient chemical +training corpora, while significantly reducing excessive expense, resulting in +an instruction-tuning dataset containing 7.6M data. After instruction +finetuning, ChemDFM-X is evaluated on extensive experiments of different +chemical tasks with various data modalities. The results demonstrate the +capacity of ChemDFM-X for multimodal and inter-modal knowledge comprehension. +ChemDFM-X marks a significant milestone toward aligning all modalities in +chemistry, a step closer to CGI. + +
+
+ comment: 19 pages, 7 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ High Perceptual Quality Wireless Image Delivery with Denoising Diffusion + Models + + +
+ We consider the image transmission problem over a noisy wireless channel via +deep learning-based joint source-channel coding (DeepJSCC) along with a +denoising diffusion probabilistic model (DDPM) at the receiver. Specifically, +we are interested in the perception-distortion trade-off in the practical +finite block length regime, in which separate source and channel coding can be +highly suboptimal. We introduce a novel scheme, where the conventional DeepJSCC +encoder targets transmitting a lower resolution version of the image, which +later can be refined thanks to the generative model available at the receiver. +In particular, we utilize the range-null space decomposition of the target +image; DeepJSCC transmits the range-space of the image, while DDPM +progressively refines its null space contents. Through extensive experiments, +we demonstrate significant improvements in distortion and perceptual quality of +reconstructed images compared to standard DeepJSCC and the state-of-the-art +generative learning-based method. + +
+
+ comment: 6 pages, 5 figures. Published at INFOCOM 2024 Workshops +
+
+
+
+
+ + ♻ ☆ Towards Efficient SDRTV-to-HDRTV by Learning from Image Formation + + +
+ Modern displays can render video content with high dynamic range (HDR) and +wide color gamut (WCG). However, most resources are still in standard dynamic +range (SDR). Therefore, transforming existing SDR content into the HDRTV +standard holds significant value. This paper defines and analyzes the +SDRTV-to-HDRTV task by modeling the formation of SDRTV/HDRTV content. Our +findings reveal that a naive endto-end supervised training approach suffers +from severe gamut transition errors. To address this, we propose a new +three-step solution called HDRTVNet++, which includes adaptive global color +mapping, local enhancement, and highlight refinement. The adaptive global color +mapping step utilizes global statistics for image-adaptive color adjustments. A +local enhancement network further enhances details, and the two sub-networks +are combined as a generator to achieve highlight consistency through GANbased +joint training. Designed for ultra-high-definition TV content, our method is +both effective and lightweight for processing 4K resolution images. We also +constructed a dataset using HDR videos in the HDR10 standard, named HDRTV1K, +containing 1235 training and 117 testing images, all in 4K resolution. +Additionally, we employ five metrics to evaluate SDRTV-to-HDRTV performance. +Our results demonstrate state-of-the-art performance both quantitatively and +visually. The codes and models are available at +https://github.com/xiaom233/HDRTVNet-plus. + +
+
+ comment: Extended version of HDRTVNet +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`